├── Docker ├── Dockerfile ├── build_docker.sh ├── make_simg.sh ├── push_docker.sh ├── run_docker.sh ├── run_eval.sh └── run_singularity.sh ├── PerlLib ├── DelimParser.pm ├── Fasta_reader.pm ├── Overlap_piler.pm ├── Pipeliner.pm └── Process_cmd.pm ├── README.md ├── alt_methods ├── STAR-Fusion │ └── uger │ │ ├── starF_v1.5_hg19.cmd │ │ └── starF_v1.5_hg19.conf ├── TrinityFusion │ ├── uger │ │ ├── TrinityFusion-C │ │ │ ├── TrinityFusion-C.hg19.cmd │ │ │ └── TrinityFusion-C.hg19.conf │ │ ├── TrinityFusion-D │ │ │ ├── TrinityFusion-D.hg19.cmd │ │ │ └── TrinityFusion-D.hg19.conf │ │ └── TrinityFusion-UC │ │ │ ├── TrinityFusion-UC.hg19.cmd │ │ │ └── TrinityFusion-UC.hg19.conf │ └── wdl │ │ ├── TrinityFusion.wdl │ │ ├── inputs.json │ │ ├── make_wdl_input_template.sh │ │ └── run.sh ├── arriba │ ├── README.md │ ├── arriba_wrapper.pl │ └── uger │ │ ├── arriba.uger.cmd │ │ └── arriba.uger.conf ├── prada │ ├── Dockerfile │ ├── VERSION.txt │ └── build_docker.sh ├── star-seqr │ ├── docker │ │ ├── make_simg.sh │ │ └── run_test.sh │ └── uger │ │ ├── star-seqr.uger.cmd │ │ └── star-seqr.uger.conf └── starchip │ ├── Docker │ ├── Dockerfile │ ├── Pipeliner.pm │ ├── VERSION.txt │ ├── build_docker.sh │ ├── make_simg.sh │ ├── push_docker.sh │ └── starchip_wrapper.pl │ ├── README.md │ ├── cleanMe.sh │ ├── run_test.sh │ ├── test_data │ ├── reads_1.fq.gz │ └── reads_2.fq.gz │ ├── test_outdir │ ├── Aligned.out.bam │ ├── Chimeric.out.junction │ ├── Chimeric.out.sam │ ├── Log.final.out │ ├── Log.out │ ├── Log.progress.out │ ├── ReadsPerGene.out.tab │ ├── SJ.out.tab │ ├── Unmapped.out.mate1 │ ├── Unmapped.out.mate2 │ ├── __starchip_chkpts │ │ ├── pipeliner.2350.cmds │ │ ├── pipeliner.2900.cmds │ │ ├── star_align.ok │ │ └── starchip.ok │ ├── starchip.summary │ └── starchip.summary.annotated │ └── uger │ ├── starchip.uger.cmd │ └── starchip.uger.conf ├── benchmarking ├── FusionProgParsers │ ├── ARRIBA_hc_parser.pm │ ├── ARRIBA_parser.pm │ ├── ChimPipe_parser.pm │ ├── ChimeraScan_parser.pm │ ├── DEFUSE_parser.pm │ ├── EricScript_parser.pm │ ├── FusionCatcher_KP_parser.pm │ ├── FusionCatcher_parser.pm │ ├── FusionHunter_parser.pm │ ├── FusionInspector_parser.pm │ ├── InFusion_parser.pm │ ├── JAFFA_parser.pm │ ├── MapSplice_parser.pm │ ├── NFuse_parser.pm │ ├── PIZZLY_parser.pm │ ├── PRADA_parser.pm │ ├── SOAPfuse_parser.pm │ ├── STARCHIP_parser.pm │ ├── STARFusion_parser.pm │ ├── STARSEQR_parser.pm │ ├── TopHatFusion_parser.pm │ └── TrinityFusion_parser.pm ├── Venn_analysis_strategy.pl ├── aggregate_peak_F1_stats.R ├── all_TP_FP_FN_to_ROC.pl ├── all_TP_FP_FN_to_ROC.vary_minF_minS.pl ├── all_TP_FP_FN_to_ROC.vary_minF_minS.plot.Rscript ├── calc_PR.py ├── collect_preds.pl ├── collected_preds_to_fusion_prog_support_listing.pl ├── compare_A_vs_B_scored_preds.pl ├── define_truth_n_unsure_set.pl ├── examine_FPs.pl ├── filter_collected_preds.pl ├── fusion_preds_sensitivity_vs_expr.avg_replicates.pl ├── fusion_preds_sensitivity_vs_expr.pl ├── fusion_preds_to_TP_FP_FN.pl ├── fusion_preds_to_matrix.pl ├── fusion_progs_agree_to_matrix.pl ├── fusion_sample_TPs_to_matrix.pl ├── map_gene_symbols_to_gencode.pl ├── notes ├── plotters │ ├── AUC_barplot.Rscript │ ├── AUC_boxplot.from_separate_auc_files.Rscript │ ├── AUC_boxplot.from_single_summary_AUC_file.Rscript │ ├── plotPRcurves.R │ ├── plot_AUC_50_vs_101_boxplots.Rscript │ ├── plot_F1_vs_min_frags.R │ ├── plot_ROC.Rscript │ ├── plot_TP_FP_vs_minSum_per_prog.R │ ├── plot_all_auc_barplots.Rscript │ ├── plot_before_vs_after_filt_TP_FP_compare.Rscript │ ├── plot_median_accuracy_ranking_vs_median_runtime.R │ ├── plot_peak_F1_scatter.R │ └── plot_upsetR.R └── run_prediction_accuracy_assessment_pipeline.pl ├── cancer_cell_lines ├── Edgren_subset │ ├── analyze_Edgren_subset.pl │ ├── cleanMe.sh │ ├── edgren.truthset │ ├── edgren.truthset.raw │ ├── eval_edgren_min_agree.consolidated.pl │ ├── eval_edgren_min_agree.pl │ ├── examine_validated_enrichment.R │ └── runMe.sh ├── SuppTable-cancer_cell_lines.csv ├── analyze_cancer_data.pl ├── cleanMe.sh ├── progs_select.txt └── runMe.sh ├── cleanMe.sh ├── progs_restrict.txt ├── resources ├── genes.aliases ├── genes.coords.gz ├── notes ├── notes.paralog_clustering.2020.txt ├── paralog_clusters.2020.I3.dat ├── paralog_clusters.2020.I5.dat ├── paralog_clusters.dat └── paralog_clusters.dat.2019 ├── runMe.sh ├── runtime_analysis ├── STAR_F_multicore │ └── runtimes.txt ├── all_progs_cancer │ ├── __origfmt │ │ └── runtimes.orignames.dat │ └── runtimes.txt ├── cleanMe.sh └── runMe.sh ├── simulated_data ├── SuppTable-sim_reads.csv ├── analyze_simulated_data.pl ├── cleanMe.sh ├── runMe.sh ├── sim_101 │ ├── cleanMe.sh │ ├── runMe.sh │ ├── sim_101.fusion_TPM_values.dat │ └── sim_101.truth_set.dat └── sim_50 │ ├── cleanMe.sh │ ├── runMe.sh │ ├── sim_50.fusion_TPM_values.dat │ └── sim_50.truth_set.dat └── util ├── Terra ├── organize_FI_results_for_benchmarking.py └── organize_StarF_results_for_benchmarking.py ├── __get_figs_for_paper.pl ├── basic_accuracy_analysis.pl ├── boxplot_runtimes.Rscript ├── capture_PR_AUC_for_plotting.pl ├── make_file_listing_input_table.pl ├── make_supp_AUC_table.pl ├── make_supp_ROC_table.pl ├── paralog_clustering_util ├── README.md ├── blast_outfmt6_replace_trans_id_w_gene_symbol.pl ├── get_top_blast_pairs.pl ├── outfmt6_add_percent_match_length.group_segments.pl ├── outfmt6_add_percent_match_length.group_segments.to_Markov_Clustering.pl └── outfmt6_add_percent_match_length.pl └── terra_partition_to_sample_dirs.py /Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | MAINTAINER bhaas@broadinstitute.org 3 | 4 | RUN apt-get update && apt-get install -y gcc g++ perl python automake make \ 5 | wget git curl libdb-dev \ 6 | zlib1g-dev bzip2 libncurses5-dev \ 7 | texlive-latex-base \ 8 | default-jre \ 9 | python-pip python-dev \ 10 | gfortran \ 11 | build-essential libghc-zlib-dev libncurses-dev libbz2-dev liblzma-dev libpcre3-dev libxml2-dev \ 12 | libblas-dev gfortran git unzip ftp libzmq3-dev nano ftp fort77 libreadline-dev \ 13 | libcurl4-openssl-dev libx11-dev libxt-dev \ 14 | x11-common libcairo2-dev libpng-dev libreadline-dev libjpeg-dev pkg-config libtbb-dev \ 15 | && apt-get clean 16 | 17 | RUN curl -L https://cpanmin.us | perl - App::cpanminus 18 | 19 | RUN cpanm install DB_File 20 | RUN cpanm install URI::Escape 21 | RUN cpanm install JSON::XS 22 | 23 | 24 | ## set up tool config and deployment area: 25 | 26 | ENV SRC /usr/local/src 27 | ENV BIN /usr/local/bin 28 | 29 | 30 | ##### 31 | # Install R 32 | 33 | WORKDIR $SRC 34 | 35 | ENV R_VERSION=R-3.5.2 36 | 37 | RUN curl https://cran.r-project.org/src/base/R-3/$R_VERSION.tar.gz -o $R_VERSION.tar.gz && \ 38 | tar xvf $R_VERSION.tar.gz && \ 39 | cd $R_VERSION && \ 40 | ./configure && make && make install 41 | 42 | 43 | 44 | RUN curl -L https://cpanmin.us | perl - App::cpanminus 45 | 46 | RUN cpanm install DB_File 47 | RUN cpanm install Set::IntervalTree 48 | 49 | 50 | 51 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("ggplot2", dep = TRUE)' 52 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("Biobase", dep = TRUE)' 53 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("qvalue", dep = TRUE)' 54 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("fastcluster", dep = TRUE)' 55 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("tidyr", dep = TRUE)' 56 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("devtools", dep = TRUE)' 57 | 58 | ## get my hacked version of upsetR 59 | WORKDIR $SRC 60 | RUN git clone https://github.com/brianjohnhaas/UpSetR.git && \ 61 | cd UpSetR && \ 62 | git checkout d72b0b5 && \ 63 | Rscript -e 'install.packages("./", repos=NULL, type="source", INSTALL_opts = "--with-keep.source")' 64 | 65 | 66 | ######### 67 | # Install FusionAnnotator 68 | 69 | WORKDIR $SRC 70 | RUN git clone https://github.com/FusionAnnotator/FusionAnnotator.git && \ 71 | cd FusionAnnotator && \ 72 | git checkout 0dc2edc25f7881fd552236c5e12b302cef6eea7a 73 | 74 | ENV FUSION_ANNOTATOR ${SRC}/FusionAnnotator 75 | 76 | 77 | ######## 78 | # Install Trinity (just for plotting utilities) 79 | WORKDIR $SRC 80 | RUN git clone https://github.com/trinityrnaseq/trinityrnaseq.git && \ 81 | cd trinityrnaseq && \ 82 | git checkout 514756d12c614046a4ad50fd63b34e59cdec4c9a 83 | 84 | ENV TRINITY_HOME ${SRC}/trinityrnaseq 85 | 86 | ############## 87 | # Install fusion benchmarking 88 | WORKDIR $SRC 89 | RUN git clone https://github.com/fusiontranscripts/FusionBenchmarking.git && \ 90 | cd FusionBenchmarking && \ 91 | git checkout fa3d7bc0ef3757a3c5c65c2f80e216128cfc9f8e 92 | 93 | 94 | ENV LC_ALL=C 95 | 96 | ## mini ctat genome lib used by fusion annotator: 97 | COPY ctat_genome_lib_dir $SRC/ctat_genome_lib_dir/ 98 | 99 | ENV CTAT_GENOME_LIB $SRC/ctat_genome_lib_dir 100 | 101 | COPY run_eval.sh / 102 | 103 | CMD ["/run_eval.sh"] 104 | 105 | -------------------------------------------------------------------------------- /Docker/build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build -t trinityctat/fusionbenchmarking . 4 | 5 | -------------------------------------------------------------------------------- /Docker/make_simg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | singularity build trinityctat.fusionbenchmarking.simg docker://trinityctat/fusionbenchmarking 6 | 7 | -------------------------------------------------------------------------------- /Docker/push_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker push trinityctat/fusionbenchmarking 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Docker/run_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker run --rm -it -v `pwd`:/data trinityctat/fusionbenchmarking $* 4 | 5 | 6 | -------------------------------------------------------------------------------- /Docker/run_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | cd /data 6 | 7 | cp -r /usr/local/src/FusionBenchmarking FusionBenchmarkingWorkspace 8 | 9 | PROGS_RESTRICT=`pwd`/FusionBenchmarkingWorkspace/progs_restrict.txt 10 | 11 | cd FusionBenchmarkingWorkspace/cancer_cell_lines && ./runMe.sh ${PROGS_RESTRICT} 12 | 13 | cd ../simulated_data && ./runMe.sh ${PROGS_RESTRICT} 14 | 15 | echo done 16 | 17 | 18 | -------------------------------------------------------------------------------- /Docker/run_singularity.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | singularity exec -e -B `pwd`:/data trinityctat.fusionbenchmarking.simg /run_eval.sh 6 | -------------------------------------------------------------------------------- /PerlLib/Fasta_reader.pm: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/perl -w 2 | 3 | # lightweight fasta reader capabilities: 4 | package Fasta_reader; 5 | 6 | use strict; 7 | use warnings; 8 | use Carp; 9 | 10 | sub new { 11 | my ($packagename, $fastaFile) = @_; 12 | 13 | ## note: fastaFile can be a filename or an IO::Handle 14 | 15 | 16 | my $self = { fastaFile => undef,, 17 | fileHandle => undef }; 18 | 19 | bless ($self, $packagename); 20 | 21 | ## create filehandle 22 | my $filehandle = undef; 23 | 24 | if (ref $fastaFile eq 'IO::Handle') { 25 | $filehandle = $fastaFile; 26 | } 27 | else { 28 | if ($fastaFile =~ /\.gz$/) { 29 | open ($filehandle, "gunzip -c $fastaFile | ") or confess "Error, cannot open file $fastaFile using 'gunzip -c'"; 30 | } 31 | else { 32 | open ($filehandle, $fastaFile) or die "Error: Couldn't open $fastaFile\n"; 33 | } 34 | $self->{fastaFile} = $fastaFile; 35 | } 36 | 37 | $self->{fileHandle} = $filehandle; 38 | 39 | return ($self); 40 | } 41 | 42 | 43 | 44 | #### next() fetches next Sequence object. 45 | sub next { 46 | my $self = shift; 47 | my $orig_record_sep = $/; 48 | $/="\n>"; 49 | my $filehandle = $self->{fileHandle}; 50 | my $next_text_input = <$filehandle>; 51 | 52 | if (defined($next_text_input) && $next_text_input !~ /\w/) { 53 | ## must have been some whitespace at start of fasta file, before first entry. 54 | ## try again: 55 | $next_text_input = <$filehandle>; 56 | } 57 | 58 | my $seqobj = undef; 59 | 60 | if ($next_text_input) { 61 | $next_text_input =~ s/^>|>$//g; #remove trailing > char. 62 | $next_text_input =~ tr/\t\n\000-\037\177-\377/\t\n/d; #remove cntrl chars 63 | my ($header, @seqlines) = split (/\n/, $next_text_input); 64 | my $sequence = join ("", @seqlines); 65 | $sequence =~ s/\s//g; 66 | 67 | $seqobj = Sequence->new($header, $sequence); 68 | } 69 | 70 | $/ = $orig_record_sep; #reset the record separator to original setting. 71 | 72 | return ($seqobj); #returns null if not instantiated. 73 | } 74 | 75 | 76 | #### finish() closes the open filehandle to the query database. 77 | sub finish { 78 | my $self = shift; 79 | my $filehandle = $self->{fileHandle}; 80 | close $filehandle; 81 | $self->{fileHandle} = undef; 82 | } 83 | 84 | #### 85 | sub retrieve_all_seqs_hash { 86 | my $self = shift; 87 | 88 | my %acc_to_seq; 89 | 90 | while (my $seq_obj = $self->next()) { 91 | my $acc = $seq_obj->get_accession(); 92 | my $sequence = $seq_obj->get_sequence(); 93 | 94 | $acc_to_seq{$acc} = $sequence; 95 | } 96 | 97 | return(%acc_to_seq); 98 | } 99 | 100 | 101 | 102 | ############################################## 103 | package Sequence; 104 | use strict; 105 | 106 | sub new { 107 | my ($packagename, $header, $sequence) = @_; 108 | 109 | ## extract an accession from the header: 110 | my ($acc, $rest) = split (/\s+/, $header, 2); 111 | 112 | my $self = { accession => $acc, 113 | header => $header, 114 | sequence => $sequence, 115 | filename => undef }; 116 | bless ($self, $packagename); 117 | return ($self); 118 | } 119 | 120 | #### 121 | sub get_accession { 122 | my $self = shift; 123 | return ($self->{accession}); 124 | } 125 | 126 | #### 127 | sub get_header { 128 | my $self = shift; 129 | return ($self->{header}); 130 | } 131 | 132 | #### 133 | sub get_sequence { 134 | my $self = shift; 135 | return ($self->{sequence}); 136 | } 137 | 138 | #### 139 | sub get_FASTA_format { 140 | my $self = shift; 141 | my %settings = @_; 142 | 143 | my $fasta_line_len = $settings{fasta_line_len} || 60; 144 | 145 | my $header = $self->get_header(); 146 | my $sequence = $self->get_sequence(); 147 | if ($fasta_line_len > 0) { 148 | $sequence =~ s/(\S{$fasta_line_len})/$1\n/g; 149 | chomp $sequence; 150 | } 151 | my $fasta_entry = ">$header\n$sequence\n"; 152 | return ($fasta_entry); 153 | } 154 | 155 | 156 | #### 157 | sub write_fasta_file { 158 | my $self = shift; 159 | my $filename = shift; 160 | 161 | my ($accession, $header, $sequence) = ($self->{accession}, $self->{header}, $self->{sequence}); 162 | 163 | my $fasta_entry = $self->get_FASTA_format(); 164 | 165 | my $tempfile; 166 | if ($filename) { 167 | $tempfile = $filename; 168 | } else { 169 | my $acc = $accession; 170 | $acc =~ s/\W/_/g; 171 | $tempfile = "$acc.fasta"; 172 | } 173 | 174 | open (TMP, ">$tempfile") or die "ERROR! Couldn't write a temporary file in current directory.\n"; 175 | print TMP $fasta_entry; 176 | close TMP; 177 | return ($tempfile); 178 | } 179 | 180 | #### 181 | sub get_core_read_name { 182 | my $self = shift; 183 | 184 | my $acc = $self->get_accession(); 185 | $acc =~ s|/[12]$||; 186 | return($acc); 187 | } 188 | 189 | 190 | 1; #EOM 191 | 192 | 193 | -------------------------------------------------------------------------------- /PerlLib/Process_cmd.pm: -------------------------------------------------------------------------------- 1 | package Process_cmd; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | use Cwd; 7 | 8 | require Exporter; 9 | our @ISA = qw(Exporter); 10 | our @EXPORT = qw(process_cmd ensure_full_path); 11 | 12 | 13 | sub process_cmd { 14 | my ($cmd) = @_; 15 | 16 | print STDERR "CMD: $cmd\n"; 17 | 18 | my $ret = system($cmd); 19 | if ($ret) { 20 | confess "Error, cmd:\n$cmd\n died with ret ($ret)"; 21 | } 22 | 23 | return; 24 | } 25 | 26 | 27 | sub ensure_full_path { 28 | my ($path) = @_; 29 | 30 | unless ($path =~ m|^/|) { 31 | $path = cwd() . "/$path"; 32 | } 33 | 34 | return($path); 35 | } 36 | 37 | 38 | 39 | 1; #EOM 40 | 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fusion Transcript Benchmarking 2 | 3 | See [wiki](https://github.com/fusiontranscripts/FusionBenchmarking/wiki) for documentation. 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /alt_methods/STAR-Fusion/uger/starF_v1.5_hg19.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf starF_v1.5_hg19.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7 --name StarF_v1.5_hg19 2 | -------------------------------------------------------------------------------- /alt_methods/STAR-Fusion/uger/starF_v1.5_hg19.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=STAR_FUSION_v1.5_hg19_Apr042019 9 | USE_GZIP_FIFO=FALSE 10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/STAR-Fusion/SINGULARITY/star-fusion.v1.5.0.simg /usr/local/src/STAR-Fusion/STAR-Fusion --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} -O {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir 11 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/uger/TrinityFusion-C/TrinityFusion-C.hg19.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 20 --queue broad --run_conf TrinityFusion-C.hg19.conf --h_rt 72:00:00 --project_name regevlab --os RedHat7 --name TrinF_C_hg19 2 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/uger/TrinityFusion-C/TrinityFusion-C.hg19.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=TRINITY_FUSION_C_hg19 9 | USE_GZIP_FIFO=FALSE 10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/SINGULARITY/ctat-trinityfusion/TrinityFusion.v0.2.0.simg /usr/local/src/TrinityFusion/TrinityFusion --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir --chimeric_junctions {__LOCAL_ANALYSIS_DIR__}/STAR_FUSION_v1.5_hg19_Apr042019/Chimeric.out.junction 11 | 12 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/uger/TrinityFusion-D/TrinityFusion-D.hg19.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 20 --queue broad --run_conf TrinityFusion-D.hg19.conf --h_rt 72:00:00 --project_name regevlab --os RedHat7 --name TrinF_D_hg19 2 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/uger/TrinityFusion-D/TrinityFusion-D.hg19.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=TRINITY_FUSION_D_hg19 9 | USE_GZIP_FIFO=FALSE 10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/SINGULARITY/ctat-trinityfusion/TrinityFusion.v0.2.0.simg /usr/local/src/TrinityFusion/TrinityFusion --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir --max_memory 20G 11 | 12 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/uger/TrinityFusion-UC/TrinityFusion-UC.hg19.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 20 --queue broad --run_conf TrinityFusion-UC.hg19.conf --h_rt 72:00:00 --project_name regevlab --os RedHat7 --name TrinF_UC_hg19 2 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/uger/TrinityFusion-UC/TrinityFusion-UC.hg19.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=TRINITY_FUSION_UC_hg19 9 | USE_GZIP_FIFO=FALSE 10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/SINGULARITY/ctat-trinityfusion/TrinityFusion.v0.2.0.simg /usr/local/src/TrinityFusion/TrinityFusion --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir --chimeric_junctions {__LOCAL_ANALYSIS_DIR__}/STAR_FUSION_v1.5_hg19_Apr042019/Chimeric.out.junction --aligned_bam {__LOCAL_ANALYSIS_DIR__}/STAR_FUSION_v1.5_hg19_Apr042019/Aligned.out.bam 11 | 12 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/wdl/TrinityFusion.wdl: -------------------------------------------------------------------------------- 1 | 2 | task TRINITY_FUSION_UC_TASK { 3 | 4 | String sample_name 5 | File left_fq 6 | File right_fq 7 | File genome_lib_tar 8 | File chimeric_junctions_file 9 | File aligned_bam 10 | 11 | command <<< 12 | 13 | set -e 14 | 15 | # untar the genome lib 16 | tar xvf ${genome_lib_tar} 17 | rm ${genome_lib_tar} 18 | 19 | # TrinityFusion 20 | 21 | /usr/local/src/TrinityFusion/TrinityFusion \ 22 | --left_fq ${left_fq} \ 23 | --right_fq ${right_fq} \ 24 | --chimeric_junctions ${chimeric_junctions_file} \ 25 | --aligned_bam ${aligned_bam} \ 26 | --CPU 10 \ 27 | --genome_lib_dir ctat_genome_lib_build_dir \ 28 | --output_dir ${sample_name} 29 | 30 | 31 | cp ${sample_name}/TrinityFusion-UC.fusion_predictions.tsv ${sample_name}.TrinityFusion-UC.fusion_predictions.tsv 32 | 33 | gzip ${sample_name}.TrinityFusion-UC.fusion_predictions.tsv 34 | 35 | >>> 36 | 37 | output { 38 | File TrinityFusion_UC="${sample_name}.TrinityFusion-UC.fusion_predictions.tsv.gz" 39 | } 40 | 41 | 42 | runtime { 43 | docker: "trinityctat/trinityfusion:0.2.0" 44 | disks: "local-disk 500 SSD" 45 | memory: "30G" 46 | cpu: "10" 47 | preemptible: 0 48 | maxRetries: 0 49 | } 50 | } 51 | 52 | 53 | task TRINITY_FUSION_D_TASK { 54 | 55 | String sample_name 56 | File left_fq 57 | File right_fq 58 | File genome_lib_tar 59 | 60 | 61 | command <<< 62 | 63 | set -e 64 | 65 | # untar the genome lib 66 | tar xvf ${genome_lib_tar} 67 | rm ${genome_lib_tar} 68 | 69 | # TrinityFusion 70 | 71 | /usr/local/src/TrinityFusion/TrinityFusion \ 72 | --left_fq ${left_fq} \ 73 | --right_fq ${right_fq} \ 74 | --CPU 10 \ 75 | --genome_lib_dir ctat_genome_lib_build_dir \ 76 | --output_dir ${sample_name} 77 | 78 | 79 | cp ${sample_name}/TrinityFusion-D.fusion_predictions.tsv ${sample_name}.TrinityFusion-D.fusion_predictions.tsv 80 | 81 | gzip ${sample_name}.TrinityFusion-D.fusion_predictions.tsv 82 | 83 | >>> 84 | 85 | output { 86 | File TrinityFusion_D="${sample_name}.TrinityFusion-D.fusion_predictions.tsv.gz" 87 | } 88 | 89 | 90 | runtime { 91 | docker: "trinityctat/trinityfusion:0.2.0" 92 | disks: "local-disk 500 SSD" 93 | memory: "30G" 94 | cpu: "10" 95 | preemptible: 0 96 | maxRetries: 0 97 | } 98 | } 99 | 100 | 101 | 102 | workflow trinity_fusion_wf { 103 | Boolean? TrinityFusion_C 104 | Boolean? TrinityFusion_UC 105 | Boolean? TrinityFusion_D 106 | 107 | String sample_name 108 | File left_fq 109 | File right_fq 110 | File genome_lib_tar 111 | 112 | File? chimeric_junctions_file 113 | File? star_aligned_bam 114 | 115 | 116 | if (defined(TrinityFusion_UC)) { 117 | call TRINITY_FUSION_UC_TASK { 118 | input: 119 | sample_name=sample_name, 120 | left_fq=left_fq, 121 | right_fq=right_fq, 122 | genome_lib_tar=genome_lib_tar, 123 | chimeric_junctions_file=chimeric_junctions_file, 124 | aligned_bam=star_aligned_bam 125 | } 126 | } 127 | 128 | if (defined(TrinityFusion_D)) { 129 | call TRINITY_FUSION_D_TASK { 130 | input: 131 | sample_name=sample_name, 132 | left_fq=left_fq, 133 | right_fq=right_fq, 134 | genome_lib_tar=genome_lib_tar 135 | } 136 | } 137 | 138 | } 139 | 140 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/wdl/inputs.json: -------------------------------------------------------------------------------- 1 | { 2 | "trinity_fusion_wf.genome_lib_tar": "inputs/ctat_testkit_genome_lib_dir.star1.5.tar", 3 | "trinity_fusion_wf.left_fq": "inputs/rnaseq_1.fastq.gz", 4 | "trinity_fusion_wf.right_fq": "inputs/rnaseq_2.fastq.gz", 5 | "trinity_fusion_wf.TrinityFusion_D": "true", 6 | "trinity_fusion_wf.sample_name": "mysample" 7 | } 8 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/wdl/make_wdl_input_template.sh: -------------------------------------------------------------------------------- 1 | java -jar ~/utilities/wdltool-0.12.jar inputs TrinityFusion.wdl 2 | -------------------------------------------------------------------------------- /alt_methods/TrinityFusion/wdl/run.sh: -------------------------------------------------------------------------------- 1 | java -jar ~/utilities/cromwell-39.jar run TrinityFusion.wdl --inputs inputs.json 2 | -------------------------------------------------------------------------------- /alt_methods/arriba/arriba_wrapper.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through); 7 | use FindBin; 8 | use lib ("$FindBin::Bin/../../PerlLib"); 9 | use Process_cmd; 10 | 11 | 12 | my $usage = <<__EOUSAGE__; 13 | 14 | ######################################################################### 15 | # 16 | # Required: 17 | # 18 | # --left_reads left reads file (reads1.fastq.gz) 19 | # --right_reads right reads file (reads2.fastq.gz) 20 | # --arriba_singularity_img arriba singularity img 21 | # --arriba_references_dir arriba references directory 22 | # --output_dir output directory 23 | # 24 | # Optional: 25 | # 26 | # --mount dirctory to mount 27 | # 28 | ######################################################################## 29 | 30 | __EOUSAGE__ 31 | 32 | 33 | ; 34 | 35 | 36 | 37 | my $help_flag; 38 | my $left_reads; 39 | my $right_reads; 40 | my $arriba_singularity_img; 41 | my $arriba_references_dir; 42 | my $output_dir; 43 | my $mount = ""; 44 | 45 | &GetOptions ( 'h' => \$help_flag, 46 | 47 | ## all required 48 | 'left_reads=s' => \$left_reads, 49 | 'right_reads=s' => \$right_reads, 50 | 'arriba_singularity_img=s' => \$arriba_singularity_img, 51 | 'arriba_references_dir=s' => \$arriba_references_dir, 52 | 'output_dir=s' => \$output_dir, 53 | 54 | # optional 55 | 'mount=s' => \$mount, 56 | ); 57 | 58 | 59 | if ($help_flag) { 60 | die $usage; 61 | } 62 | 63 | unless($left_reads && $right_reads && $arriba_singularity_img && $arriba_references_dir && $output_dir) { 64 | die $usage; 65 | } 66 | 67 | if ($mount) { 68 | $mount = &ensure_full_path($mount); 69 | $mount = " -B $mount "; 70 | } 71 | 72 | $left_reads = &ensure_full_path($left_reads); 73 | $right_reads = &ensure_full_path($right_reads); 74 | $arriba_singularity_img = &ensure_full_path($arriba_singularity_img); 75 | $arriba_references_dir = &ensure_full_path($arriba_references_dir); 76 | $output_dir = &ensure_full_path($output_dir); 77 | 78 | 79 | main: { 80 | 81 | unless (-d $output_dir) { 82 | &process_cmd("mkdir -p $output_dir"); 83 | } 84 | 85 | my $cmd = "singularity exec -e $mount" 86 | . " -B $output_dir:/output " 87 | . " -B $arriba_references_dir:/references:ro " 88 | . " -B $left_reads:/read1.fastq.gz:ro " 89 | . " -B $right_reads:/read2.fastq.gz:ro " 90 | . " $arriba_singularity_img arriba.sh "; 91 | 92 | &process_cmd($cmd); 93 | 94 | } 95 | 96 | -------------------------------------------------------------------------------- /alt_methods/arriba/uger/arriba.uger.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf arriba.uger.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7 --name arriba 2 | -------------------------------------------------------------------------------- /alt_methods/arriba/uger/arriba.uger.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=ARRIBA 9 | USE_GZIP_FIFO=FALSE 10 | CMD=/home/unix/bhaas/GITHUB/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/arriba/arriba_wrapper.pl --left_reads {__LEFT_FQ__} --right_reads {__RIGHT_FQ__} --arriba_singularity_img /seq/RNASEQ/TOOLS/ARRIBA/SINGULARITY/arriba-1.1.0.simg --arriba_references_dir /seq/RNASEQ/TOOLS/ARRIBA/references --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --mount /seq/RNASEQ 11 | -------------------------------------------------------------------------------- /alt_methods/prada/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | MAINTAINER bhaas@broadinstitute.org 3 | 4 | RUN apt-get update && apt-get install -y gcc g++ perl python automake make \ 5 | wget git curl libdb-dev \ 6 | zlib1g-dev bzip2 libncurses5-dev \ 7 | texlive-latex-base \ 8 | default-jre \ 9 | python-pip python-dev \ 10 | gfortran \ 11 | build-essential libghc-zlib-dev libncurses-dev libbz2-dev liblzma-dev libpcre3-dev libxml2-dev \ 12 | libblas-dev gfortran git unzip ftp libzmq3-dev nano ftp fort77 libreadline-dev \ 13 | libcurl4-openssl-dev libx11-dev libxt-dev \ 14 | x11-common libcairo2-dev libpng12-dev libreadline6-dev libjpeg8-dev pkg-config libtbb-dev \ 15 | && apt-get clean 16 | 17 | 18 | RUN sed -i -e 's/:\/\/(archive.ubuntu.com\|security.ubuntu.com)/old-releases.ubuntu.com/g' /etc/apt/sources.list 19 | 20 | 21 | ## install old java-7 22 | RUN apt-get update && apt-get install -y python3-software-properties software-properties-common 23 | 24 | RUN add-apt-repository ppa:openjdk-r/ppa && \ 25 | apt-get update && apt-get install -y openjdk-7-jdk 26 | 27 | 28 | RUN mv /usr/bin/java /usr/bin/java8 && \ 29 | ln -s /usr/lib/jvm/java-7-openjdk-amd64/bin/java /usr/bin/java7 && \ 30 | ln -s /usr/bin/java7 /usr/bin/java 31 | 32 | 33 | 34 | ## install prada 35 | 36 | WORKDIR /usr/local/src 37 | RUN wget https://downloads.sourceforge.net/project/prada/pyPRADA/pyPRADA_1.2.tar.gz && \ 38 | tar xvf pyPRADA_1.2.tar.gz 39 | 40 | -------------------------------------------------------------------------------- /alt_methods/prada/VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.2 2 | -------------------------------------------------------------------------------- /alt_methods/prada/build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | VERSION=`cat VERSION.txt` 6 | 7 | docker build -t fusiontranscripts/prada:${VERSION} . 8 | docker build -t fusiontranscripts/prada:latest . 9 | -------------------------------------------------------------------------------- /alt_methods/star-seqr/docker/make_simg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.6.7 4 | 5 | singularity build star-seqr.v${VERSION}.simg docker://eagenomics/starseqr:$VERSION 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /alt_methods/star-seqr/docker/run_test.sh: -------------------------------------------------------------------------------- 1 | singularity run -e -B /home/bhaas star-seqr.v0.6.7.simg starseqr.py -1 ~/garb/reads_1.fq.gz -2 ~/garb/reads_2.fq.gz -i /home/bhaas/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa.star.idx -g /home/bhaas/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_annot.gtf -r /home/bhaas/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa -m 1 -vv -p `pwd`/seqrout 2 | 3 | -------------------------------------------------------------------------------- /alt_methods/star-seqr/uger/star-seqr.uger.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf star-seqr.uger.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7 --name starseqr 2 | -------------------------------------------------------------------------------- /alt_methods/star-seqr/uger/star-seqr.uger.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=STARSEQR 9 | USE_GZIP_FIFO=FALSE 10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/STAR-SEQR/SINGULARITY/star-seqr.v0.6.7.simg starseqr.py -1 {__LEFT_FQ__} -2 {__RIGHT_FQ__} -i /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa.star.idx -g /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_annot.gtf -r /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa -m 1 -vv -p {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} -t 1 11 | -------------------------------------------------------------------------------- /alt_methods/starchip/Docker/VERSION.txt: -------------------------------------------------------------------------------- 1 | 1.3ec 2 | -------------------------------------------------------------------------------- /alt_methods/starchip/Docker/build_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | VERSION=`cat VERSION.txt` 6 | 7 | docker build -t fusiontranscripts/starchip:${VERSION} . 8 | docker build -t fusiontranscripts/starchip:latest . 9 | -------------------------------------------------------------------------------- /alt_methods/starchip/Docker/make_simg.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=`cat VERSION.txt` 4 | 5 | singularity build starchip.v${VERSION}.simg docker://fusiontranscripts/starchip:$VERSION 6 | -------------------------------------------------------------------------------- /alt_methods/starchip/Docker/push_docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | VERSION=`cat VERSION.txt` 6 | 7 | docker push fusiontranscripts/starchip:${VERSION} 8 | docker push fusiontranscripts/starchip:latest 9 | -------------------------------------------------------------------------------- /alt_methods/starchip/Docker/starchip_wrapper.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through); 7 | use File::Basename; 8 | use FindBin; 9 | use lib ("$FindBin::Bin"); 10 | use Pipeliner; 11 | 12 | 13 | my $help_flag; 14 | 15 | my $output_token = "starchip"; 16 | my $chim_seg_min = 15; 17 | 18 | my $usage = <<__EOUSAGE; 19 | 20 | ###################################################################################################### 21 | # 22 | # Required: 23 | 24 | # --left_fq reads_1.fq.gz 25 | # 26 | # --right_fq reads_2.fq.gz 27 | # 28 | # --starchip_parameters_file the starchip parameters file (indicates where the star index is) 29 | # 30 | # --output_dir output directory 31 | # 32 | # Optional: 33 | # 34 | # --output_token token for output files (default: $output_token) 35 | # 36 | # --chim_seg_min value for STAR --chimSegmentMin and --chimJunctionOverhangMin (default: $chim_seg_min) 37 | # 38 | ####################################################################################################### 39 | 40 | __EOUSAGE 41 | 42 | ; 43 | 44 | 45 | my $left_fq; 46 | my $right_fq; 47 | my $starchip_parameters_file; 48 | my $output_dir; 49 | 50 | 51 | &GetOptions ( 'h' => \$help_flag, 52 | 'left_fq=s' => \$left_fq, 53 | 'right_fq=s' => \$right_fq, 54 | 'starchip_parameters_file=s' => \$starchip_parameters_file, 55 | 'chim_seg_min=i' => \$chim_seg_min, 56 | 'output_dir=s' => \$output_dir); 57 | 58 | 59 | if ($help_flag) { 60 | die $usage; 61 | } 62 | 63 | unless ($left_fq && $right_fq && $starchip_parameters_file && $output_dir) { 64 | die $usage; 65 | } 66 | 67 | $left_fq = Pipeliner::ensure_full_path($left_fq); 68 | $right_fq = Pipeliner::ensure_full_path($right_fq); 69 | $starchip_parameters_file = Pipeliner::ensure_full_path($starchip_parameters_file); 70 | $output_dir = Pipeliner::ensure_full_path($output_dir); 71 | 72 | 73 | 74 | main: { 75 | 76 | my $starchip_reference_dirname = dirname($starchip_parameters_file); 77 | 78 | my $star_index_dir = "$starchip_reference_dirname/ref_genome.fa.star.idx"; 79 | 80 | if (! -d $output_dir) { 81 | &Pipeliner::process_cmd("mkdir -p $output_dir"); 82 | } 83 | chdir($output_dir) or die "Error, cannot cd to $output_dir"; 84 | 85 | &Pipeliner::process_cmd("ln -sf $starchip_reference_dirname"); 86 | 87 | ## Run STAR: 88 | my $cmd = "STAR --genomeDir $star_index_dir " 89 | . " --readFilesIn $left_fq $right_fq " 90 | . " --outReadsUnmapped Fastx " 91 | . " --quantMode GeneCounts " 92 | . " --chimSegmentMin $chim_seg_min " 93 | . " --chimJunctionOverhangMin $chim_seg_min " 94 | . " --outSAMstrandField intronMotif " 95 | . " --readFilesCommand zcat " 96 | . " --outSAMtype BAM Unsorted "; 97 | 98 | my $chkpt_dir = "__starchip_chkpts"; 99 | my $pipeliner = new Pipeliner( '-checkpoint_dir' => $chkpt_dir, '-verbose' => 2 ); 100 | 101 | $pipeliner->add_commands( new Command($cmd, "star_align.ok") ); 102 | 103 | ## run STARChip 104 | 105 | $cmd = "/usr/local/src/starchip-1.3e/starchip-fusions.pl $output_token Chimeric.out.junction $starchip_reference_dirname/hg19.parameters.txt"; 106 | 107 | $pipeliner->add_commands( new Command($cmd, "starchip.ok") ); 108 | 109 | $pipeliner->run(); 110 | 111 | exit(0); 112 | 113 | } 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /alt_methods/starchip/README.md: -------------------------------------------------------------------------------- 1 | instructions found at: https://github.com/LosicLab/starchip/tree/master/example 2 | 3 | ## starchip setup 4 | cp ~/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa . 5 | 6 | cp ~/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_annot.gtf . 7 | 8 | singularity shell -e starchip.v1.3e.simg 9 | 10 | /usr/local/src/starchip-1.3e/setup.sh ref_annot.gtf ref_genome.fa references/ 11 | 12 | cd references 13 | wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz 14 | gunzip rmsk.txt.gz 15 | cut -f6-8 rmsk.txt > hg19.repeats.bed 16 | 17 | 18 | 19 | ## run test 20 | 21 | singularity shell -e starchip.v1.3e.simg 22 | 23 | - star alignment, using STAR v2.5.3a 24 | 25 | STAR --genomeDir ../CTAT_GENOMICS/genome_libs_StarFpre-v1.3/GRCh37_gencode_v19_CTAT_lib_Nov012017/ctat_genome_lib_build_dir/ref_genome.fa.star.idx --readFilesIn ../GITHUB/CTAT_FUSIONS/STAR-Fusion/testing/reads_1.fq.gz ../GITHUB/CTAT_FUSIONS/STAR-Fusion/testing/reads_2.fq.gz --outReadsUnmapped Fastx --quantMode GeneCounts --chimSegmentMin 15 --chimJunctionOverhangMin 15 --outSAMstrandField intronMotif --readFilesCommand zcat --outSAMtype BAM Unsorted 26 | 27 | - run example 28 | 29 | /usr/local/src/starchip-1.3e/starchip-fusions.pl ladeda2 reference/example/Chimeric.out.junction reference/hg19.parameters.txt 30 | 31 | 32 | -------------------------------------------------------------------------------- /alt_methods/starchip/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -rf test_outdir/ 4 | -------------------------------------------------------------------------------- /alt_methods/starchip/run_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | ## download the starchip reference bundle and unpack 6 | ## replace /seq/RNASEQ/TOOLS/STARCHIP/reference 7 | ## replace /seq/RNASEQ/TOOLS/STARCHIP/SINGULARITY/starchip.v1.3e.simg with your location for the simg. 8 | 9 | 10 | singularity exec -e -B `pwd` \ 11 | -B /seq/RNASEQ/TOOLS/STARCHIP/reference:/usr/local/src/reference \ 12 | /seq/RNASEQ/TOOLS/STARCHIP/SINGULARITY/starchip.v1.3e.simg \ 13 | /usr/local/bin/starchip_wrapper.pl \ 14 | --left_fq test_data/reads_1.fq.gz \ 15 | --right_fq test_data/reads_2.fq.gz \ 16 | --starchip_parameters_file /usr/local/src/reference/hg19.parameters \ 17 | --output_dir `pwd`/test_outdir 18 | 19 | -------------------------------------------------------------------------------- /alt_methods/starchip/test_data/reads_1.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_data/reads_1.fq.gz -------------------------------------------------------------------------------- /alt_methods/starchip/test_data/reads_2.fq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_data/reads_2.fq.gz -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/Aligned.out.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_outdir/Aligned.out.bam -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/Log.final.out: -------------------------------------------------------------------------------- 1 | Started job on | Apr 05 15:37:29 2 | Started mapping on | Apr 05 15:37:55 3 | Finished on | Apr 05 15:37:58 4 | Mapping speed, Million of reads per hour | 6.03 5 | 6 | Number of input reads | 5026 7 | Average input read length | 100 8 | UNIQUE READS: 9 | Uniquely mapped reads number | 1522 10 | Uniquely mapped reads % | 30.28% 11 | Average mapped length | 91.19 12 | Number of splices: Total | 725 13 | Number of splices: Annotated (sjdb) | 682 14 | Number of splices: GT/AG | 715 15 | Number of splices: GC/AG | 7 16 | Number of splices: AT/AC | 0 17 | Number of splices: Non-canonical | 3 18 | Mismatch rate per base, % | 0.69% 19 | Deletion rate per base | 0.01% 20 | Deletion average length | 1.17 21 | Insertion rate per base | 0.01% 22 | Insertion average length | 1.00 23 | MULTI-MAPPING READS: 24 | Number of reads mapped to multiple loci | 280 25 | % of reads mapped to multiple loci | 5.57% 26 | Number of reads mapped to too many loci | 0 27 | % of reads mapped to too many loci | 0.00% 28 | UNMAPPED READS: 29 | % of reads unmapped: too many mismatches | 0.00% 30 | % of reads unmapped: too short | 64.15% 31 | % of reads unmapped: other | 0.00% 32 | CHIMERIC READS: 33 | Number of chimeric reads | 2207 34 | % of chimeric reads | 43.91% 35 | -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/Log.progress.out: -------------------------------------------------------------------------------- 1 | Time Speed Read Read Mapped Mapped Mapped Mapped Unmapped Unmapped Unmapped Unmapped 2 | M/hr number length unique length MMrate multi multi+ MM short other 3 | ALL DONE! 4 | -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/__starchip_chkpts/pipeliner.2350.cmds: -------------------------------------------------------------------------------- 1 | STAR --genomeDir /usr/local/src/reference/ref_genome.fa.star.idx --readFilesIn /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_1.fq.gz /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_2.fq.gz --outReadsUnmapped Fastx --quantMode GeneCounts --chimSegmentMin 15 --chimJunctionOverhangMin 15 --outSAMstrandField intronMotif --readFilesCommand zcat --outSAMtype BAM Unsorted 2 | -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/__starchip_chkpts/pipeliner.2900.cmds: -------------------------------------------------------------------------------- 1 | STAR --genomeDir /usr/local/src/reference/ref_genome.fa.star.idx --readFilesIn /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_1.fq.gz /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_2.fq.gz --outReadsUnmapped Fastx --quantMode GeneCounts --chimSegmentMin 15 --chimJunctionOverhangMin 15 --outSAMstrandField intronMotif --readFilesCommand zcat --outSAMtype BAM Unsorted 2 | /usr/local/src/starchip-1.3e/starchip-fusions.pl starchip Chimeric.out.junction /usr/local/src/reference/hg19.parameters.txt 3 | -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/__starchip_chkpts/star_align.ok: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_outdir/__starchip_chkpts/star_align.ok -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/__starchip_chkpts/starchip.ok: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_outdir/__starchip_chkpts/starchip.ok -------------------------------------------------------------------------------- /alt_methods/starchip/test_outdir/starchip.summary: -------------------------------------------------------------------------------- 1 | Partner1 Partner2 SpanningReads SplitReads AvgAS NearGene1 Distance1 NearGene2 Distance2 ConsensusSeq 2 | chr17:48943419:- chr17:35880751:- 26 8 0 TOB1 0 SYNRG 0 . 3 | chr17:38243106:+ chr17:46371709:+ 72 25 0 THRA 0 AC090627.1 0 . 4 | chr20:56886178:+ chr19:17256207:+ 8 5 0 RAB22A 0 MYO9B 0 . 5 | chr4:76846964:+ chr1:247094880:+ 22 2 0 NAAA 0 AHCTF1 0 . 6 | chr17:57970686:+ chr17:47021337:- 17 9 0 RPS6KB1 0 SNF8 0 . 7 | chr17:48548389:- chr17:37595418:+ 5 4 0 ACSF2 0 MED1 0 . 8 | chr17:37374426:+ chr17:35479453:+ 40 4 0 STAC2 0 ACACA 0 . 9 | chr17:46384693:- chr17:38243106:- 18 5 0 AC090627.1 0 THRA 0 . 10 | -------------------------------------------------------------------------------- /alt_methods/starchip/uger/starchip.uger.cmd: -------------------------------------------------------------------------------- 1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf starchip.uger.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7 --name starchip 2 | -------------------------------------------------------------------------------- /alt_methods/starchip/uger/starchip.uger.conf: -------------------------------------------------------------------------------- 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited. 2 | 3 | [GLOBALS] 4 | USE_QTRIM_READS=F 5 | 6 | [CUSTOM_050] 7 | RUN=T 8 | CUSTOM_DIR=STARCHIP_csm10 9 | USE_GZIP_FIFO=FALSE 10 | CMD=singularity exec -e -B /seq/RNASEQ -B /seq/RNASEQ/TOOLS/STARCHIP/reference:/usr/local/src/reference /seq/RNASEQ/TOOLS/STARCHIP/SINGULARITY/starchip.v1.3eb.simg /usr/local/bin/starchip_wrapper.pl --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --starchip_parameters_file /usr/local/src/reference/hg19.parameters --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --chim_seg_min 10 11 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/ARRIBA_hc_parser.pm: -------------------------------------------------------------------------------- 1 | package ARRIBA_hc_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =ARRIBA_format 9 | 10 | 11 | 0 #gene1 12 | 1 gene2 13 | 2 strand1(gene/fusion) 14 | 3 strand2(gene/fusion) 15 | 4 breakpoint1 16 | 5 breakpoint2 17 | 6 site1 18 | 7 site2 19 | 8 type 20 | 9 direction1 21 | 10 direction2 22 | 11 split_reads1 23 | 12 split_reads2 24 | 13 discordant_mates 25 | 14 coverage1 26 | 15 coverage2 27 | 16 confidence 28 | 17 closest_genomic_breakpoint1 29 | 18 closest_genomic_breakpoint2 30 | 19 filters 31 | 20 fusion_transcript 32 | 21 reading_frame 33 | 22 peptide_sequence 34 | 23 read_identifiers 35 | 36 | 37 | 0 PID1 38 | 1 DAP 39 | 2 -/- 40 | 3 -/- 41 | 4 2:230020534 42 | 5 5:10681281 43 | 6 splice-site 44 | 7 splice-site 45 | 8 translocation 46 | 9 upstream 47 | 10 downstream 48 | 11 305 49 | 12 304 50 | 13 300 51 | 14 8156 52 | 15 6135 53 | 16 high 54 | 17 . 55 | 18 . 56 | 19 duplicates(79),mismatches(9) 57 | 20 ACACCGACCCCAGATGTAAAGCGGGACCCCAGCCCCTCGCCCCCCGGCGCGATCGACAGTCTCGCCAGCGTCTCCTCTGCCAAAACCCAGGGCTGGAAGATGTGGCAGCCGGCCACGGAGCGCCTGCAG___CACTTTCAGACCATGCTGAAGTCTAAATTGAATGTCTTAACACTGAAAAAGGAACCTCTCCCAGCGGTCATCTTCCATGAGCCGGAGGCCATTGAGCTGTGCACGACCACACCGCTGATGAAGACAAGGACTCACAGTGGCTGCAAG|GGTGACAAAGATTTCCCCCCGGCGGCTGCGCAGGTGGCTCACCAGAAGCCGCATGCCTCCATGGACAAGCATCCTTCCCCAAGAACCCAGCACATCCAGCAGCCACGCAAGTGAGCCTGGAGTCCACCAGCCTGCCCCATGGCCCCGGCTCTGCTGCACTTGGTATTTCCCTGACAGAGAGAACCAGCAGTTTCGCCCAAATCCTACTCTGCTGGGAAATCTAAGGCAAAACCAAGTGCTCTGTCCTTTGCCTTACATTTCCATATTTAAAACTAGAAACAGCTCCAGC 58 | 21 in-frame 59 | 22 MWQPATERLQHFQTMLKSKLNVLTLKKEPLPAVIFHEPEAIELCTTTPLMKTRTHSGCK|GDKDFPPAAAQVAHQKPHASMDKHPSPRTQHIQQPRK* 60 | 23 . 61 | 62 | 63 | =cut 64 | 65 | 66 | 67 | sub parse_fusion_result_file { 68 | my ($file) = @_; 69 | 70 | my @fusions; 71 | 72 | open (my $fh, $file) or die "Error, cannot open file $file"; 73 | my $header = <$fh>; 74 | while (<$fh>) { 75 | chomp; 76 | my @x = split(/\t/); 77 | 78 | my $conf_level = $x[16]; 79 | unless ($conf_level =~ /high/i) { next; } ### ARRIBA hc requires high confidence predictions only 80 | 81 | my $geneA = $x[0]; 82 | my $geneB = $x[1]; 83 | 84 | $geneA =~ s/\(\d+\)//g; 85 | $geneB =~ s/\(\d+\)//g; 86 | 87 | my $coord_info_A = $x[4]; 88 | my ($chrA, $coordA) = split(/:/, $coord_info_A); 89 | 90 | my $coord_info_B = $x[5]; 91 | my ($chrB, $coordB) = split(/:/, $coord_info_B); 92 | 93 | my $junction_read_count = $x[11] + $x[12]; 94 | my $spanning_frags = $x[13]; 95 | 96 | 97 | 98 | my $struct = { 99 | geneA => $geneA, 100 | chrA => $chrA, 101 | coordA => $coordA, 102 | 103 | geneB => $geneB, 104 | chrB => $chrB, 105 | coordB => $coordB, 106 | 107 | span_reads => $spanning_frags, 108 | junc_reads => $junction_read_count, 109 | }; 110 | 111 | push (@fusions, $struct); 112 | 113 | } 114 | 115 | close $fh; 116 | 117 | return(@fusions); 118 | } 119 | 120 | 121 | 1; #EOM 122 | 123 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/ARRIBA_parser.pm: -------------------------------------------------------------------------------- 1 | package ARRIBA_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =ARRIBA_format 9 | 10 | 11 | 0 #gene1 12 | 1 gene2 13 | 2 strand1(gene/fusion) 14 | 3 strand2(gene/fusion) 15 | 4 breakpoint1 16 | 5 breakpoint2 17 | 6 site1 18 | 7 site2 19 | 8 type 20 | 9 direction1 21 | 10 direction2 22 | 11 split_reads1 23 | 12 split_reads2 24 | 13 discordant_mates 25 | 14 coverage1 26 | 15 coverage2 27 | 16 confidence 28 | 17 closest_genomic_breakpoint1 29 | 18 closest_genomic_breakpoint2 30 | 19 filters 31 | 20 fusion_transcript 32 | 21 reading_frame 33 | 22 peptide_sequence 34 | 23 read_identifiers 35 | 36 | 37 | 0 PID1 38 | 1 DAP 39 | 2 -/- 40 | 3 -/- 41 | 4 2:230020534 42 | 5 5:10681281 43 | 6 splice-site 44 | 7 splice-site 45 | 8 translocation 46 | 9 upstream 47 | 10 downstream 48 | 11 305 49 | 12 304 50 | 13 300 51 | 14 8156 52 | 15 6135 53 | 16 high 54 | 17 . 55 | 18 . 56 | 19 duplicates(79),mismatches(9) 57 | 20 ACACCGACCCCAGATGTAAAGCGGGACCCCAGCCCCTCGCCCCCCGGCGCGATCGACAGTCTCGCCAGCGTCTCCTCTGCCAAAACCCAGGGCTGGAAGATGTGGCAGCCGGCCACGGAGCGCCTGCAG___CACTTTCAGACCATGCTGAAGTCTAAATTGAATGTCTTAACACTGAAAAAGGAACCTCTCCCAGCGGTCATCTTCCATGAGCCGGAGGCCATTGAGCTGTGCACGACCACACCGCTGATGAAGACAAGGACTCACAGTGGCTGCAAG|GGTGACAAAGATTTCCCCCCGGCGGCTGCGCAGGTGGCTCACCAGAAGCCGCATGCCTCCATGGACAAGCATCCTTCCCCAAGAACCCAGCACATCCAGCAGCCACGCAAGTGAGCCTGGAGTCCACCAGCCTGCCCCATGGCCCCGGCTCTGCTGCACTTGGTATTTCCCTGACAGAGAGAACCAGCAGTTTCGCCCAAATCCTACTCTGCTGGGAAATCTAAGGCAAAACCAAGTGCTCTGTCCTTTGCCTTACATTTCCATATTTAAAACTAGAAACAGCTCCAGC 58 | 21 in-frame 59 | 22 MWQPATERLQHFQTMLKSKLNVLTLKKEPLPAVIFHEPEAIELCTTTPLMKTRTHSGCK|GDKDFPPAAAQVAHQKPHASMDKHPSPRTQHIQQPRK* 60 | 23 . 61 | 62 | 63 | =cut 64 | 65 | 66 | 67 | sub parse_fusion_result_file { 68 | my ($file) = @_; 69 | 70 | my @fusions; 71 | 72 | open (my $fh, $file) or die "Error, cannot open file $file"; 73 | my $header = <$fh>; 74 | while (<$fh>) { 75 | chomp; 76 | my @x = split(/\t/); 77 | 78 | my $geneA = $x[0]; 79 | my $geneB = $x[1]; 80 | 81 | $geneA =~ s/\(\d+\)//g; 82 | $geneB =~ s/\(\d+\)//g; 83 | 84 | my $coord_info_A = $x[4]; 85 | my ($chrA, $coordA) = split(/:/, $coord_info_A); 86 | 87 | my $coord_info_B = $x[5]; 88 | my ($chrB, $coordB) = split(/:/, $coord_info_B); 89 | 90 | my $junction_read_count = $x[11] + $x[12]; 91 | my $spanning_frags = $x[13]; 92 | 93 | 94 | 95 | my $struct = { 96 | geneA => $geneA, 97 | chrA => $chrA, 98 | coordA => $coordA, 99 | 100 | geneB => $geneB, 101 | chrB => $chrB, 102 | coordB => $coordB, 103 | 104 | span_reads => $spanning_frags, 105 | junc_reads => $junction_read_count, 106 | }; 107 | 108 | push (@fusions, $struct); 109 | 110 | } 111 | 112 | close $fh; 113 | 114 | return(@fusions); 115 | } 116 | 117 | 118 | 1; #EOM 119 | 120 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/ChimeraScan_parser.pm: -------------------------------------------------------------------------------- 1 | package ChimeraScan_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | =chimerascan_format 8 | 9 | 0 #chrom5p 10 | 1 start5p 11 | 2 end5p 12 | 3 chrom3p 13 | 4 start3p 14 | 5 end3p 15 | 6 chimera_cluster_id 16 | 7 score 17 | 8 strand5p 18 | 9 strand3p 19 | 10 transcript_ids_5p 20 | 11 transcript_ids_3p 21 | 12 genes5p 22 | 13 genes3p 23 | 14 type 24 | 15 distance 25 | 16 total_frags 26 | 17 spanning_frags 27 | 18 unique_alignment_positions 28 | 19 isoform_fraction_5p 29 | 20 isoform_fraction_3p 30 | 21 breakpoint_spanning_reads 31 | 22 chimera_ids 32 | 33 | 0 chr17 34 | 1 38219062 35 | 2 38243105 36 | 3 chr17 37 | 4 46371708 38 | 5 46385190 39 | 6 CLUSTER41 40 | 7 138 41 | 8 + 42 | 9 + 43 | 10 ENST00000450525.2:0-1066,ENST00000450525.2:0-1213,ENST00000584985.1:0-1155,ENST00000546243.1:0-977,ENST00000394121.4:0-1131,ENST00000264637.4:0-1155,ENST00000546243.1:0-1124,ENST00000584985.1:0-1302,ENST00000394121.4:0-1278,ENST00000264637.4:0-1302 44 | 11 ENST00000421610.2:169-667,ENST00000604191.1:0-570,ENST00000421610.2:0-667 45 | 12 THRA 46 | 13 AC090627.1 47 | 14 Intrachromosomal 48 | 15 8121588 49 | 16 138 50 | 17 71 51 | 18 129 52 | 19 0.896103896104 53 | 20 1.0 54 | 21 >4910959/2;pos=4;strand=-,AAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCA,>2034127/1;pos=0;strand=-,CAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGT,>774891/2;pos=2;strand=-,AAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGT,>19849147/1;pos=8;strand=-,TGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCTG,>10432608/1;pos=7;strand=-,CTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCA,>820632/1;pos=5;strand=-,AACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAG,>5743073/2;pos=3;strand=-,AAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTC,>9753659/1;pos=7;strand=-,CTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCT,>20253102/2;pos=13;strand=-,ATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCGCAGTGTCAGCTAAAGAA,>15409265/1;pos=6;strand=-,ACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGC,>7246100/2;pos=1;strand=-,AAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTG,>13967970/1;pos=15;strand=-,GTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCTAAAGAAAC,>12125678/2;pos=1257;strand=+,CACCCGTGTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAAT,>17295693/1;pos=1259;strand=+,CCCGTGTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTT,>563772/1;pos=1260;strand=+,CCGTGTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTC,>3811711/2;pos=1264;strand=+,GTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGT,>12114145/1;pos=1267;strand=+,GTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCA,>4357847/2;pos=1268;strand=+,TGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAA,>8496244/2;pos=1270;strand=+,GACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGT,>20818261/1;pos=1272;strand=+,CTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGC,>10659063/2;pos=1272;strand=+,CTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGAGCAAGTGC,>7520969/1;pos=1273;strand=+,TTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCC,>10039588/1;pos=1282;strand=+,AAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCAACAGTGTCA 55 | 22 C5666846,C5666847,C6157374,C4569163,C3378772,C4569165,C3378770,C3378771,C5666883,C0478383,C0478385,C0478384,C5666848,C6157506,C0478515,C6157375,C6157376,C4569164,C4569205,C3378807 56 | 57 | =cut 58 | 59 | 60 | 61 | sub parse_fusion_result_file { 62 | my ($chimeraScan_file) = @_; 63 | 64 | my @fusions; 65 | 66 | open (my $fh, $chimeraScan_file) or die "Error, cannot open file $chimeraScan_file"; 67 | my $header = <$fh>; 68 | while (<$fh>) { 69 | if (/^\#/) { next; } 70 | chomp; 71 | my @x = split(/\t/); 72 | 73 | my $chrA = $x[0]; 74 | my $chrA_start = $x[1]; 75 | my $chrA_end = $x[2]; 76 | 77 | my $chrB = $x[3]; 78 | my $chrB_start = $x[4]; 79 | my $chrB_end = $x[5]; 80 | 81 | my $chrA_strand = $x[8]; 82 | my $chrB_strand = $x[9]; 83 | 84 | 85 | my $geneA = $x[12]; 86 | my $geneB = $x[13]; 87 | 88 | 89 | my $brkpt_A = ($chrA_strand eq '+') ? $chrA_end : $chrA_start; 90 | my $brkpt_B = ($chrB_strand eq '+') ? $chrB_end : $chrB_start; 91 | 92 | 93 | my $total_frags = $x[16]; 94 | 95 | my $junction_count = $x[17]; 96 | my $spanning_count = $total_frags - $junction_count; 97 | 98 | 99 | my $struct = { 100 | 101 | geneA => $geneA, 102 | chrA => $chrA, 103 | coordA => $brkpt_A, 104 | 105 | geneB => $geneB, 106 | chrB => $chrB, 107 | coordB => $brkpt_B, 108 | 109 | span_reads => $spanning_count, 110 | junc_reads => $junction_count, 111 | }; 112 | 113 | push (@fusions, $struct); 114 | } 115 | 116 | close $fh; 117 | 118 | return(@fusions); 119 | } 120 | 121 | 1; #EOM 122 | 123 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/DEFUSE_parser.pm: -------------------------------------------------------------------------------- 1 | package DEFUSE_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =defuse_format 9 | 10 | 0 cluster_id 11 | 1 splitr_sequence 12 | 2 splitr_count 13 | 3 splitr_span_pvalue 14 | 4 splitr_pos_pvalue 15 | 5 splitr_min_pvalue 16 | 6 adjacent 17 | 7 altsplice 18 | 8 break_adj_entropy1 19 | 9 break_adj_entropy2 20 | 10 break_adj_entropy_min 21 | 11 breakpoint_homology 22 | 12 breakseqs_estislands_percident 23 | 13 cdna_breakseqs_percident 24 | 14 deletion 25 | 15 est_breakseqs_percident 26 | 16 eversion 27 | 17 exonboundaries 28 | 18 expression1 29 | 19 expression2 30 | 20 gene1 31 | 21 gene2 32 | 22 gene_align_strand1 33 | 23 gene_align_strand2 34 | 24 gene_chromosome1 35 | 25 gene_chromosome2 36 | 26 gene_end1 37 | 27 gene_end2 38 | 28 gene_location1 39 | 29 gene_location2 40 | 30 gene_name1 41 | 31 gene_name2 42 | 32 gene_start1 43 | 33 gene_start2 44 | 34 gene_strand1 45 | 35 gene_strand2 46 | 36 genome_breakseqs_percident 47 | 37 genomic_break_pos1 48 | 38 genomic_break_pos2 49 | 39 genomic_strand1 50 | 40 genomic_strand2 51 | 41 interchromosomal 52 | 42 interrupted_index1 53 | 43 interrupted_index2 54 | 44 inversion 55 | 45 library_name 56 | 46 max_map_count 57 | 47 max_repeat_proportion 58 | 48 mean_map_count 59 | 49 min_map_count 60 | 50 num_multi_map 61 | 51 num_splice_variants 62 | 52 orf 63 | 53 read_through 64 | 54 repeat_proportion1 65 | 55 repeat_proportion2 66 | 56 span_count 67 | 57 span_coverage1 68 | 58 span_coverage2 69 | 59 span_coverage_max 70 | 60 span_coverage_min 71 | 61 splice_score 72 | 62 splicing_index1 73 | 63 splicing_index2 74 | 64 probability 75 | 76 | 0 3247 77 | 1 GCGCACTTCCCTGAGGACACTGTGGAGCAGAAGGCAGAAAGCGTGGGCAGAATTATGCCTCACACGGAGGTGAGCCCCTGACCAAGACTCCAAAGTCCCACCTCCCGTCACCCAGCTGGGGTGCACCCAGCTGGGACATCGGTTGCTTTCAGTGAGAGAGTCAAATGGCTCAC|CCAGGGCTCTCCCCAGATACCATTTCAAATTCCTGTTAATTTTATTTTAATCCTGAATTCTGAGTTTGAATGTATACCCAGATCAGCCCTGTCTTTGTTTTCACTCACTGGTGTGGATGTAGCATGCCTCCATTAAGCTTTTTATTAACTTGCCTTGTTTTTGTCTCTGGCCTCGTTACCT 78 | 2 6 79 | 3 0.0891263313789634 80 | 4 0.854900607940091 81 | 5 0.564814798312889 82 | 6 N 83 | 7 N 84 | 8 3.57167229721571 85 | 9 3.4325554405491 86 | 10 3.4325554405491 87 | 11 0 88 | 12 0 89 | 13 0 90 | 14 Y 91 | 15 0 92 | 16 N 93 | 17 N 94 | 18 2876 95 | 19 0 96 | 20 ENSG00000167107 97 | 21 ENSG00000227011 98 | 22 + 99 | 23 - 100 | 24 17 101 | 25 17 102 | 26 48552206 103 | 27 51065012 104 | 28 intron 105 | 29 downstream 106 | 30 ACSF2 107 | 31 C17orf112 108 | 32 48503519 109 | 33 51062880 110 | 34 + 111 | 35 + 112 | 36 0 113 | 37 48548600 114 | 38 51089613 115 | 39 + 116 | 40 - 117 | 41 N 118 | 42 - 119 | 43 - 120 | 44 N 121 | 45 defuse_outdir 122 | 46 1 123 | 47 0 124 | 48 1 125 | 49 1 126 | 50 0 127 | 51 1 128 | 52 N 129 | 53 N 130 | 54 0 131 | 55 0 132 | 56 8 133 | 57 1.23121459994238 134 | 58 1.40196698971541 135 | 59 1.40196698971541 136 | 60 1.23121459994238 137 | 61 2 138 | 62 - 139 | 63 - 140 | 64 0.510155154639065 141 | 142 | =cut 143 | 144 | 145 | 146 | 147 | sub parse_fusion_result_file { 148 | my ($defuse_out_file) = @_; 149 | 150 | my @fusions; 151 | 152 | open (my $fh, $defuse_out_file) or die "Error, cannot open file $defuse_out_file"; 153 | my $header = <$fh>; 154 | while (<$fh>) { 155 | chomp; 156 | my @x = split(/\t/); 157 | 158 | my $geneA = $x[30]; 159 | my $geneB = $x[31]; 160 | 161 | my $junction_count = $x[2]; # splitr_count 162 | unless ($junction_count =~ /\w/) { 163 | $junction_count = 0; 164 | } 165 | 166 | my $spanning_count = $x[56]; # span_count 167 | unless ($spanning_count =~ /\w/) { 168 | $spanning_count = 0; 169 | } 170 | 171 | my $chrA = $x[24]; 172 | my $brkpt_A = $x[37]; 173 | my $chrB = $x[25]; 174 | my $brkpt_B = $x[38]; 175 | 176 | my $struct = { 177 | 178 | geneA => $geneA, 179 | chrA => $chrA, 180 | coordA => $brkpt_A, 181 | 182 | geneB => $geneB, 183 | chrB => $chrB, 184 | coordB => $brkpt_B, 185 | 186 | span_reads => $spanning_count, 187 | junc_reads => $junction_count, 188 | }; 189 | 190 | push (@fusions, $struct); 191 | } 192 | 193 | return(@fusions); 194 | } 195 | 196 | 1; #EOM 197 | 198 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/EricScript_parser.pm: -------------------------------------------------------------------------------- 1 | package EricScript_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =ericscript_format 9 | 10 | 0 GeneName1 11 | 1 GeneName2 12 | 2 chr1 13 | 3 Breakpoint1 14 | 4 strand1 15 | 5 chr2 16 | 6 Breakpoint2 17 | 7 strand2 18 | 8 EnsemblGene1 19 | 9 EnsemblGene2 20 | 10 crossingreads 21 | 11 spanningreads 22 | 12 mean.insertsize 23 | 13 homology 24 | 14 fusiontype 25 | 15 InfoGene1 26 | 16 InfoGene2 27 | 17 JunctionSequence 28 | 18 GeneExpr1 29 | 19 GeneExpr2 30 | 20 GeneExpr_Fused 31 | 21 ES 32 | 22 GJS 33 | 23 US 34 | 24 EricScore 35 | 36 | 0 PPP1CB 37 | 1 SPDYA 38 | 2 2 39 | 3 28781842 40 | 4 + 41 | 5 2 42 | 6 28783907 43 | 7 + 44 | 8 ENSG00000213639 45 | 9 ENSG00000163806 46 | 10 42 47 | 11 31 48 | 12 210.54 49 | 13 ENSG00000186298 (93%) 50 | 14 Read-Through 51 | 15 protein phosphatase 1, catalytic subunit, beta isozyme [Source:HGNC Symbol;Acc:HGNC:9282] 52 | 16 speedy/RINGO cell cycle regulator family member A [Source:HGNC Symbol;Acc:HGNC:30613] 53 | 17 tctgcctatagcagccattgtggatgagaagatcttctgttgtcatggagGATTGTCACCAGACCTGCAATCTATGGAGCAGATTCGGAGAATTATGAGA 54 | 18 15.81 55 | 19 0.03 56 | 20 29.76 57 | 21 0.6967 58 | 22 0.667 59 | 23 0.738095238095238 60 | 24 0.981308796513694 61 | 62 | =cut 63 | 64 | 65 | sub parse_fusion_result_file { 66 | my ($ericscript_out_file) = @_; 67 | 68 | my @fusions; 69 | 70 | open (my $fh, $ericscript_out_file) or die "Error, cannot open file $ericscript_out_file"; 71 | my $header = <$fh>; 72 | while (<$fh>) { 73 | chomp; 74 | my @x = split(/\t/); 75 | 76 | my $geneA = $x[0]; 77 | my $geneB = $x[1]; 78 | 79 | my $junction_count = $x[10]; 80 | my $spanning_count = $x[11]; 81 | 82 | my $chrA = $x[2]; 83 | my $brkpt_A = $x[3]; 84 | my $chrB = $x[5]; 85 | my $brkpt_B = $x[6]; 86 | 87 | my $struct = { 88 | 89 | geneA => $geneA, 90 | chrA => $chrA, 91 | coordA => $brkpt_A, 92 | 93 | geneB => $geneB, 94 | chrB => $chrB, 95 | coordB => $brkpt_B, 96 | 97 | span_reads => $spanning_count, 98 | junc_reads => $junction_count, 99 | }; 100 | 101 | push (@fusions, $struct); 102 | } 103 | 104 | return(@fusions); 105 | } 106 | 107 | 108 | 1; #EOM 109 | 110 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/FusionCatcher_KP_parser.pm: -------------------------------------------------------------------------------- 1 | package FusionCatcher_KP_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | 9 | =FusionCatcher_format 10 | 11 | ## described at: https://github.com/ndaniel/fusioncatcher/blob/master/doc/manual.md 12 | 13 | 0 Fusion_gene_1 14 | 1 Fusion_gene_2 15 | 2 Count_paired-end_reads 16 | 3 Fusion_gene_symbol_1 17 | 4 Fusion_gene_symbol_2 18 | 5 Fusion_description 19 | 6 Analysis_status 20 | 7 Counts_of_common_mapping_reads 21 | 22 | 0 ENSG00000175121 23 | 1 ENSG00000178053 24 | 2 9465 25 | 3 WFDC5 26 | 4 MLF1 27 | 5 28 | 6 further_analysis 29 | 7 0 30 | 31 | 32 | 33 | =cut 34 | 35 | 36 | 37 | sub parse_fusion_result_file { 38 | 39 | my ($fusionCatcher_file) = @_; 40 | 41 | my @fusions; 42 | 43 | open (my $fh, $fusionCatcher_file) or die "Error, cannot open file $fusionCatcher_file"; 44 | my $header = <$fh>; 45 | while (<$fh>) { 46 | chomp; 47 | my @x = split(/\t/); 48 | 49 | my $geneA = $x[3]; 50 | my $geneB = $x[4]; 51 | 52 | unless ($geneA =~ /\w/ && $geneB =~ /\w/) { next; } # not scoring fusions not tied to genes. 53 | 54 | my $brkpt_A = $x[8]; 55 | my $brkpt_B = $x[9]; 56 | 57 | my $junction_count = $x[2]; 58 | 59 | my $struct = { 60 | 61 | geneA => $geneA, 62 | chrA => "NA", 63 | coordA => "NA", 64 | 65 | geneB => $geneB, 66 | chrB => "NA", 67 | coordB => "NA", 68 | 69 | span_reads => 0, # treat total count as junction here. 70 | junc_reads => $junction_count, 71 | }; 72 | 73 | push (@fusions, $struct); 74 | } 75 | 76 | close $fh; 77 | 78 | return(@fusions); 79 | } 80 | 81 | 1; #EOM 82 | 83 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/FusionCatcher_parser.pm: -------------------------------------------------------------------------------- 1 | package FusionCatcher_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | 9 | =FusionCatcher_format 10 | 11 | ## described at: https://github.com/ndaniel/fusioncatcher/blob/master/doc/manual.md 12 | 13 | 14 | 0 Gene_1_symbol(5end_fusion_partner) 15 | 1 Gene_2_symbol(3end_fusion_partner) 16 | 2 Fusion_description 17 | 3 Counts_of_common_mapping_reads 18 | 4 Spanning_pairs # Count of pair-end reads supporting the fusion 19 | 5 Spanning_unique_reads # Count of unique reads (i.e. unique mapping positions) mapping on the fusion junction. Shortly, here are counted all the reads which map on fusion junction minus the PCR duplicated reads. 20 | 6 Longest_anchor_found 21 | 7 Fusion_finding_method 22 | 8 Fusion_point_for_gene_1(5end_fusion_partner) 23 | 9 Fusion_point_for_gene_2(3end_fusion_partner) 24 | 10 Gene_1_id(5end_fusion_partner) 25 | 11 Gene_2_id(3end_fusion_partner) 26 | 12 Exon_1_id(5end_fusion_partner) 27 | 13 Exon_2_id(3end_fusion_partner) 28 | 14 Fusion_sequence 29 | 15 Predicted_effect 30 | 16 Predicted_fused_transcripts 31 | 17 Predicted_fused_proteins 32 | 33 | 0 THRA 34 | 1 THRA1/BTR 35 | 2 no_protein,antisense,known_fusion 36 | 3 0 37 | 4 74 38 | 5 20 39 | 6 25 40 | 7 BOWTIE 41 | 8 17:40086853:+ 42 | 9 17:48294347:+ 43 | 10 ENSG00000126351 44 | 11 ENSG00000235300 45 | 12 ENSE00000863335 46 | 13 ENSE00001677074 47 | 14 GTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAG*CAATTTCGAGTGCAAGTGCCACAGTGTCAGCTAAAG 48 | 15 CDS(truncated)/exonic(no-known-CDS) 49 | 50 | =cut 51 | 52 | 53 | 54 | sub parse_fusion_result_file { 55 | 56 | my ($fusionCatcher_file) = @_; 57 | 58 | my @fusions; 59 | 60 | open (my $fh, $fusionCatcher_file) or die "Error, cannot open file $fusionCatcher_file"; 61 | my $header = <$fh>; 62 | while (<$fh>) { 63 | chomp; 64 | my @x = split(/\t/); 65 | 66 | my $geneA = $x[0]; 67 | my $geneB = $x[1]; 68 | 69 | unless ($geneA =~ /\w/ && $geneB =~ /\w/) { next; } # not scoring fusions not tied to genes. 70 | 71 | my $brkpt_A = $x[8]; 72 | my $brkpt_B = $x[9]; 73 | 74 | my ($chrA, $coordA, $orientA) = split(/:/, $brkpt_A); 75 | my ($chrB, $coordB, $orientB) = split(/:/, $brkpt_B); 76 | 77 | my $spanning_count = $x[4]; 78 | my $junction_count = $x[5]; 79 | 80 | my $struct = { 81 | 82 | geneA => $geneA, 83 | chrA => $chrA, 84 | coordA => $coordA, 85 | 86 | geneB => $geneB, 87 | chrB => $chrB, 88 | coordB => $coordB, 89 | 90 | span_reads => $spanning_count, 91 | junc_reads => $junction_count, 92 | }; 93 | 94 | push (@fusions, $struct); 95 | } 96 | 97 | close $fh; 98 | 99 | return(@fusions); 100 | } 101 | 102 | 1; #EOM 103 | 104 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/FusionInspector_parser.pm: -------------------------------------------------------------------------------- 1 | package FusionInspector_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | sub parse_fusion_result_file { 8 | my ($FI_file) = @_; 9 | 10 | my @fusions; 11 | 12 | my $fh; 13 | 14 | if ($FI_file =~ /\.gz$/) { 15 | open ($fh, "gunzip -c $FI_file | ") or die "Error, cannot open file $FI_file"; 16 | } 17 | else { 18 | open ($fh, $FI_file) or die "Error, cannot open file $FI_file"; 19 | } 20 | 21 | my $header = <$fh>; 22 | 23 | my @x = split(/\t/, $header); 24 | my %idx; 25 | for (my $i = 0; $i <= $#x; $i++) { 26 | $idx{$x[$i]} = $i; 27 | } 28 | 29 | while (<$fh>) { 30 | chomp; 31 | my @x = split(/\t/); 32 | 33 | my $fusion = $x[ $idx{'#FusionName'} ]; 34 | my $junction_reads = $x[ $idx{'JunctionReadCount'} ]; 35 | my $spanning_reads = $x[ $idx{'SpanningFragCount'} ]; 36 | my $fusion_gene_A = $x[ $idx{'LeftGene'} ]; 37 | my $chr_coords_A = $x[ $idx{'LeftBreakpoint'} ]; 38 | my $fusion_gene_B = $x[ $idx{'RightGene'} ]; 39 | my $chr_coords_B = $x[ $idx{'RightBreakpoint'} ]; 40 | 41 | my $LeftBreakDinuc = uc $x[ $idx{'LeftBreakDinuc'} ]; 42 | my $RightBreakDinuc = uc $x[ $idx{'RightBreakDinuc'} ]; 43 | 44 | my $splice_combo = "${LeftBreakDinuc}-${RightBreakDinuc}"; 45 | if ($splice_combo !~ /^(GT\-AG|GC\-AG|CT\-AC)$/) { next; } # require canonical splice breakpoints, eliminate RT-artifacts 46 | 47 | my $rest; 48 | ($fusion_gene_A, $rest) = split(/\^/, $fusion_gene_A); 49 | ($fusion_gene_B, $rest) = split(/\^/, $fusion_gene_B); 50 | 51 | if ($junction_reads < 1) { next; } # require at least one junction read 52 | 53 | if ($fusion_gene_A eq $fusion_gene_B) { next; } # no self-fusions 54 | 55 | my ($chrA, $coordA, $orientA) = split(/:/, $chr_coords_A); 56 | my ($chrB, $coordB, $orientB) = split(/:/, $chr_coords_B); 57 | 58 | 59 | my $struct = { 60 | geneA => $fusion_gene_A, 61 | chrA => $chrA || ".", 62 | coordA => $coordA || ".", 63 | 64 | geneB => $fusion_gene_B, 65 | chrB => $chrB || ".", 66 | coordB => $coordB || ".", 67 | 68 | span_reads => $spanning_reads, 69 | junc_reads => $junction_reads, 70 | }; 71 | 72 | push (@fusions, $struct); 73 | 74 | } 75 | 76 | close $fh; 77 | 78 | 79 | return(@fusions); 80 | } 81 | 82 | 1; #EOM 83 | 84 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/InFusion_parser.pm: -------------------------------------------------------------------------------- 1 | package InFusion_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =infusion_format 9 | 10 | 0 #id 11 | 1 ref1 12 | 2 break_pos1 13 | 3 region1 14 | 4 ref2 15 | 5 break_pos2 16 | 6 region2 17 | 7 num_span # should be num_split 18 | 8 num_paired 19 | 9 genes_1 20 | 10 genes_2 21 | 11 fusion_class 22 | 23 | 0 5591 24 | 1 20 25 | 2 35689536 26 | 3 [35689535,35689672] 27 | 4 1 28 | 5 84946639 29 | 6 [84946634,84946695] 30 | 7 5 31 | 8 8210 32 | 9 RBL1 33 | 10 RPF1 34 | 11 inter-chromosomal 35 | 36 | =cut 37 | 38 | 39 | sub parse_fusion_result_file { 40 | my ($preds_file) = @_; 41 | 42 | my @fusions; 43 | 44 | open (my $fh, $preds_file) or die "Error, cannot open file $preds_file"; 45 | my $header = <$fh>; 46 | while (<$fh>) { 47 | chomp; 48 | my @x = split(/\t/); 49 | 50 | my $geneA = $x[9]; 51 | my $geneB = $x[10]; 52 | 53 | $geneA =~ s/;/,/g; 54 | $geneB =~ s/;/,/g; # others use commas instead of semicolons, so lets be consistent here. 55 | 56 | my $chrA = $x[1]; 57 | my $chrB = $x[4]; 58 | 59 | my $brkpt_A = $x[2]; 60 | my $brkpt_B = $x[5]; 61 | 62 | my $junction_count = $x[7]; 63 | my $spanning_count = $x[8]; 64 | 65 | my $struct = { 66 | 67 | geneA => $geneA, 68 | chrA => $chrA, 69 | coordA => $brkpt_A, 70 | 71 | geneB => $geneB, 72 | chrB => $chrB, 73 | coordB => $brkpt_B, 74 | 75 | span_reads => $spanning_count, 76 | junc_reads => $junction_count, 77 | }; 78 | 79 | push (@fusions, $struct); 80 | } 81 | 82 | close $fh; 83 | 84 | return(@fusions); 85 | } 86 | 87 | 88 | 89 | 1; #EOM 90 | 91 | 92 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/JAFFA_parser.pm: -------------------------------------------------------------------------------- 1 | package JAFFA_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =JAFFA_format 9 | 10 | ## described here: https://github.com/Oshlack/JAFFA/wiki/OutputDescription 11 | 12 | 0 "sample" 13 | 1 "fusion genes" 14 | 2 "chrom1" 15 | 3 "base1" 16 | 4 "chrom2" 17 | 5 "base2" 18 | 6 "gap (kb)" 19 | 7 "spanning pairs" # spanning: The number of read-pairs, where each read in the pair aligns entirely on either side of the breakpoint. You might see a "-" in some of these. This indicates that no spanning pairs were found, but that the contig had only a small amount of flanking sequence to align reads to. i.e. the spanning pairs results may not be indicative of the true support for the fusion event. 20 | 8 "spanning reads" # junction: The number of reads aligning to the breakpoint, with at least 15 bases of flanking sequence either side (by default). 21 | 9 "inframe" 22 | 10 "aligns" 23 | 11 "rearrangement" 24 | 12 "contig" 25 | 13 "contig break" 26 | 14 "classification" 27 | 15 "known" 28 | 29 | 0 "jaffa-direct" 30 | 1 "PROP1:FLRT1" 31 | 2 "chr5" 32 | 3 177421107 33 | 4 "chr11" 34 | 5 63883691 35 | 6 Inf 36 | 7 "13674" 37 | 8 3261 38 | 9 TRUE 39 | 10 TRUE 40 | 11 TRUE 41 | 12 "Locus_1_Transcript_2940/6203_Confidence_0.001_Length_3574" 42 | 13 3017 43 | 14 "HighConfidence" 44 | 15 "-" 45 | 46 | =cut 47 | 48 | sub parse_fusion_result_file { 49 | my ($jaffa_out_file) = @_; 50 | 51 | my @fusions; 52 | 53 | open (my $fh, $jaffa_out_file) or die "Error, cannot open file $jaffa_out_file"; 54 | my $header = <$fh>; 55 | while (<$fh>) { 56 | chomp; 57 | s/\"//g; 58 | 59 | my @x = split(/,/); 60 | 61 | my $fusion = $x[1]; 62 | my ($geneA, $geneB) = split(/:/, $fusion); 63 | 64 | my $junction_count = $x[8]; 65 | unless ($junction_count =~ /\w/) { 66 | $junction_count = 0; 67 | } 68 | 69 | my $spanning_count = $x[7]; 70 | unless ($spanning_count =~ /\w/) { 71 | $spanning_count = 0; 72 | } 73 | 74 | my $chrA = $x[2]; 75 | my $brkpt_A = $x[3]; 76 | my $chrB = $x[4]; 77 | my $brkpt_B = $x[5]; 78 | 79 | my $struct = { 80 | 81 | geneA => $geneA, 82 | chrA => $chrA, 83 | coordA => $brkpt_A, 84 | 85 | geneB => $geneB, 86 | chrB => $chrB, 87 | coordB => $brkpt_B, 88 | 89 | span_reads => $spanning_count, 90 | junc_reads => $junction_count, 91 | }; 92 | 93 | push (@fusions, $struct); 94 | } 95 | 96 | return(@fusions); 97 | } 98 | 99 | 100 | 1; #EOM 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/MapSplice_parser.pm: -------------------------------------------------------------------------------- 1 | package MapSplice_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | =mapsplice_format 8 | 9 | Go here: http://www.netlab.uky.edu/p/bioinfo/MapSplice2FusionJunctionFormat 10 | 11 | 0 chr1~chr1 12 | 1 8926354 13 | 2 236647038 14 | 3 FUSIONJUNC_827 15 | 4 2 # junction count (coverage: number of reads aligned to the fusion junction) 16 | 5 -+ 17 | 6 255,0,0 18 | 7 2 19 | 8 20,36,169,36, 20 | 9 0,227720721, 21 | 10 0.693147 22 | 11 3 23 | 12 CTGC 24 | 13 0 25 | 14 1 26 | 15 0.500000 27 | 16 20 28 | 17 0 29 | 18 10 30 | 19 2 31 | 20 0 32 | 21 2 33 | 22 2 34 | 23 0 35 | 24 0 36 | 25 2 37 | 26 0 38 | 27 8 # spanning count (encompassing_read pair_count: Number of reads pairs surround the fusion(but not cross the fusion)) 39 | 28 8926374 40 | 29 236647074 41 | 30 8926354,64M54P50M| 42 | 31 236647038,133M| 43 | 32 0 44 | 33 0 45 | 34 0.553333 46 | 35 0.413333 47 | 36 114 48 | 37 114 49 | 38 133 50 | 39 133 51 | 40 1.79176 52 | 41 0.01 53 | 42 1 54 | 43 114 55 | 44 133 56 | 45 194 57 | 46 100 58 | 47 162.5 59 | 48 4 60 | 49 0 61 | 50 4 62 | 51 0 63 | 52 not_matched 64 | 53 not_matched 65 | 54 GCGGGTTTGCTCCCAACATC 66 | 55 ATTTCTCCTTGATGACATTCTTCAG 67 | 56 1 68 | 57 from_fusion 69 | 58 fusion 70 | 59 -,+ 71 | 60 ENO1, 72 | 61 EDARADD, 73 | 74 | 75 | 76 | 77 | 78 | 79 | =cut 80 | 81 | 82 | sub parse_fusion_result_file { 83 | my ($mapsplice_out_file) = @_; 84 | 85 | my @fusions; 86 | 87 | my $get_unique_gene_list_sref = sub { 88 | my ($gene_txt) = @_; 89 | 90 | my %genes; 91 | my @fields = split(/,/, $gene_txt); 92 | foreach my $gene (@fields) { 93 | if ($gene) { 94 | $genes{$gene} = 1; 95 | } 96 | } 97 | 98 | my $unique_gene_list = join(",", keys %genes); 99 | 100 | return($unique_gene_list); 101 | }; 102 | 103 | 104 | open (my $fh, $mapsplice_out_file) or die "Error, cannot open file $mapsplice_out_file"; 105 | while (<$fh>) { 106 | chomp; 107 | my @x = split(/\t/); 108 | 109 | my $geneA = &$get_unique_gene_list_sref($x[60]); 110 | my $geneB = &$get_unique_gene_list_sref($x[61]); 111 | 112 | my $junction_count = $x[4]; 113 | my $spanning_count = $x[27]; 114 | 115 | my ($chrA, $chrB) = split(/\~/, $x[0]); 116 | unless ($chrA =~ /chr/ && $chrB =~ /chr/) { 117 | confess "Erorr, didn't parse chr vals from $x[0] of $_"; 118 | } 119 | 120 | $chrA =~ s/chr//; 121 | $chrB =~ s/chr//; 122 | 123 | my $brkpt_A = $x[1]; 124 | my $brkpt_B = $x[2]; 125 | 126 | my $struct = { 127 | 128 | geneA => $geneA, 129 | chrA => $chrA, 130 | coordA => $brkpt_A, 131 | 132 | geneB => $geneB, 133 | chrB => $chrB, 134 | coordB => $brkpt_B, 135 | 136 | span_reads => $spanning_count, 137 | junc_reads => $junction_count, 138 | }; 139 | 140 | push (@fusions, $struct); 141 | } 142 | 143 | return(@fusions); 144 | 145 | } 146 | 147 | 148 | 1; #EOM 149 | 150 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/NFuse_parser.pm: -------------------------------------------------------------------------------- 1 | package NFuse_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =nFUSE_format 9 | 10 | 0 cluster_id 11 | 1 adjacent 12 | 2 altsplice 13 | 3 break_adj_entropy1 14 | 4 break_adj_entropy2 15 | 5 break_adj_entropy_min 16 | 6 breakpoint_homology 17 | 7 breakpos1 18 | 8 breakpos2 19 | 9 cdna_breakseqs_percident 20 | 10 chromosome1 21 | 11 chromosome2 22 | 12 defuse_probability 23 | 13 deletion 24 | 14 est_breakseqs_percident 25 | 15 estisland_breakseqs_percident 26 | 16 eversion 27 | 17 exonboundaries 28 | 18 gene1 29 | 19 gene2 30 | 20 gene_align_strand1 31 | 21 gene_align_strand2 32 | 22 gene_chromosome1 33 | 23 gene_chromosome2 34 | 24 gene_end1 35 | 25 gene_end2 36 | 26 gene_location1 37 | 27 gene_location2 38 | 28 gene_name1 39 | 29 gene_name2 40 | 30 gene_start1 41 | 31 gene_start2 42 | 32 gene_strand1 43 | 33 gene_strand2 44 | 34 genome_breakseqs_percident 45 | 35 genomic_break_pos1 46 | 36 genomic_break_pos2 47 | 37 genomic_strand1 48 | 38 genomic_strand2 49 | 39 interchromosomal 50 | 40 inversion 51 | 41 library_name 52 | 42 max_map_count 53 | 43 mean_map_count 54 | 44 min_map_count 55 | 45 num_multi_map 56 | 46 num_splice_variants 57 | 47 orf 58 | 48 readthrough 59 | 49 reference1 60 | 50 reference2 61 | 51 repeat_list1 62 | 52 repeat_list2 63 | 53 repeat_proportion1 64 | 54 repeat_proportion2 65 | 55 repeat_proportion_max 66 | 56 sequence 67 | 57 span_count # spanning count 68 | 58 span_coverage1 69 | 59 span_coverage2 70 | 60 span_coverage_max 71 | 61 span_coverage_min 72 | 62 splice_score 73 | 63 splitreads_count # junction count 74 | 64 splitreads_min_pvalue 75 | 65 splitreads_pos_pvalue 76 | 66 splitreads_span_pvalue 77 | 67 strand1 78 | 68 strand2 79 | 80 | 0 16780 81 | 1 N 82 | 2 N 83 | 3 3.46807512488172 84 | 4 3.60632674564779 85 | 5 3.46807512488172 86 | 6 0 87 | 7 393 88 | 8 150364072 89 | 9 0 90 | 10 3 91 | 11 6 92 | 12 0.798023102489623 93 | 13 N 94 | 14 0 95 | 15 0 96 | 16 N 97 | 17 N 98 | 18 ENSG00000183396 99 | 19 ENSG00000213091 100 | 20 - 101 | 21 + 102 | 22 3 103 | 23 6 104 | 24 48659288 105 | 25 150364489 106 | 26 coding 107 | 27 intron 108 | 28 TMEM89 109 | 29 PHBP1 110 | 30 48658192 111 | 31 150363682 112 | 32 - 113 | 33 + 114 | 34 0 115 | 35 48658896 116 | 36 150364072 117 | 37 + 118 | 38 + 119 | 39 Y 120 | 40 N 121 | 41 tmp.defuse_outdir 122 | 42 2 123 | 43 1.33333333333333 124 | 44 1 125 | 45 2 126 | 46 1 127 | 47 N 128 | 48 N 129 | 49 ENSG00000183396|ENST00000330862 130 | 50 6 131 | 51 - 132 | 52 - 133 | 53 0 134 | 54 0 135 | 55 0 136 | 56 CCCACTCTGGGTGGAAGTCCCCTTTATTTGGATTTGCCGCTGGGTGGCTAGATGACGTAGGTGGCCTTCGATGTGGACCAGGAGGGCATCCAGCATGTGCAGGACCCCACGGAGCAGGGTGTGGTCTGAGATTGGGGCCCGCCGTTTCCAGGGTCCGCAGGGCTCAGTGGTCACCTGCGGATGC|ACCACTGACTTGAGGATCTCAGTCATGATGGACGTCAGCACACGCTCATCATAGTCCTCTCCGGTGATGGCGAAGATGCGAGGAAGCTGGCTAGAGACGGGCCGGAAGAGGATGCACAGTGTGATGTTGACATTCTGTAAATATTTGCTACCAGTGATGACTGGCACAGTA 137 | 57 6 138 | 58 0.780927305360403 139 | 59 0.615731144611087 140 | 60 0.780927305360403 141 | 61 0.615731144611087 142 | 62 1 143 | 63 1655 144 | 64 0.717074880359308 145 | 65 0.58076168734171 146 | 66 0.844296742119671 147 | 67 - 148 | 68 + 149 | 150 | 151 | =cut 152 | 153 | 154 | sub parse_fusion_result_file { 155 | my ($nFUSE_out_file) = @_; 156 | 157 | my @fusions; 158 | 159 | open (my $fh, $nFUSE_out_file) or die "Error, cannot open file $nFUSE_out_file"; 160 | my $header = <$fh>; 161 | while (<$fh>) { 162 | chomp; 163 | my @x = split(/\t/); 164 | 165 | my $geneA = $x[28]; 166 | my $geneB = $x[29]; 167 | 168 | my $chrA = $x[10]; 169 | my $chrB = $x[11]; 170 | 171 | my $brkpt_A = $x[35]; 172 | my $brkpt_B = $x[36]; 173 | 174 | my $spanning_count = $x[57]; 175 | my $junction_count = $x[63]; 176 | 177 | my $struct = { 178 | 179 | geneA => $geneA, 180 | chrA => $chrA, 181 | coordA => $brkpt_A, 182 | 183 | geneB => $geneB, 184 | chrB => $chrB, 185 | coordB => $brkpt_B, 186 | 187 | span_reads => $spanning_count, 188 | junc_reads => $junction_count, 189 | }; 190 | 191 | push (@fusions, $struct); 192 | } 193 | 194 | return(@fusions); 195 | } 196 | 197 | 198 | 1; #EOM 199 | 200 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/PRADA_parser.pm: -------------------------------------------------------------------------------- 1 | package PRADA_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =PRADA_format 9 | 10 | 0 Gene_A 11 | 1 Gene_B 12 | 2 A_chr 13 | 3 B_chr 14 | 4 A_strand 15 | 5 B_strand 16 | 6 Discordant_n # span count 17 | 7 JSR_n 18 | 8 perfectJSR_n 19 | 9 Junc_n 20 | 10 Position_Consist 21 | 11 Junction # extract junction count 22 | 12 Identity 23 | 13 Align_Len 24 | 14 Evalue 25 | 15 BitScore 26 | 27 | 0 TRPC4AP 28 | 1 MRPL45 29 | 2 20 30 | 3 17 31 | 4 -1 32 | 5 1 33 | 6 6 34 | 7 3 35 | 8 3 36 | 9 2 37 | 10 PARTIALLY 38 | 11 TRPC4AP:20:33665849_MRPL45:17:36478009,2|TRPC4AP:20:33665849_MRPL45:17:36476502,1 39 | 12 100.00 40 | 13 12 41 | 14 0.68 42 | 15 22.9 43 | 44 | =cut 45 | 46 | 47 | 48 | sub parse_fusion_result_file { 49 | my ($prada_file) = @_; 50 | 51 | my @fusions; 52 | 53 | open (my $fh, $prada_file) or die "Error, cannot open file $prada_file"; 54 | my $header = <$fh>; 55 | while (<$fh>) { 56 | chomp; 57 | my @x = split(/\t/); 58 | 59 | my $span_count = $x[6]; 60 | 61 | my $fusion_info = $x[11]; 62 | # CPNE1:20:34243124_PI3:20:43804502,2 63 | 64 | my @fusion_evidence = split(/\|/, $fusion_info); 65 | foreach my $f_info (@fusion_evidence) { 66 | 67 | $f_info =~ /^(\S+):([^\:]+):(\d+)_(\S+):([^\:]+):(\d+),(\d+)$/ or die "Error, cannot parse $f_info"; 68 | 69 | my $geneA = $1; 70 | my $chrA = $2; 71 | my $coordA = $3; 72 | 73 | my $geneB = $4; 74 | my $chrB = $5; 75 | my $coordB = $6; 76 | 77 | my $junc_reads = $7; 78 | 79 | 80 | my $struct = { 81 | geneA => $geneA, 82 | chrA => $chrA, 83 | coordA => $coordA, 84 | 85 | geneB => $geneB, 86 | chrB => $chrB, 87 | coordB => $coordB, 88 | 89 | span_reads => $span_count, 90 | junc_reads => $junc_reads, 91 | }; 92 | 93 | push (@fusions, $struct); 94 | } 95 | } 96 | 97 | 98 | close $fh; 99 | 100 | return(@fusions); 101 | } 102 | 103 | 1; #EOM 104 | 105 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/SOAPfuse_parser.pm: -------------------------------------------------------------------------------- 1 | package SOAPfuse_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | sub parse_fusion_result_file { 8 | my ($soap_file) = @_; 9 | 10 | =soapfuse_format 11 | 12 | 0 up_gene 13 | 1 up_chr 14 | 2 up_strand 15 | 3 up_Genome_pos 16 | 4 up_loc 17 | 5 dw_gene 18 | 6 dw_chr 19 | 7 dw_strand 20 | 8 dw_Genome_pos 21 | 9 dw_loc 22 | 10 Span_reads_num # S 23 | 11 Junc_reads_num # J 24 | 12 Fusion_Type 25 | 13 down_fusion_part_frame-shift_or_not 26 | 27 | 0 SLMO2 28 | 1 chr20 29 | 2 - 30 | 3 57610027 31 | 4 M 32 | 5 ATP5E 33 | 6 chr20 34 | 7 - 35 | 8 57605484 36 | 9 E 37 | 10 3 38 | 11 4 39 | 12 INTRACHR-SS-OGO-0GAP 40 | 13 NA 41 | 42 | 43 | =cut 44 | 45 | ; 46 | 47 | 48 | my @fusions; 49 | 50 | open (my $fh, $soap_file) or die "Error, cannot open file $soap_file"; 51 | my $header = <$fh>; 52 | while (<$fh>) { 53 | chomp; 54 | my @x = split(/\t/); 55 | 56 | my $geneA = $x[0]; 57 | my $geneB = $x[5]; 58 | 59 | $geneA =~ s/SOAPfuse.*//; 60 | $geneB =~ s/SOAPfuse.*//; 61 | 62 | my $struct = { 63 | geneA => $geneA, 64 | chrA => $x[1], 65 | coordA => $x[3], 66 | 67 | geneB => $geneB, 68 | chrB => $x[6], 69 | coordB => $x[8], 70 | 71 | span_reads => $x[10], 72 | junc_reads => $x[11], 73 | 74 | }; 75 | 76 | 77 | push (@fusions, $struct); 78 | } 79 | 80 | 81 | return(@fusions); 82 | } 83 | 84 | 85 | 1; #EOM 86 | 87 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/STARCHIP_parser.pm: -------------------------------------------------------------------------------- 1 | package STARCHIP_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =STARCHIP_format 9 | 10 | 0 Partner1 11 | 1 Partner2 12 | 2 SpanningReads 13 | 3 SplitReads 14 | 4 AvgAS 15 | 5 NearGene1 16 | 6 Distance1 17 | 7 NearGene2 18 | 8 Distance2 19 | 9 ConsensusSeq 20 | 21 | 0 chr9:133729451:- 22 | 1 chr22:23632600:- 23 | 2 21 24 | 3 18 25 | 4 87.4 26 | 5 ABL1 27 | 6 0 28 | 7 BCR 29 | 8 0 30 | 9 gggctctatgggtttctgaatgtcatcgtccactcagccactggatttaagcagagttcaaaagcccttcagcggccagtagcatctgactttgagcctcagggtctgagtgaagccgctcg 31 | 32 | =cut 33 | 34 | 35 | 36 | sub parse_fusion_result_file { 37 | my ($file) = @_; 38 | 39 | my @fusions; 40 | 41 | open (my $fh, $file) or die "Error, cannot open file $file"; 42 | my $header = <$fh>; 43 | while (<$fh>) { 44 | if (/^\#/) { next; } 45 | chomp; 46 | my ($chr_coords_A, $chr_coords_B, 47 | $span_count, $junc_count, 48 | $avgAS, 49 | $geneA, 50 | $distA, 51 | $geneB, 52 | $distB, 53 | $seq) = split(/\t/); 54 | 55 | my ($chrA, $coordA, $orientA) = split(/:/, $chr_coords_A); 56 | my ($chrB, $coordB, $orientB) = split(/:/, $chr_coords_B); 57 | 58 | my $struct = { 59 | geneA => $geneA, 60 | chrA => $chrA, 61 | coordA => $coordA, 62 | 63 | geneB => $geneB, 64 | chrB => $chrB, 65 | coordB => $coordB, 66 | 67 | span_reads => $span_count, 68 | junc_reads => $junc_count, 69 | }; 70 | 71 | push (@fusions, $struct); 72 | 73 | } 74 | 75 | close $fh; 76 | 77 | return(@fusions); 78 | } 79 | 80 | 81 | 1; #EOM 82 | 83 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/STARFusion_parser.pm: -------------------------------------------------------------------------------- 1 | package STARFusion_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | use FindBin; 8 | use DelimParser; 9 | 10 | =STARFusion_format 11 | 12 | 0 #FusionName 13 | 1 JunctionReadCount 14 | 2 SpanningFragCount 15 | 3 SpliceType 16 | 4 LeftGene 17 | 5 LeftBreakpoint 18 | 6 RightGene 19 | 7 RightBreakpoint 20 | 8 LargeAnchorSupport 21 | 9 LeftBreakDinuc 22 | 10 LeftBreakEntropy 23 | 11 RightBreakDinuc 24 | 12 RightBreakEntropy 25 | 26 | 0 THRA--AC090627.1 27 | 1 76 28 | 2 104 29 | 3 ONLY_REF_SPLICE 30 | 4 THRA^ENSG00000126351.8 31 | 5 chr17:38243106:+ 32 | 6 AC090627.1^ENSG00000235300.3 33 | 7 chr17:46371709:+ 34 | 8 YES_LDAS 35 | 9 GT 36 | 10 1.8892 37 | 11 AG 38 | 12 1.9656 39 | 40 | 41 | =cut 42 | 43 | 44 | 45 | sub parse_fusion_result_file { 46 | my ($starFusion_file) = @_; 47 | 48 | my @fusions; 49 | 50 | my $fh; 51 | 52 | if ($starFusion_file =~ /\.gz$/) { 53 | open($fh, "gunzip -c $starFusion_file | "); 54 | } 55 | else { 56 | open ($fh, $starFusion_file) or die "Error, cannot open file $starFusion_file"; 57 | } 58 | 59 | unless($fh) { 60 | confess "Error, no filehandle opened on $starFusion_file"; 61 | } 62 | 63 | my $tab_reader = new DelimParser::Reader($fh, "\t"); 64 | 65 | while(my $row = $tab_reader->get_row()) { 66 | 67 | chomp; 68 | 69 | my $fusion = $tab_reader->get_row_val($row, "#FusionName"); 70 | my $junction_reads = $tab_reader->get_row_val($row, "JunctionReadCount"); 71 | my $spanning_reads = $tab_reader->get_row_val($row, "SpanningFragCount"); 72 | my $splice_type = $tab_reader->get_row_val($row, "SpliceType"); 73 | my $fusion_gene_A = $tab_reader->get_row_val($row, "LeftGene"); 74 | my $chr_coords_A = $tab_reader->get_row_val($row, "LeftBreakpoint"); 75 | my $fusion_gene_B = $tab_reader->get_row_val($row, "RightGene"); 76 | my $chr_coords_B = $tab_reader->get_row_val($row, "RightBreakpoint"); 77 | 78 | my $rest; 79 | ($fusion_gene_A, $rest) = split(/\^/, $fusion_gene_A); 80 | ($fusion_gene_B, $rest) = split(/\^/, $fusion_gene_B); 81 | 82 | if ($fusion_gene_A eq $fusion_gene_B) { next; } # no self-fusions 83 | 84 | my ($chrA, $coordA, $orientA) = split(/:/, $chr_coords_A); 85 | my ($chrB, $coordB, $orientB) = split(/:/, $chr_coords_B); 86 | 87 | 88 | my $struct = { 89 | geneA => $fusion_gene_A, 90 | chrA => $chrA || ".", 91 | coordA => $coordA || ".", 92 | 93 | geneB => $fusion_gene_B, 94 | chrB => $chrB || ".", 95 | coordB => $coordB || ".", 96 | 97 | span_reads => $spanning_reads, 98 | junc_reads => $junction_reads, 99 | }; 100 | 101 | push (@fusions, $struct); 102 | 103 | } 104 | 105 | close $fh; 106 | 107 | return(@fusions); 108 | } 109 | 110 | 111 | 1; #EOM 112 | 113 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/TopHatFusion_parser.pm: -------------------------------------------------------------------------------- 1 | package TopHatFusion_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | sub parse_fusion_result_file { 8 | my ($tophatfusion_file) = @_; 9 | 10 | =tophatfusion_format 11 | 12 | from: http://tophat.cbcb.umd.edu/data/result.html#detail 13 | 14 | 1. Sample name in which a fusion is identified 15 | 2. Gene on the "left" side of the fusion 16 | 3. Chromosome ID on the left 17 | 4. Coordinates on the left 18 | 5. Gene on the "right" side 19 | 6. Chromosome ID on the right 20 | 7. Coordinates on the right 21 | 8. Number of spanning reads 22 | 9. Number of spanning mate pairs 23 | 10. Number of spanning mate pairs where one end spans a fusion 24 | If you follow the the 9th column, it shows coordinates "number1:number2" where one end is located at a distance of "number1" bases from the left genomic coordinate of a fusion and "number2" is similarly defined 25 | 26 | 0 sample_1 27 | 1 PNRC2 28 | 2 chr1 29 | 3 24289902 30 | 4 DGKD 31 | 5 chr2 32 | 6 234263228 33 | 7 147 # J 34 | 8 134 # S 35 | 9 102 36 | 10 953.07 37 | 38 | =cut 39 | 40 | ; 41 | 42 | 43 | my @fusions; 44 | 45 | open (my $fh, $tophatfusion_file) or die "Error, cannot open file $tophatfusion_file"; 46 | while (<$fh>) { 47 | chomp; 48 | my @x = split(/\t/); 49 | 50 | my $struct = { 51 | 52 | geneA => $x[1], 53 | chrA => $x[2], 54 | coordA => $x[3], 55 | 56 | geneB => $x[4], 57 | chrB => $x[5], 58 | coordB => $x[6], 59 | 60 | span_reads => $x[8], 61 | junc_reads => $x[7], 62 | 63 | }; 64 | 65 | push (@fusions, $struct); 66 | } 67 | 68 | close $fh; 69 | 70 | return(@fusions); 71 | } 72 | 73 | 74 | 1; #EOM 75 | 76 | -------------------------------------------------------------------------------- /benchmarking/FusionProgParsers/TrinityFusion_parser.pm: -------------------------------------------------------------------------------- 1 | package TrinityFusion_parser; 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | =TrinityFusion_format 9 | 10 | 0 #FusionName 11 | 1 JunctionReadCount 12 | 2 SpanningFragCount 13 | 3 trans_acc 14 | 4 trans_brkpt 15 | 5 LeftGene 16 | 6 LeftBreakpoint 17 | 7 RightGene 18 | 8 RightBreakpoint 19 | 9 SpliceType 20 | 10 annots 21 | 22 | 0 CWC22--AC104532.2 23 | 1 3105 24 | 2 9676 25 | 3 TRINITY_DN196_c0_g1_i1 26 | 4 284-283 27 | 5 CWC22 28 | 6 chr2:180835228 29 | 7 AC104532.2 30 | 8 chr19:5914396 31 | 9 ONLY_REF_SPLICE 32 | 10 ["INTERCHROMOSOMAL[chr2--chr19]"] 33 | 34 | =cut 35 | 36 | 37 | sub parse_fusion_result_file { 38 | my ($file) = @_; 39 | 40 | my @fusions; 41 | 42 | open (my $fh, $file) or die "Error, cannot open file $file"; 43 | while (<$fh>) { 44 | if (/^\#/) { next; } 45 | 46 | chomp; 47 | 48 | my @x = split("\t"); 49 | my $fusion_gene_A = $x[5]; 50 | my $fusion_gene_B = $x[7]; 51 | 52 | if ($fusion_gene_A eq $fusion_gene_B) { next; } # no self-fusions 53 | 54 | my $splice_type = $x[9]; 55 | unless ($splice_type eq "ONLY_REF_SPLICE") { next; } # otherwise, too many assembly artifacts 56 | 57 | my $chr_coords_A = $x[6]; 58 | my $chr_coords_B = $x[8]; 59 | 60 | my ($chrA, $coordA, $orientA) = split(/:/, $chr_coords_A); 61 | my ($chrB, $coordB, $orientB) = split(/:/, $chr_coords_B); 62 | 63 | my $junction_reads = $x[1]; 64 | my $spanning_reads = $x[2]; 65 | 66 | my $struct = { 67 | geneA => $fusion_gene_A, 68 | chrA => $chrA || ".", 69 | coordA => $coordA || ".", 70 | 71 | geneB => $fusion_gene_B, 72 | chrB => $chrB || ".", 73 | coordB => $coordB || ".", 74 | 75 | span_reads => $spanning_reads, 76 | junc_reads => $junction_reads, 77 | }; 78 | 79 | push (@fusions, $struct); 80 | 81 | } 82 | 83 | close $fh; 84 | 85 | return(@fusions); 86 | } 87 | 88 | 89 | 1; #EOM 90 | 91 | -------------------------------------------------------------------------------- /benchmarking/aggregate_peak_F1_stats.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(ggplot2) 4 | library(dplyr) 5 | library(tidyr) 6 | 7 | args<-commandArgs(TRUE) 8 | if (length(args) != 3) { 9 | stop("require param: maxF1_file_suffix low_val high_val") # example: okPara_ignoreUnsure.results.scored.ROC.tpr_ppv_at_maxF1.dat 3 13 10 | } 11 | 12 | file_suffix = args[1] 13 | low_val = as.numeric(args[2]) 14 | high_val = as.numeric(args[3]) 15 | 16 | 17 | dfs_list = list() 18 | 19 | for (minprogs in seq(low_val, high_val)) { 20 | dat_file = sprintf("__min_%d_agree/min_%d.%s", minprogs, minprogs, file_suffix) 21 | data = read.table(dat_file) 22 | 23 | data$min_progs_agree = minprogs 24 | 25 | dfs_list[[as.character(minprogs)]] <- data 26 | } 27 | 28 | all_data = do.call(rbind, dfs_list) 29 | 30 | write.table(all_data, file=sprintf("%s.consolidated.dat", file_suffix), quote=F, sep="\t") 31 | 32 | pdf(sprintf("%s.consolidated.scatters.pdf", file_suffix), height=5, width=11) 33 | 34 | p = all_data %>% ggplot(aes(x=PPV, y=TPR, color=prog, shape=prog)) + geom_point() + scale_shape_manual(values=rep(seq(0,25), 2)) + facet_wrap(~min_progs_agree) 35 | 36 | plot(p) 37 | 38 | dev.off() 39 | -------------------------------------------------------------------------------- /benchmarking/all_TP_FP_FN_to_ROC.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | 7 | my $usage = "\n\n\tusage: $0 summary.TP_FP_FN\n\n"; 8 | 9 | my $tp_fp_fn_file = $ARGV[0] or die $usage; 10 | 11 | main: { 12 | 13 | my %data = &parse_file($tp_fp_fn_file); 14 | 15 | print join("\t", "prog", "min_sum_frags", "TP", "FP", "FN", "TPR", "PPV", "F1") . "\n"; 16 | 17 | foreach my $prog (keys %data) { 18 | my $progdata_href = $data{$prog}; 19 | 20 | &make_ROC($prog, $progdata_href); 21 | } 22 | 23 | exit(0); 24 | } 25 | 26 | 27 | #### 28 | sub make_ROC { 29 | my ($prog_name, $progdata_href) = @_; 30 | 31 | my %data = %$progdata_href; 32 | 33 | my @TP_fusions = ($data{TP}) ? @{$data{TP}} : (); 34 | my @FP_fusions = ($data{FP}) ? @{$data{FP}} : (); 35 | my @FN_fusions = ($data{FN}) ? @{$data{FN}} : (); 36 | 37 | my $num_truth_fusions = scalar(@TP_fusions) + scalar(@FN_fusions); 38 | my $num_total_FP = scalar(@FP_fusions); 39 | 40 | my @uniq_vals = sort {$a<=>$b} &get_unique(@TP_fusions, @FP_fusions); 41 | 42 | for (my $i = 0; $i < $#uniq_vals; $i++) { 43 | 44 | my $min_val = $uniq_vals[$i]; 45 | 46 | @TP_fusions = grep { $_ >= $min_val } @TP_fusions; 47 | 48 | @FP_fusions = grep { $_ >= $min_val } @FP_fusions; 49 | 50 | my $num_TP = scalar(@TP_fusions); 51 | my $num_FP = scalar(@FP_fusions); 52 | my $num_FN = $num_truth_fusions - $num_TP; 53 | 54 | my $TPR = sprintf("%.2f", $num_TP / $num_truth_fusions); # True Positive Rate 55 | 56 | my $FDR = sprintf("%.2f", $num_FP / ($num_FP + $num_TP)); # False Discovery Rate 57 | 58 | my $PPV = 1 - $FDR; # Positive Predictive Value 59 | 60 | 61 | my $Sn = $TPR; # using true positive rate as 'sensitivity' measure 62 | my $Sp = $PPV; # using positive predictive value as 'specificity' measure 63 | 64 | 65 | my $F1 = "NA"; 66 | eval { 67 | $F1 = sprintf("%.3f", 2 * $Sn * $Sp / ($Sn + $Sp) ); 68 | }; 69 | 70 | print join("\t", $prog_name, $min_val, $num_TP, $num_FP, $num_FN, $TPR, $PPV, $F1) . "\n"; 71 | } 72 | 73 | return; 74 | } 75 | 76 | 77 | #### 78 | sub parse_file { 79 | my ($fusions_file) = @_; 80 | my %data; 81 | 82 | 83 | my %seen; 84 | 85 | open (my $fh, $fusions_file) or die $!; 86 | 87 | my $header = <$fh>; 88 | unless ($header =~ /^pred_result/) { 89 | die "Error, not reading expected header format for $fusions_file"; 90 | } 91 | while (<$fh>) { 92 | chomp; 93 | my @x = split(/\t/); 94 | unless (scalar @x == 10) { 95 | die "Error, did not parse 10 fields from row: $_"; 96 | } 97 | my ($pred_type, $sample_name, $progname, $fusion, $J, $S, 98 | $mapped_gencode_A, $mapped_gencode_B, $explanation, $selected_fusion) = @x; 99 | 100 | unless ($pred_type =~ /^(TP|FP|FN)$/) { next; } 101 | 102 | if ($selected_fusion ne '.') { 103 | $fusion = $selected_fusion; 104 | } 105 | 106 | my $fusion_token = join("::", $progname, $sample_name, $fusion); 107 | 108 | if ($seen{$fusion_token}) { 109 | die "Error, already processed fusion [$fusion_token], and these should be unique entries in this file $fusions_file"; 110 | } 111 | $seen{$fusion_token} = 1 ; 112 | 113 | my $val = $J + $S; 114 | 115 | push (@{$data{$progname}->{$pred_type}}, $val); 116 | } 117 | close $fh; 118 | 119 | return(%data); 120 | 121 | } 122 | 123 | #### 124 | sub get_unique { 125 | my (@vals) = @_; 126 | 127 | my %v = map { + $_ => 1 } @vals; 128 | 129 | return(keys %v); 130 | } 131 | -------------------------------------------------------------------------------- /benchmarking/all_TP_FP_FN_to_ROC.vary_minF_minS.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | 7 | my $usage = "\n\n\tusage: $0 summary.TP_FP_FN\n\n"; 8 | 9 | my $tp_fp_fn_file = $ARGV[0] or die $usage; 10 | 11 | main: { 12 | 13 | my %data = &parse_file($tp_fp_fn_file); 14 | 15 | print join("\t", "prog", "min_J", "min_S", "min_sum_frags", "TP", "FP", "FN", "TPR", "PPV", "F1") . "\n"; 16 | 17 | for (my $min_J = 0; $min_J <= 5; $min_J++) { 18 | 19 | for (my $min_S = 0; $min_S <= 5; $min_S++) { 20 | 21 | 22 | foreach my $prog (keys %data) { 23 | my $progdata_href = $data{$prog}; 24 | 25 | &make_ROC($prog, $progdata_href, $min_J, $min_S); 26 | } 27 | } 28 | } 29 | exit(0); 30 | } 31 | 32 | use Data::Dumper; 33 | 34 | #### 35 | sub make_ROC { 36 | my ($prog_name, $progdata_href, $min_J, $min_S) = @_; 37 | 38 | my %data = %$progdata_href; 39 | 40 | my @TP_fusions = ($data{TP}) ? @{$data{TP}} : (); 41 | my @FP_fusions = ($data{FP}) ? @{$data{FP}} : (); 42 | my @FN_fusions = ($data{FN}) ? @{$data{FN}} : (); 43 | 44 | my $num_truth_fusions = scalar(@TP_fusions) + scalar(@FN_fusions); 45 | my $num_total_FP = scalar(@FP_fusions); 46 | 47 | my @uniq_vals = sort {$a<=>$b} &get_unique(@TP_fusions, @FP_fusions); 48 | 49 | @uniq_vals = grep { $_ >= $min_J + $min_S } @uniq_vals; 50 | 51 | @TP_fusions = grep { $_->{J} >= $min_J && $_->{S} >= $min_S} @TP_fusions; 52 | @FP_fusions = grep { $_->{J} >= $min_J && $_->{S} >= $min_S} @FP_fusions; 53 | 54 | for (my $i=0; $i < $#uniq_vals; $i++) { 55 | 56 | my $min_val = $uniq_vals[$i]; 57 | 58 | @TP_fusions = grep { $_->{sum} >= $min_val } @TP_fusions; 59 | 60 | @FP_fusions = grep { $_->{sum} >= $min_val } @FP_fusions; 61 | 62 | my $num_TP = scalar(@TP_fusions); 63 | my $num_FP = scalar(@FP_fusions); 64 | my $num_FN = $num_truth_fusions - $num_TP; 65 | 66 | my $TPR = sprintf("%.2f", $num_TP / $num_truth_fusions); # True Positive Rate 67 | 68 | my $FDR = sprintf("%.2f", $num_FP / ($num_FP + $num_TP)); # False Discovery Rate 69 | 70 | my $PPV = 1 - $FDR; # Positive Predictive Value 71 | 72 | 73 | my $Sn = $TPR; # using true positive rate as 'sensitivity' measure 74 | my $Sp = $PPV; # using positive predictive value as 'specificity' measure 75 | 76 | 77 | my $F1 = "NA"; 78 | eval { 79 | $F1 = sprintf("%.3f", 2 * $Sn * $Sp / ($Sn + $Sp) ); 80 | }; 81 | 82 | print join("\t", $prog_name, $min_J, $min_S, $min_val, $num_TP, $num_FP, $num_FN, $TPR, $PPV, $F1) . "\n"; 83 | } 84 | 85 | return; 86 | } 87 | 88 | 89 | #### 90 | sub parse_file { 91 | my ($fusions_file) = @_; 92 | my %data; 93 | 94 | my %seen; 95 | 96 | open (my $fh, $fusions_file) or die $!; 97 | 98 | my $header = <$fh>; 99 | unless ($header =~ /^pred_result/) { 100 | die "Error, not reading expected header format for $fusions_file"; 101 | } 102 | while (<$fh>) { 103 | chomp; 104 | my @x = split(/\t/); 105 | unless (scalar @x == 10) { 106 | die "Error, did not parse 10 fields from row: $_"; 107 | } 108 | my ($pred_type, $sample_name, $progname, $fusion, $J, $S, 109 | $mapped_gencode_A, $mapped_gencode_B, $explanation, $selected_fusion) = @x; 110 | 111 | unless ($pred_type =~ /^(TP|FP|FN)$/) { next; } 112 | 113 | if ($selected_fusion ne '.') { 114 | $fusion = $selected_fusion; 115 | } 116 | 117 | my $fusion_token = join("::", $progname, $sample_name, $fusion); 118 | 119 | if ($seen{$fusion_token}) { 120 | die "Error, already processed fusion [$fusion_token], and these should be unique entries in this file $fusions_file"; 121 | } 122 | $seen{$fusion_token} = 1 ; 123 | 124 | my $val = $J + $S; 125 | 126 | push (@{$data{$progname}->{$pred_type}}, { sum => $val, 127 | J => $J, 128 | S => $S } ); 129 | } 130 | close $fh; 131 | 132 | return(%data); 133 | 134 | } 135 | 136 | #### 137 | sub get_unique { 138 | my (@vals) = @_; 139 | 140 | my %v = map { + $_->{sum} => 1 } @vals; 141 | 142 | return(keys %v); 143 | } 144 | -------------------------------------------------------------------------------- /benchmarking/all_TP_FP_FN_to_ROC.vary_minF_minS.plot.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors = FALSE) 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | parser = ArgumentParser() 8 | 9 | parser$add_argument("--datafile", help="input data file", required=TRUE, nargs=1) 10 | parser$add_argument("--progname", help="fusion prog name", required=TRUE, nargs=1) 11 | args = parser$parse_args() 12 | 13 | 14 | library(tidyverse) 15 | 16 | pdf(paste0(args$progname, ".plot.pdf")) 17 | data = read.table(args$datafile, header=T) 18 | 19 | data = data %>% filter( min_sum_frags <= 10 & min_J <= 1 & min_S <= 1) 20 | 21 | data = data %>% unite(col='JS', min_J, min_S, sep=',') 22 | 23 | data = data %>% filter(prog==args$progname & JS != "1,1") 24 | 25 | data %>% ggplot(aes(x=min_sum_frags, y=F1, color=JS)) + geom_point() + geom_line() 26 | 27 | message("see plot file: ", paste0(args$progname, ".plot.pdf")) 28 | 29 | quit(save = "no", status = 0, runLast = FALSE) 30 | -------------------------------------------------------------------------------- /benchmarking/calc_PR.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import math 5 | import argparse 6 | 7 | # contributed by Bo Li, mod by bhaas 8 | 9 | ntruth = 0 10 | 11 | 12 | def main(): 13 | 14 | parser = argparse.ArgumentParser(description="computes Precision-Recall Curve and AUC values", formatter_class=argparse.ArgumentDefaultsHelpFormatter) 15 | 16 | parser.add_argument("--in_ROC", dest="in_ROC_file", type=str, default="", required=True, help="input ROC file") 17 | 18 | parser.add_argument("--out_PR", dest="out_PR_file", type=str, default="", required=True, help="output PR file") 19 | 20 | parser.add_argument("--min_read_support", dest="min_read_support", type=int, default=0, help="minimum read support for including data point in AUC computation") 21 | 22 | args = parser.parse_args() 23 | 24 | 25 | ntotal = 25000**2 # all possible gene pairs, rough approx. 26 | prog = "" 27 | ltp = lfp = 0 28 | auc = 0.0 29 | 30 | with open(args.in_ROC_file) as fin, open(args.out_PR_file, "w") as fout: 31 | # write header 32 | fout.write("{}\t{}\t{}\t{}\n".format('prog', 'recall', 'precision', 'actual')) 33 | next(fin) # skip header line 34 | for line in fin: 35 | fields = line.strip().split() 36 | 37 | min_frags = int(fields[1]) 38 | if (min_frags < args.min_read_support): 39 | continue 40 | 41 | tp = int(fields[2]) 42 | fp = int(fields[3]) 43 | fn = int(fields[4]) 44 | 45 | global ntruth 46 | ntruth = tp + fn 47 | 48 | if prog != fields[0]: 49 | # prog switch 50 | if prog != "": 51 | # process last line of prev prog and report 52 | auc += output(fout, prog, 0, 0, ltp, lfp) 53 | print("{}\t{:.2f}".format(prog, auc)) 54 | # first line of next prog, reinit vals 55 | prog = fields[0] 56 | ltp = ntruth 57 | lfp = ntotal - ntruth 58 | auc = output(fout, prog, ltp, lfp) 59 | 60 | # add to auc 61 | auc += output(fout, prog, tp, fp, ltp, lfp) 62 | ltp = tp 63 | lfp = fp 64 | 65 | if prog != "": 66 | # last line of file, process last prog results 67 | auc += output(fout, prog, 0, 0, ltp, lfp) 68 | print("{}\t{:.2f}".format(prog, auc)) 69 | 70 | 71 | sys.exit(0) 72 | 73 | 74 | 75 | 76 | def output(fout, prog, ntp, nfp, nltp = -1, nlfp = -1): 77 | """ return delta auc """ 78 | 79 | dauc = 0.0 80 | if nltp < 0: 81 | recall = 1.0 82 | precision = ntp * 1.0 / (ntp + nfp) 83 | fout.write("{}\t{}\t{}\t0\n".format(prog, recall, precision)) 84 | elif ntp == 0 and nfp == 0: 85 | assert nltp >= 0 and nlfp >= 0 and nltp + nlfp > 0 86 | lrecall = nltp * 1.0 / ntruth 87 | lprecision = nltp * 1.0 / (nltp + nlfp) 88 | recall = 0.0 89 | precision = lprecision 90 | if lrecall > 0.0: 91 | fout.write("{}\t{}\t{}\t0\n".format(prog, recall, precision)) 92 | #dauc = 0.5 * lrecall * lprecision 93 | dauc = lrecall * lprecision 94 | else: 95 | recall = ntp * 1.0 / ntruth 96 | precision = ntp * 1.0 / (ntp + nfp) 97 | 98 | if nltp > ntp: 99 | lrecall = nltp * 1.0 / ntruth 100 | lprecision = nltp * 1.0 / (nltp + nlfp) 101 | 102 | rate = (nlfp - nfp) * 1.0 / (nltp - ntp) 103 | trecall = lrecall - 0.01 104 | x = nltp - ntp - 0.01 * ntruth 105 | tlrecall = lrecall 106 | tlprecision = lprecision 107 | while trecall > recall: 108 | trecall = (ntp + x) * 1.0 / ntruth 109 | tprecision = (ntp + x) * 1.0 / (ntp + x + nfp + rate * x) 110 | fout.write("{}\t{}\t{}\t0\n".format(prog, trecall, tprecision)) 111 | dauc += 0.5 * (tlprecision + tprecision) * (tlrecall - trecall) 112 | 113 | tlrecall = trecall 114 | tlprecision = tprecision 115 | trecall -= 0.01 116 | x -= 0.01 * ntruth 117 | 118 | dauc += 0.5 * (tlprecision + precision) * (tlrecall - recall) 119 | 120 | fout.write("{}\t{}\t{}\t1\n".format(prog, recall, precision)) 121 | 122 | return dauc 123 | 124 | 125 | 126 | 127 | if __name__ == "__main__": 128 | main() 129 | -------------------------------------------------------------------------------- /benchmarking/collect_preds.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use FindBin; 7 | use lib ("$FindBin::Bin/../PerlLib"); 8 | 9 | 10 | my $usage = "usage: $0 fusion_result_file_listing.dat\n\n"; 11 | 12 | my $fusion_result_file_listing = $ARGV[0] or die $usage; 13 | 14 | my $fusion_prog_parser_lib_dir = "$FindBin::Bin/FusionProgParsers"; 15 | 16 | my %prog_type_to_file_parser = ( 17 | 'ChimPipe' => 'ChimPipe_parser', 18 | 'ChimeraScan' => 'ChimeraScan_parser', 19 | 'deFuse' => 'DEFUSE_parser', 20 | 'EricScript' => 'EricScript_parser', 21 | 22 | 'Fusion.*Catcher' => 'FusionCatcher_parser', 23 | 'FC_V0997c' => 'FusionCatcher_parser', 24 | 25 | 'FusionHunter' => 'FusionHunter_parser', 26 | 27 | 'FusionInspector' => 'FusionInspector_parser', 28 | 'FI-*' => 'FusionInspector_parser', 29 | 30 | 'InFusion' => 'InFusion_parser', 31 | 32 | 'JAFFA-Assembly' => 'JAFFA_parser', 33 | 'JAFFA-Direct' => 'JAFFA_parser', 34 | 'JAFFA-Hybrid' => 'JAFFA_parser', 35 | 36 | 'MapSplice' => 'MapSplice_parser', 37 | 38 | 'nFuse' => 'NFuse_parser', 39 | 40 | 'PRADA' => 'PRADA_parser', 41 | 42 | 'SOAP-fuse' => 'SOAPfuse_parser', 43 | 44 | 'STAR_FUSION' => 'STARFusion_parser', 45 | 'starfusion' => 'STARFusion_parser', 46 | 'STARF' => 'STARFusion_parser', 47 | 48 | 'TopHat-Fusion' => 'TopHatFusion_parser', 49 | 50 | 'PIZZLY' => 'PIZZLY_parser', 51 | 52 | 'ARRIBA' => 'ARRIBA_parser', 53 | 'ARRIBA_hc' => 'ARRIBA_hc_parser', 54 | 55 | 'STARCHIP' => 'STARCHIP_parser', 56 | 'STARChip_csm10' => 'STARCHIP_parser', 57 | 'STARCHIP_csm10' => 'STARCHIP_parser', 58 | 'STARCHIP_csm10_pG_Apr302019' => 'STARCHIP_parser', 59 | 'STARCHIP_csm10_pGm2_May012019' => 'STARCHIP_parser', 60 | 61 | #'TrinityFusion' => 'TrinityFusion_parser', 62 | #'TrinityFusion-D' => 'TrinityFusion_parser', 63 | #'TrinityFusion-C' => 'TrinityFusion_parser', 64 | #'TrinityFusion-UC' => 'TrinityFusion_parser', 65 | 'TRINITY.*FUSION' => 'TrinityFusion_parser', 66 | 67 | 'STARSEQR' => 'STARSEQR_parser' 68 | 69 | ); 70 | 71 | 72 | 73 | foreach my $module (values %prog_type_to_file_parser) { 74 | my $module_path = "$fusion_prog_parser_lib_dir/$module.pm"; 75 | 76 | require($module_path); 77 | 78 | } 79 | 80 | 81 | main: { 82 | 83 | 84 | # print header 85 | print join("\t", "sample", "prog", "fusion", "J", "S") . "\n"; 86 | 87 | open(my $fh, $fusion_result_file_listing) or die "Error, cannot open file $fusion_result_file_listing"; 88 | while (<$fh>) { 89 | chomp; 90 | my ($sample_name, $prog_name, $result_file) = split(/\t/); 91 | 92 | 93 | 94 | my $parser_module; 95 | 96 | if (exists $prog_type_to_file_parser{$prog_name}) { 97 | $parser_module = $prog_type_to_file_parser{$prog_name}; 98 | } 99 | else { 100 | ## use regex to find parser 101 | foreach my $name (keys %prog_type_to_file_parser) { 102 | if ($prog_name =~ /$name/i) { 103 | $parser_module = $prog_type_to_file_parser{$name}; 104 | last; 105 | } 106 | } 107 | } 108 | 109 | unless (defined $parser_module) { 110 | 111 | die "Error, no parser for prog [$prog_name] "; 112 | } 113 | 114 | my $parser_function = $parser_module . "::" . "parse_fusion_result_file"; 115 | 116 | no strict 'refs'; 117 | my @fusions = &$parser_function($result_file); 118 | 119 | &add_sum_fusions(\@fusions); 120 | 121 | @fusions = reverse sort { $a->{sum_frags} <=> $b->{sum_frags} } @fusions; 122 | 123 | foreach my $fusion (@fusions) { 124 | 125 | my $fusion_name = join("--", $fusion->{geneA}, $fusion->{geneB}); 126 | 127 | my $junc_count = $fusion->{junc_reads}; 128 | my $span_count = $fusion->{span_reads}; 129 | 130 | 131 | print join("\t", $sample_name, $prog_name, $fusion_name, $junc_count, $span_count) . "\n"; 132 | } 133 | 134 | } 135 | close $fh; 136 | 137 | 138 | exit(0); 139 | } 140 | 141 | #### 142 | sub add_sum_fusions { 143 | my ($fusions_aref) = @_; 144 | 145 | foreach my $fusion (@$fusions_aref) { 146 | 147 | $fusion->{sum_frags} = $fusion->{junc_reads} + $fusion->{span_reads}; 148 | 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /benchmarking/collected_preds_to_fusion_prog_support_listing.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "\n\n\tusage: $0 fusion_preds.collected progs_to_consider.txt\n\n"; 7 | 8 | my $preds_file = $ARGV[0] or die $usage; 9 | my $progs_to_consider_file = $ARGV[1] or die $usage; 10 | 11 | main: { 12 | 13 | my %progs_to_consider; 14 | { 15 | open(my $fh, $progs_to_consider_file) or die "Error, cannot open file $progs_to_consider_file"; 16 | while (<$fh>) { 17 | s/^\s+|\s+$//g; 18 | my $prog = $_; 19 | $progs_to_consider{$prog} = 1; 20 | } 21 | close $fh; 22 | } 23 | 24 | my %fusion_to_prog; 25 | 26 | open (my $fh, $preds_file) or die "Error, cannot open file $preds_file"; 27 | my $header = <$fh>; 28 | unless ($header =~ /^sample\tprog/) { 29 | die "Error, missing expected header in $preds_file"; 30 | } 31 | while (<$fh>) { 32 | chomp; 33 | my ($sample_name, $prog, $fusion_name, $junc_support, $frag_support) = split(/\t/); 34 | 35 | unless ($progs_to_consider{$prog}) { next; } 36 | 37 | $fusion_name = uc $fusion_name; 38 | 39 | my ($genesA, $genesB) = split(/--/, $fusion_name); 40 | 41 | foreach my $geneA (split(/,/, $genesA)) { 42 | foreach my $geneB (split(/,/, $genesB)) { 43 | 44 | my $fusion_name_use = join("--", $geneA, $geneB); 45 | $fusion_name = "$sample_name|$fusion_name_use"; 46 | 47 | $fusion_to_prog{$fusion_name}->{$prog} = 1; 48 | } 49 | } 50 | } 51 | close $fh; 52 | 53 | my @fusion_structs; 54 | foreach my $fusion_name (keys %fusion_to_prog) { 55 | my $progs_href = $fusion_to_prog{$fusion_name}; 56 | 57 | my @prognames = sort keys %$progs_href; 58 | my $num_progs = scalar(@prognames); 59 | 60 | push (@fusion_structs, { fusion_name => $fusion_name, 61 | prognames => \@prognames, 62 | count => $num_progs, 63 | } ); 64 | 65 | } 66 | 67 | @fusion_structs = reverse sort {$a->{count} <=> $b->{count} } @fusion_structs; 68 | 69 | foreach my $fusion_struct (@fusion_structs) { 70 | print join("\t", $fusion_struct->{fusion_name}, 71 | join(",", @{$fusion_struct->{prognames}}), 72 | $fusion_struct->{count}, 73 | ) . "\n"; 74 | } 75 | 76 | exit(0); 77 | } 78 | 79 | -------------------------------------------------------------------------------- /benchmarking/compare_A_vs_B_scored_preds.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use FindBin; 7 | use lib ("$FindBin::Bin/../PerlLib"); 8 | use DelimParser; 9 | 10 | my $usage = "usage: $0 pred_results.scored progA progB\n\n"; 11 | 12 | my $scored_preds_file = $ARGV[0] or die $usage; 13 | my $progA = $ARGV[1] or die $usage; 14 | my $progB = $ARGV[2] or die $usage; 15 | 16 | 17 | 18 | main: { 19 | 20 | my %TP_preds; 21 | my %FP_preds; 22 | 23 | open(my $fh, $scored_preds_file) or die $!; 24 | 25 | my $delim_parser = new DelimParser::Reader($fh, "\t"); 26 | 27 | while (my $row = $delim_parser->get_row()) { 28 | 29 | my $prog = $row->{prog}; 30 | if ($prog eq "$progA" || $prog eq "$progB") { 31 | 32 | my $J = $row->{J}; 33 | my $S = $row->{S}; 34 | 35 | my $pred_result = $row->{pred_result}; 36 | if ($pred_result eq "TP") { 37 | my $selected_fusion = $row->{selected_fusion}; 38 | $TP_preds{$selected_fusion}->{$prog} = "($J,$S)"; 39 | } 40 | elsif ($pred_result eq "FP") { 41 | my $fusion = $row->{fusion}; 42 | $FP_preds{$fusion}->{$prog} = "($J,$S)"; 43 | } 44 | } 45 | 46 | } 47 | 48 | print "#pred_result\tfusion\t$progA\t$progB\n"; 49 | foreach my $fusion (keys %TP_preds) { 50 | my $progA_results = $TP_preds{$fusion}->{$progA} || "."; 51 | my $progB_results = $TP_preds{$fusion}->{$progB} || "."; 52 | 53 | print join("\t", "TP", $fusion, $progA_results, $progB_results) . "\n"; 54 | } 55 | 56 | print "#pred_result\tfusion\t$progA\t$progB\n"; 57 | foreach my $fusion (keys %FP_preds) { 58 | my $progA_results = $FP_preds{$fusion}->{$progA} || "."; 59 | my $progB_results = $FP_preds{$fusion}->{$progB} || "."; 60 | 61 | print join("\t", "FP", $fusion, $progA_results, $progB_results) . "\n"; 62 | } 63 | 64 | 65 | exit(0); 66 | } 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /benchmarking/define_truth_n_unsure_set.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use File::Basename; 6 | 7 | my $usage = "\n\n\tusage: $0 preds.collected.byProg min_truth\n\n"; 8 | 9 | my $preds_collected = $ARGV[0] or die $usage; 10 | my $min_truth = $ARGV[1] or die $usage; 11 | 12 | main: { 13 | 14 | open(my $fh, $preds_collected) or die "Error, cannot open file $preds_collected"; 15 | 16 | my $out_basename = basename($preds_collected); 17 | 18 | open(my $ofh_truth, ">$out_basename.min_${min_truth}.truth_set") or die $!; 19 | open(my $ofh_unsure, ">$out_basename.min_${min_truth}.unsure_set") or die $!; 20 | 21 | while (<$fh>) { 22 | chomp; 23 | my ($fusion_name, $prog_list, $prog_count) = split(/\t/); 24 | if ($prog_count >= $min_truth) { 25 | print $ofh_truth "$fusion_name\n"; 26 | } 27 | elsif ($prog_count > 1) { 28 | print $ofh_unsure "$fusion_name\n"; 29 | } 30 | } 31 | 32 | close $fh; 33 | close $ofh_truth; 34 | close $ofh_unsure; 35 | 36 | exit(0); 37 | } 38 | -------------------------------------------------------------------------------- /benchmarking/filter_collected_preds.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "\n\n\tusage: $0 preds.collected.wAnnot\n\n"; 7 | 8 | my $preds_file = $ARGV[0] or die $usage; 9 | 10 | open (my $fh, $preds_file) or die "Error, cannot open file $preds_file"; 11 | while(<$fh>) { 12 | my $line = $_; 13 | my @x = split(/\t/); 14 | my $fusion_name = $x[2]; 15 | my $annot = $x[7]; 16 | 17 | if ($fusion_name =~ /(^HLA\-)|\-HLA\-/ 18 | || 19 | ($annot && 20 | ($annot =~ /chrM:/i 21 | || 22 | $annot =~ /NEIGHBOR/ 23 | || 24 | $annot =~ /BLAST/ 25 | || 26 | $annot =~ /GTEx|BodyMap|DGD_PARALOGS|HGNC_GENEFAM|Greger_Normal|Babiceanu_Normal|ConjoinG/ 27 | || 28 | $fusion_name =~ /IG[HKL].*--IG[HKL]/ 29 | ) 30 | ) 31 | ) 32 | { 33 | next; 34 | } 35 | print $line; 36 | } 37 | 38 | exit(0); 39 | 40 | -------------------------------------------------------------------------------- /benchmarking/fusion_preds_sensitivity_vs_expr.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use FindBin; 6 | 7 | use POSIX; 8 | use Data::Dumper; 9 | 10 | my $usage = "\n\n\tusage: $0 preds.collected.scored fusion_TPM_values.dat\n\n"; 11 | 12 | 13 | my $preds_file = $ARGV[0] or die $usage; 14 | my $fusion_TPMs_file = $ARGV[1] or die $usage; 15 | 16 | my $max_bin = 10; 17 | 18 | main: { 19 | 20 | my %fusion_to_expr_bin = &parse_fusions_into_expr_bins($fusion_TPMs_file); 21 | 22 | my %count_ref_fusions_in_bin; 23 | foreach my $bin (values %fusion_to_expr_bin) { 24 | $count_ref_fusions_in_bin{$bin}++; 25 | } 26 | 27 | #print Dumper(\%count_ref_fusions_in_bin); 28 | print STDERR "counts of ref fusions per bin:\n"; 29 | for my $bin (1..$max_bin) { 30 | my $count = $count_ref_fusions_in_bin{$bin} || 0; 31 | print STDERR join("\t", $bin, $count) . "\n"; 32 | } 33 | 34 | my %method_to_fusion_pred = &parse_fusion_predictions($preds_file); 35 | 36 | 37 | ## for each method, determine the counts in each bin. 38 | 39 | print "#\t" . join("\t", (1..$max_bin)) . "\n"; 40 | foreach my $method (keys %method_to_fusion_pred) { 41 | 42 | my $fusion_preds_href = $method_to_fusion_pred{$method}; 43 | my %bin_counts; 44 | 45 | foreach my $fusion_pred (keys %$fusion_preds_href) { 46 | 47 | my $bin = $fusion_to_expr_bin{$fusion_pred}; 48 | unless (defined $bin) { 49 | print STDERR "Error, fusion \"$fusion_pred\" not assigned to an expression bin.\n"; 50 | next; 51 | } 52 | 53 | $bin_counts{$bin}++; 54 | 55 | } 56 | 57 | print "$method"; 58 | for my $bin (1..$max_bin) { 59 | my $count = $bin_counts{$bin} || 0; 60 | my $num_ref_fusions_in_bin = $count_ref_fusions_in_bin{$bin}; 61 | 62 | my $sensitivity = sprintf("%.2f", $count / $num_ref_fusions_in_bin * 100); 63 | print "\t$sensitivity"; 64 | } 65 | print "\n"; 66 | } 67 | 68 | exit(0); 69 | 70 | 71 | } 72 | 73 | #### 74 | sub parse_fusion_predictions { 75 | my ($preds_file) = @_; 76 | 77 | my %method_to_preds; 78 | 79 | open (my $fh, $preds_file) or die $!; 80 | while (<$fh>) { 81 | my $line = $_; 82 | chomp; 83 | if (/^\#/) { next; } 84 | my @x = split(/\t/); 85 | 86 | my $pred_class = $x[0]; 87 | unless ($pred_class eq "TP") { next; } 88 | 89 | my $fusion_name = $x[3]; 90 | my $sample = $x[2]; 91 | my $method = $x[1]; 92 | 93 | if ($line =~ /chr_mapping_to_first_encounter_of_TP_\S+\|(\S+--\S+)/) { 94 | $fusion_name = $1; 95 | } 96 | 97 | $fusion_name = uc $fusion_name; 98 | my ($geneA, $geneB) = sort split(/--/, $fusion_name); 99 | $fusion_name = "$geneA--$geneB"; 100 | 101 | $fusion_name = "$sample|$fusion_name"; 102 | 103 | $method_to_preds{$method}->{$fusion_name} = 1; 104 | } 105 | close $fh; 106 | 107 | 108 | return(%method_to_preds); 109 | } 110 | 111 | 112 | #### 113 | sub parse_fusions_into_expr_bins { 114 | my ($fusion_tpm_file) = @_; 115 | 116 | my %fusion_to_TPM_bin; 117 | 118 | open (my $fh, $fusion_tpm_file) or die $!; 119 | while (<$fh>) { 120 | chomp; 121 | my ($sample, $fusion, $TPM) = split(/\t/); 122 | 123 | $fusion = uc($fusion); 124 | my ($geneA, $geneB) = sort split(/--/, $fusion); 125 | 126 | $fusion = "$geneA--$geneB"; 127 | 128 | my $bin = ceil(log($TPM+0.01)/log(2)); 129 | 130 | if ($bin < 1) { 131 | $bin = 1; 132 | } 133 | elsif ($bin > $max_bin) { 134 | $bin = $max_bin; 135 | } 136 | 137 | my $fusion_name = join("|", $sample, $fusion); 138 | 139 | $fusion_to_TPM_bin{$fusion_name} = $bin; 140 | } 141 | close $fh; 142 | 143 | return(%fusion_to_TPM_bin); 144 | } 145 | -------------------------------------------------------------------------------- /benchmarking/fusion_preds_to_matrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "usage: $0 preds.collected\n\n"; 7 | 8 | my $preds_file = $ARGV[0] or die $usage; 9 | 10 | 11 | main: { 12 | 13 | my %prognames; 14 | my %fusion_preds; 15 | 16 | open(my $fh, $preds_file) or die $!; 17 | my $header = <$fh>; 18 | unless ($header =~ /^sample\tprog/) { 19 | die "Error, missing expected header format for $preds_file"; 20 | } 21 | while(<$fh>) { 22 | chomp; 23 | my @x = split(/\t/); 24 | my $sample_name = $x[0]; 25 | my $prog = $x[1]; 26 | my $fusion_name = uc $x[2]; 27 | my $J = $x[3]; 28 | my $S = $x[4]; 29 | 30 | my $sum_JS = $J + $S; 31 | 32 | $fusion_name = "$sample_name|$fusion_name"; 33 | 34 | $prognames{$prog} = 1; 35 | 36 | $fusion_preds{$fusion_name}->{$prog} = $sum_JS; 37 | 38 | } 39 | close $fh; 40 | 41 | 42 | ## output matrix 43 | my @prognames = sort keys %prognames; 44 | my @fusions = sort keys %fusion_preds; 45 | 46 | print "\t" . join("\t", @prognames) . "\n"; 47 | 48 | foreach my $fusion (@fusions) { 49 | my @vals = ($fusion); 50 | foreach my $progname (@prognames) { 51 | my $val = $fusion_preds{$fusion}->{$progname} || 0; 52 | push (@vals, $val); 53 | } 54 | 55 | print join("\t", @vals) . "\n"; 56 | } 57 | 58 | exit(0); 59 | } 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /benchmarking/fusion_progs_agree_to_matrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "usage: $0 fusions.byProgAgree [min_progs=3]\n\n"; 7 | 8 | my $progs_agree_file = $ARGV[0] or die $usage; 9 | my $min_progs_agree = $ARGV[1] || 3; 10 | 11 | main: { 12 | 13 | my %prognames; 14 | my %fusion_preds; 15 | 16 | open(my $fh, $progs_agree_file) or die $!; 17 | while(<$fh>) { 18 | chomp; 19 | my @x = split(/\t/); 20 | my $fusion_name = $x[0]; 21 | my $prog_list = $x[1]; 22 | my $count_fusions = $x[2]; 23 | 24 | if ($count_fusions < $min_progs_agree) { 25 | next; 26 | } 27 | 28 | my @progs = split(/,/, $prog_list); 29 | foreach my $prog (@progs) { 30 | 31 | $prognames{$prog} = 1; 32 | 33 | $fusion_preds{$fusion_name}->{$prog} = 1; 34 | } 35 | } 36 | close $fh; 37 | 38 | 39 | ## output matrix 40 | my @prognames = sort keys %prognames; 41 | my @fusions = sort keys %fusion_preds; 42 | 43 | print "\t" . join("\t", @prognames) . "\n"; 44 | 45 | foreach my $fusion (@fusions) { 46 | my @vals = ($fusion); 47 | foreach my $progname (@prognames) { 48 | my $val = $fusion_preds{$fusion}->{$progname} || 0; 49 | push (@vals, $val); 50 | } 51 | 52 | print join("\t", @vals) . "\n"; 53 | } 54 | 55 | exit(0); 56 | } 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /benchmarking/fusion_sample_TPs_to_matrix.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "usage: $0 preds.collected.scored\n\n"; 7 | 8 | my $preds_file = $ARGV[0] or die $usage; 9 | 10 | 11 | main: { 12 | 13 | my %prog_to_sample_to_TP; 14 | my %samples; 15 | 16 | open(my $fh, $preds_file) or die $!; 17 | while(<$fh>) { 18 | if (/^\#/) { next; } 19 | chomp; 20 | my @x = split(/\t/); 21 | 22 | my $score_type = $x[0]; 23 | my $prog = $x[1]; 24 | my $sample_name = $x[2]; 25 | my $fusion = $x[3]; 26 | 27 | if ($score_type eq 'TP') { 28 | $prog_to_sample_to_TP{$prog}->{$sample_name}->{$fusion}++; 29 | } 30 | 31 | $samples{$sample_name}++; 32 | 33 | 34 | } 35 | close $fh; 36 | 37 | 38 | ## output matrix 39 | my @prognames = sort keys %prog_to_sample_to_TP; 40 | my @samplenames = keys %samples; 41 | 42 | print "\t" . join("\t", @prognames) . "\n"; 43 | 44 | foreach my $sample (@samplenames) { 45 | my @vals = ($sample); 46 | foreach my $prog (@prognames) { 47 | my @TP_fusions; 48 | if (exists $prog_to_sample_to_TP{$prog}->{$sample}) { 49 | @TP_fusions = keys %{$prog_to_sample_to_TP{$prog}->{$sample}}; 50 | } 51 | my $num_TP = scalar(@TP_fusions); 52 | push (@vals, $num_TP); 53 | } 54 | print join("\t", @vals) . "\n"; 55 | } 56 | 57 | exit(0); 58 | } 59 | 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /benchmarking/notes: -------------------------------------------------------------------------------- 1 | 2 | 3 | ~/GITHUB/CTAT_FUSIONS/FusionAnnotator/FusionAnnotator --annotate preds.collected -C 2 > preds.collected.wAnnot 4 | 5 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/filter_collected_preds.pl preds.collected.wAnnot > preds.collected.wAnnot.filt 6 | 7 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/collected_preds_to_fusion_prog_support_listing.pl preds.collected.wAnnot.filt > preds.collected.wAnnot.filt.byProg 8 | 9 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/define_truth_n_unsure_set.pl preds.collected.wAnnot.filt.byProg 4 10 | 11 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/fusion_preds_to_TP_FP_FN.wrapper.pl --fusion_preds preds.collected.wAnnot.filt --truth_fusions preds.collected.wAnnot.filt.byProg.min_4.truth_set --unsure_fusions preds.collected.wAnnot.filt.byProg.min_4.unsure_set --allow_reverse_fusion > min4.score 12 | 13 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/all_TP_FP_FN_to_ROC.pl min4.score > min4.score.roc 14 | 15 | 16 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/plotters/plot_ROC.Rscript min4.score.roc 17 | 18 | 19 | 20 | 21 | ## sensitivty vs. expression 22 | 23 | fusion_preds_sensitivity_vs_expr.pl preds.collected.scored fusion_TPM_values.dat > sensitivity_vs_expr.dat 24 | 25 | ~/GITHUB/trinityrnaseq/Analysis/DifferentialExpression/PtR -m sensitivity_vs_expr.dat --heatmap --sample_clust none --heatmap_colorscheme 'black,yellow' 26 | 27 | -------------------------------------------------------------------------------- /benchmarking/plotters/AUC_barplot.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) != 1) { 6 | stop("require param: file.auc ") 7 | } 8 | 9 | 10 | auc = args[1] 11 | 12 | data = read.table(auc, header=F) 13 | 14 | 15 | library('data.table') 16 | 17 | colnames(data) = c('progname', 'auc') 18 | 19 | dt = data.table(data) 20 | 21 | 22 | 23 | barplot_filename = paste0(auc, ".barplot.pdf") 24 | pdf(barplot_filename) 25 | 26 | prognames = levels(dt[,progname]) 27 | colors = rainbow(length(prognames)) 28 | names(colors) = prognames 29 | 30 | dt = dt[order(-auc),] 31 | 32 | ordered_prognames = dt[,progname] 33 | barplot(dt[,auc], names.arg=ordered_prognames, las=2, cex.axis=0.5, cex.names=0.5, col=colors[ordered_prognames]) 34 | 35 | 36 | dev.off() 37 | 38 | -------------------------------------------------------------------------------- /benchmarking/plotters/AUC_boxplot.from_separate_auc_files.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) < 2) { 6 | stop("require param: fileA.auc fileB.auc ... ") 7 | } 8 | 9 | 10 | summary_table = NULL 11 | 12 | for (auc in args) { 13 | 14 | data = read.table(auc, header=F, row.names=1) 15 | data = t(data) 16 | rownames(data) = c(auc) 17 | 18 | if (is.null(summary_table)) { 19 | summary_table = data 20 | } 21 | else { 22 | 23 | summary_table = rbind(summary_table, data[,colnames(summary_table)]) 24 | } 25 | 26 | } 27 | 28 | 29 | summary_table = summary_table[,rev(order(apply(summary_table, 2, median)))] 30 | 31 | print(summary_table) 32 | 33 | write.table(summary_table, file="auc.summary_table.txt", quote=F, sep="\t") 34 | 35 | pdf("auc.boxplot.pdf") 36 | boxplot(summary_table, las=2, outline=F) 37 | 38 | dev.off() 39 | 40 | -------------------------------------------------------------------------------- /benchmarking/plotters/AUC_boxplot.from_single_summary_AUC_file.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) < 1) { 6 | stop("require param: summary_all.auc ") 7 | } 8 | 9 | 10 | 11 | auc = args[1] 12 | 13 | data = read.table(auc, header=F) 14 | 15 | l = split(data, data[,1]) 16 | 17 | l2 = lapply(l, function(x) { x[,2] }) 18 | 19 | 20 | pdf_filename = paste0(auc, ".AUC_boxplot.pdf") 21 | pdf(pdf_filename) 22 | boxplot(l2[ rev(order(sapply(l2, function(x) { median(x) })) ) ], las=2, outline=F) 23 | 24 | dev.off() 25 | 26 | -------------------------------------------------------------------------------- /benchmarking/plotters/plotPRcurves.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # contributed by Bo Li, mod by bhaas 4 | 5 | argv = commandArgs(TRUE) 6 | if (length(argv) != 2) { 7 | cat("Usage: Rscript plotPRcurves.R input.table output.pdf\n") 8 | q(status = 1) 9 | } 10 | 11 | lwd=1 12 | 13 | plotPR = function(id, progs, data, colors) { 14 | idx = data[,1] == progs[id] 15 | if (id == 1) { 16 | plot(data[idx,2], data[idx,3], type = 'l', lwd = lwd, col = colors[id], lty = id, xlim = c(0, 1), ylim = c(0, 1), xlab = "Recall", ylab = "Precision") 17 | } else { 18 | par(new = T) 19 | plot(data[idx,2], data[idx,3], type = 'l', lwd = lwd, col = colors[id], lty = id, xlim = c(0, 1), ylim = c(0, 1), xlab = "", ylab = "") 20 | } 21 | } 22 | 23 | data = read.table(argv[1], header=T) 24 | progs = levels(data[,1]) 25 | colors = rainbow(length(progs)) 26 | 27 | pdf(argv[2]) 28 | par(mar = c(5, 4, 8, 2) + 0.1, xpd = TRUE) 29 | a = lapply(1:length(progs), plotPR, progs, data, colors) 30 | legend(x = -0.06, y = 1.3, legend = progs, ncol = 3, lwd = lwd, col = colors, lty = 1:length(progs), cex = 0.54) 31 | dev.off() 32 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_AUC_50_vs_101_boxplots.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) != 3) { 6 | stop("require params: auc.50.dat auc.101.dat plot_name.pdf") 7 | } 8 | 9 | auc_50_dat_filename = args[1] 10 | auc_101_dat_filename = args[2] 11 | plot_name = args[3] 12 | 13 | auc_50_dat = read.table(auc_50_dat_filename, header=F) 14 | auc_101_dat = read.table(auc_101_dat_filename, header=F) 15 | 16 | df_50_dat = data.frame(read_type=c('len50'), progname=auc_50_dat[,1], auc=auc_50_dat[,2]) 17 | df_101_dat = data.frame(read_type=c('len101'), progname=auc_101_dat[,1], auc=auc_101_dat[,2]) 18 | 19 | all_dat = rbind(df_50_dat, df_101_dat) 20 | 21 | 22 | ## plot it 23 | library('ggplot2') 24 | library('data.table') 25 | 26 | dt = data.table(all_dat) 27 | 28 | #dt_len101 = dt[read_type == "len101"] 29 | 30 | #dt_auc_median = dt_len101[,.(auc.median=median(auc)), by=.(progname)][order(-auc.median)] 31 | 32 | #dt_auc_median = dt[,.(auc.median=median(auc)), by=.(progname)][order(-auc.median)] 33 | 34 | dt_auc_mean = dt[,.(auc.mean=mean(auc)), by=.(progname)][order(-auc.mean)] 35 | 36 | all_dat$progname = factor(all_dat$progname, levels=factor(dt_auc_mean$progname)) 37 | all_dat$read_type = factor(all_dat$read_type, levels=factor(c('len101', 'len50'))) 38 | 39 | 40 | p = ggplot(all_dat, aes(factor(progname), auc)) + 41 | geom_boxplot(aes(fill=read_type), outlier.shape=NA) + 42 | theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)); 43 | 44 | pdf(plot_name) 45 | plot(p) 46 | 47 | dev.off() 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_F1_vs_min_frags.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors = FALSE) 4 | 5 | args<-commandArgs(TRUE) 6 | 7 | if (length(args) == 0) { 8 | stop("require param: min_X.results.scored.ROC") 9 | } 10 | 11 | roc_file = args[1] 12 | 13 | library(ggplot2) 14 | library(dplyr) 15 | 16 | data = read.table(roc_file, header=T) 17 | 18 | p = data %>% filter(min_sum_frags <= 20) %>% ggplot(aes(x=min_sum_frags, y=F1, color=prog)) + geom_line() 19 | 20 | pdf_filename = paste0(roc_file, ".F1_vs_minFrags.pdf") 21 | pdf(pdf_filename, width=8) 22 | 23 | plot(p) 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_TP_FP_vs_minSum_per_prog.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors = FALSE) 4 | 5 | args<-commandArgs(TRUE) 6 | 7 | if (length(args) == 0) { 8 | stop("require param: min_X.results.scored.ROC") 9 | } 10 | 11 | roc_file = args[1] 12 | 13 | library(ggplot2) 14 | library(dplyr) 15 | library(tidyr) 16 | 17 | data = read.table(roc_file, header=T) 18 | 19 | max_TP = max(data$TP) 20 | 21 | p = data %>% filter(min_sum_frags<20) %>% gather(key='TPFP', value='count', TP, FP) %>% ggplot(aes(x=min_sum_frags, y=count, color=TPFP)) + geom_point() + facet_wrap(~prog) + geom_hline(yintercept = max_TP) + ylim(0,1.5*max_TP) 22 | 23 | pdf_filename = paste0(roc_file, ".TP_and_FP_counts_vs_minFrags_eaProg.pdf") 24 | pdf(pdf_filename) 25 | 26 | plot(p) 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_all_auc_barplots.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library(ggplot2) 4 | library(dplyr) 5 | library(tidyr) 6 | library('data.table') 7 | 8 | 9 | args<-commandArgs(TRUE) 10 | 11 | if (length(args) != 2) { 12 | stop("require param: low_val high_val") 13 | } 14 | 15 | low_val = as.numeric(args[1]) 16 | high_val = as.numeric(args[2]) 17 | 18 | pdf("all.auc.dat.pdf", height=20) 19 | 20 | data = read.table("all.auc.dat", header=T) 21 | 22 | num_experiments = high_val - low_val + 1 23 | num_plots = num_experiments*4 24 | 25 | layout(matrix(1:num_plots, nrow=num_experiments, ncol=4, byrow=T)) 26 | 27 | par(mar=c(2,2,2,2)) 28 | 29 | dt = data.table(data) 30 | 31 | prognames = levels(dt[,progname]) 32 | 33 | colors = rainbow(length(prognames)) 34 | names(colors) = prognames 35 | 36 | rankings = list() 37 | 38 | for (mt in seq(low_val, high_val)) { 39 | for (iu in c(0,1)) { 40 | for (okp in c(0,1)) { 41 | 42 | title = sprintf("mA=%d, iu=%d, okp=%d", mt, iu, okp) 43 | 44 | mini_dt = dt[min_thresh==mt & ignoreUnsure==iu & okpara==okp,] 45 | mini_dt = mini_dt[order(-auc),] 46 | print(mini_dt) 47 | ordered_prognames = mini_dt[,progname] 48 | barplot(mini_dt[,auc], names.arg=ordered_prognames, las=2, cex.axis=0.5, cex.names=0.5, col=colors[ordered_prognames], main=title) 49 | 50 | ordered_prognames = as.character(ordered_prognames) 51 | for (rank_val in seq(length(ordered_prognames))) { 52 | myprogname = ordered_prognames[rank_val] 53 | prog_auc = mini_dt[rank_val,auc] 54 | if (myprogname %in% names(rankings)) { 55 | rankings[[ myprogname ]] = rbind(rankings[[ myprogname ]], data.frame(rankval=rank_val, mt=mt, iu=iu, okp=okp, auc=prog_auc)) 56 | } else { 57 | rankings[[ myprogname ]] = data.frame(rankval=rank_val, mt=mt, iu=iu, okp=okp, auc=prog_auc) 58 | } 59 | } 60 | 61 | } 62 | } 63 | } 64 | 65 | dev.off() 66 | 67 | 68 | ############################# 69 | ## Examine relative rankings: 70 | 71 | rankings.table = do.call(rbind, lapply(names(rankings), function(x) { cbind(prog=x, rankings[[x]]) })) 72 | 73 | write.table(rankings.table, file="all.auc.rankings.dat", quote=F, sep="\t") 74 | 75 | 76 | 77 | for (iu_val in c(0,1)) { 78 | for (okp_val in c(0,1)) { 79 | 80 | pdf(sprintf("all.auc.rankings.iu=%d.okp=%d.boxplot.pdf", iu_val, okp_val)) 81 | 82 | title = sprintf("iu=%d, okp=%d", iu_val, okp_val) 83 | 84 | filtered_table = rankings.table %>% filter(iu==iu_val & okp==okp_val) 85 | ranked.progs = filtered_table %>% group_by(prog) %>% summarize(mr=median(rankval)) %>% arrange(mr) 86 | 87 | filtered_table$prog = factor(filtered_table$prog, levels=factor(ranked.progs$prog)) 88 | 89 | p = filtered_table %>% ggplot(aes(as.factor(prog), rankval)) + geom_boxplot() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + ggtitle(title) 90 | 91 | plot(p) 92 | 93 | dev.off() 94 | 95 | output_rankings_file = sprintf("all.auc.rankings.iu=%d.okp=%d.dat", iu_val, okp_val) 96 | write.table(ranked.progs, file=output_rankings_file, quote=F, sep="\t") 97 | } 98 | } 99 | 100 | 101 | 102 | ## show how the iu and okp params change status for each prog 103 | pdf("all.auc.rankings_per_prog_adj.boxplot.pdf") 104 | p = rankings.table %>% mutate(combo_iu_okp=sprintf("%i,%i", iu, okp)) %>% ggplot(aes(combo_iu_okp, rankval)) + geom_boxplot() + facet_wrap(~prog) 105 | plot(p) 106 | dev.off() 107 | 108 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_before_vs_after_filt_TP_FP_compare.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | if (length(args) != 2) { 6 | stop("usage: plot_before_vs_after_filt_TP_FP_compare.Rscript before.best.dat after.best.dat") 7 | } 8 | 9 | 10 | before_dat_filename = args[1] 11 | after_dat_filename = args[2] 12 | 13 | pdf_filename = paste0(after_dat_filename, '.before_vs_after.pdf') 14 | pdf(pdf_filename, width=10) 15 | 16 | 17 | before_dat = read.table(before_dat_filename, header=T, row.names=1) 18 | after_dat = read.table(after_dat_filename, header=T, row.names=1) 19 | 20 | prognames = rownames(before_dat) 21 | merged_df = data.frame(prog=prognames, 22 | before_TP=before_dat[prognames,'TP'], before_FP=before_dat[prognames,'FP'], 23 | after_TP=after_dat[prognames,'TP'], after_FP=after_dat[prognames,'FP']) 24 | 25 | rownames(merged_df) = prognames 26 | 27 | max_TP = max(merged_df$before_TP, merged_df$after_TP) 28 | max_FP = max(merged_df$before_FP, merged_df$after_FP) 29 | 30 | plot(0,0, type='n', xlim=c(0,max_FP), ylim=c(0,max_TP), xlab='FP', ylab='TP') 31 | 32 | colors = rainbow(length(prognames)) 33 | 34 | i=0 35 | for (progname in prognames) { 36 | prog_data = merged_df[progname,,drop=F] 37 | 38 | print(prog_data) 39 | 40 | before_TP = prog_data$before_TP[1] 41 | before_FP = prog_data$before_FP[1] 42 | 43 | after_TP = prog_data$after_TP[1] 44 | after_FP = prog_data$after_FP[1] 45 | 46 | i = i + 1 47 | arrows(before_FP, before_TP, after_FP, after_TP, col=colors[i], length=0.15, lwd=2) 48 | print(c(before_FP, before_TP, after_FP, after_TP)) 49 | text(before_FP, before_TP, labels=progname, col=colors[i], cex=0.6, pos=4) 50 | } 51 | 52 | merged_dat_filename = paste0(after_dat_filename, ".before_vs_after.dat") 53 | write.table(merged_df, file=merged_dat_filename, quote=F, sep="\t") 54 | 55 | dev.off() 56 | 57 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_median_accuracy_ranking_vs_median_runtime.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors = FALSE) 4 | 5 | args<-commandArgs(TRUE) 6 | 7 | if (length(args) != 2) { 8 | stop("require params: cancer_cell_lines/all.auc.rankings.dat all_progs_cancer/runtimes.txt") 9 | } 10 | 11 | all_auc_rankings_file = args[1] 12 | runtimes_file = args[2] 13 | 14 | library(tidyverse) 15 | 16 | 17 | auc_data = read.table(all_auc_rankings_file) 18 | 19 | med_rank_data = auc_data %>% group_by(prog) %>% summarize(med_rank=median(rankval)) 20 | 21 | runtime_data = read.table(runtimes_file, header=T) 22 | 23 | med_runtime_data = runtime_data %>% group_by(prog) %>% summarize(med_runtime=median(time_h, na.rm=T)) 24 | 25 | write.table(med_runtime_data, file='med_runtime_data.tsv', quote=F, sep='\t', row.names=F) 26 | write.table(med_rank_data, file='med_rank_data.tsv', quote=F, sep='\t', row.names=F) 27 | 28 | 29 | #p = data %>% group_by(prog) %>% filter(! is.na(F1)) %>% filter(F1 == max(F1)) %>% ggplot(aes(x=PPV, y=TPR, color=prog)) + geom_point() 30 | # 31 | #pdf_filename = paste0(roc_file, ".tpr_ppv_at_maxF1_scatter.pdf") 32 | #pdf(pdf_filename) 33 | # 34 | #plot(p) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_peak_F1_scatter.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors = FALSE) 4 | 5 | args<-commandArgs(TRUE) 6 | 7 | if (length(args) == 0) { 8 | stop("require param: min_X.results.scored.ROC") 9 | } 10 | 11 | roc_file = args[1] 12 | 13 | library(ggplot2) 14 | library(dplyr) 15 | 16 | data = read.table(roc_file, header=T) 17 | 18 | 19 | peak_F1_data = data %>% group_by(prog) %>% filter(! is.na(F1)) %>% filter(F1 == max(F1)) %>% arrange(desc(F1)) 20 | 21 | p = peak_F1_data %>% ggplot(aes(x=PPV, y=TPR, color=prog, shape=prog)) + geom_point() + scale_shape_manual(values=rep(seq(0,25), 2)) 22 | 23 | pdf_filename = paste0(roc_file, ".tpr_ppv_at_maxF1_scatter.pdf") 24 | pdf(pdf_filename, width=9, height=4) 25 | 26 | plot(p) 27 | 28 | 29 | peak_F1_dat_file = paste0(roc_file, ".tpr_ppv_at_maxF1.dat") 30 | write.table(peak_F1_data, file=peak_F1_dat_file, quote=F, sep="\t") 31 | 32 | -------------------------------------------------------------------------------- /benchmarking/plotters/plot_upsetR.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors = FALSE) 4 | 5 | args<-commandArgs(TRUE) 6 | 7 | if (length(args) == 0) { 8 | stop("require param: matrix") 9 | } 10 | 11 | prog_agree_matrix = args[1] 12 | 13 | library(ggplot2) 14 | library(dplyr) 15 | library(tidyr) 16 | library(UpSetR) 17 | 18 | 19 | pdf_filename = sprintf("%s.UpSetR.pdf", prog_agree_matrix) 20 | pdf(pdf_filename, width=11) 21 | 22 | data = read.table(prog_agree_matrix, header=T) 23 | 24 | upset(data, number.angles=90, nsets=1000, nintersects=1000) 25 | 26 | dev.off() 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/analyze_Edgren_subset.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use FindBin; 7 | use File::Basename; 8 | use lib ("$FindBin::Bin/../../PerlLib"); 9 | use Pipeliner; 10 | use Process_cmd; 11 | use Cwd; 12 | 13 | if (basename(cwd()) ne "Edgren_subset") { 14 | die "Error, must run this while in the cancer_cell_lines/ directory."; 15 | } 16 | 17 | 18 | my $benchmark_data_basedir = "$FindBin::Bin/../.."; 19 | my $benchmark_toolkit_basedir = "$FindBin::Bin/../../benchmarking"; 20 | 21 | 22 | main: { 23 | 24 | my $pipeliner = &init_pipeliner(); 25 | 26 | my $cmd = "$benchmark_toolkit_basedir/collected_preds_to_fusion_prog_support_listing.pl preds.collected.gencode_mapped.wAnnot.filt.edgren ../progs_select.txt > preds.collected.gencode_mapped.wAnnot.filt.edgren.byProgAgree"; 27 | $pipeliner->add_commands(new Command($cmd, "edgren.byProgAgree.ok")); 28 | 29 | ## need the unsure set defined. Basically, treat everything non-unique as unsure. 30 | #$cmd = "$benchmark_toolkit_basedir/define_truth_n_unsure_set.pl preds.collected.gencode_mapped.wAnnot.filt.edgren.byProgAgree 1000"; 31 | #$pipeliner->add_commands(new Command($cmd, "define_min_agree.ok")); 32 | 33 | ## evaluate predictions: 34 | 35 | $cmd = "$benchmark_toolkit_basedir/fusion_preds_to_TP_FP_FN.pl " 36 | . " --truth_fusions edgren.truthset.raw " 37 | . " --fusion_preds preds.collected.gencode_mapped.wAnnot.filt.edgren " 38 | . " --allow_reverse_fusion " 39 | . " --allow_paralogs $benchmark_data_basedir/resources/paralog_clusters.dat " 40 | #. " --unsure_fusions preds.collected.gencode_mapped.wAnnot.filt.edgren.byProgAgree.min_1000.unsure_set " 41 | . " > preds.collected.gencode_mapped.wAnnot.filt.edgren.scored "; 42 | 43 | $pipeliner->add_commands(new Command($cmd, "edgren.TP_FP_FN.ok")); 44 | 45 | my $roc_file = "preds.collected.gencode_mapped.wAnnot.filt.edgren.scored.ROC"; 46 | 47 | $cmd = "$benchmark_toolkit_basedir/all_TP_FP_FN_to_ROC.pl preds.collected.gencode_mapped.wAnnot.filt.edgren.scored > $roc_file"; 48 | $pipeliner->add_commands(new Command($cmd, "edgren.roc.ok")); 49 | 50 | # plot ROC 51 | $cmd = "$benchmark_toolkit_basedir/plotters/plot_ROC.Rscript $roc_file"; 52 | $pipeliner->add_commands(new Command($cmd, "edgren.plot_roc.ok")); 53 | 54 | # plot F1 55 | $cmd = "$benchmark_toolkit_basedir/plotters/plot_F1_vs_min_frags.R $roc_file"; 56 | $pipeliner->add_commands(new Command($cmd, "edgren.plot_F1_vs_min_frags.ok")); 57 | 58 | $cmd = "$benchmark_toolkit_basedir/plotters/plot_peak_F1_scatter.R $roc_file"; 59 | $pipeliner->add_commands(new Command($cmd, "edgren.plot_peak_F1_scatter.ok")); 60 | 61 | # plot TP vs FP counts according to min frags per orog 62 | $cmd = "$benchmark_toolkit_basedir/plotters/plot_TP_FP_vs_minSum_per_prog.R $roc_file"; 63 | $pipeliner->add_commands(new Command($cmd, "edgren.plot_TP_FP_vs_minFrags.ok")); 64 | 65 | 66 | ################################### 67 | # convert to Precision-Recall curve 68 | 69 | my $PR_file = "preds.collected.gencode_mapped.wAnnot.filt.edgren.scored.PR"; 70 | 71 | $cmd = "$benchmark_toolkit_basedir/calc_PR.py --in_ROC $roc_file --min_read_support 3 --out_PR $PR_file | sort -k2,2gr | tee $PR_file.AUC"; 72 | $pipeliner->add_commands(new Command($cmd, "edgren.pr.ok")); 73 | 74 | # plot PR curve 75 | $cmd = "$benchmark_toolkit_basedir/plotters/plotPRcurves.R $PR_file $PR_file.plot.pdf"; 76 | $pipeliner->add_commands(new Command($cmd, "edgren.plot_pr.ok")); 77 | 78 | # plot AUC barplot 79 | $cmd = "$benchmark_toolkit_basedir/plotters/AUC_barplot.Rscript $PR_file.AUC"; 80 | $pipeliner->add_commands(new Command($cmd, "edgren.plot_pr_auc_barplot.ok")); 81 | 82 | $pipeliner->run(); 83 | 84 | } 85 | 86 | 87 | #### 88 | sub init_pipeliner { 89 | 90 | my $pipeliner = new Pipeliner(-verbose => 2, -cmds_log => 'pipe.log'); 91 | my $checkpoint_dir = cwd() . "/_checkpoints"; 92 | unless (-d $checkpoint_dir) { 93 | mkdir $checkpoint_dir or die "Error, cannot mkdir $checkpoint_dir"; 94 | } 95 | $pipeliner->set_checkpoint_dir($checkpoint_dir); 96 | 97 | return($pipeliner); 98 | } 99 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -f ./fusion_result_file_listing.dat ./preds.* ./pipe.log ./all* ./auc_files.list 4 | rm -rf ./_* 5 | 6 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/edgren.truthset: -------------------------------------------------------------------------------- 1 | ######################## 2 | ## BT474 ############### 3 | 4 | # Edgren 5 | BT474|ACACA--STAC2 6 | BT474|DIDO1--TTI1 7 | BT474|CPNE1--PI3 8 | BT474|GLB1--CMTM7 9 | BT474|LAMP1--MCF2L 10 | BT474|RAB22A--MYO9B 11 | BT474|RPS6KB1--SNF8 12 | BT474|SKA2--MYO19 13 | BT474|STARD3--DOK5 14 | BT474|VAPB--IKZF3 15 | BT474|ZMYND8--CEP250 16 | 17 | # Kangaspeaka 18 | BT474|AHCTF1--NAAA 19 | BT474|MED1--ACSF2 20 | BT474|MED1--STXBP4 21 | BT474|MED13--BCAS3 22 | BT474|PIP4K2B--RAD51C 23 | BT474|STX16--RAE1 24 | BT474|THRA--AC090627.1 25 | BT474|TOB1--SYNRG 26 | BT474|TRPC4AP--MRPL45 27 | BT474|USP32--MED1 28 | 29 | # Asmann, 2011 30 | BT474|LIMA1--USP22 31 | BT474|ACACA--STAC2 32 | BT474|FAM102A--CIZ1 33 | BT474|GLB1--CMTM7 34 | BT474|MED1--STXBP4 35 | BT474|PIP4K2B--RAD51C 36 | BT474|RAB22A--MYO9B 37 | BT474|RPS6KB1--SNF8 38 | BT474|STARD3--DOK5 39 | BT474|TRPC4AP--MRPL45 40 | BT474|ZMYND8--CEP250 41 | 42 | 43 | 44 | ######################## 45 | ## KPL4 ################ 46 | 47 | # Edgren 48 | KPL4|BSG--NFIX 49 | KPL4|PPP1R12A--SEPT10 50 | KPL4|NOTCH1--NUP214 51 | 52 | 53 | ####################### 54 | ## MCF7 ############### 55 | 56 | # Edgren 57 | MCF7|ARFGEF2--SULF2 58 | MCF7|BCAS4--BCAS3 59 | MCF7|RPS6KB1--VMP1 60 | 61 | # Kangaspeaka 62 | MCF7|AC099850.1--VMP1 63 | MCF7|GCN1L1--MSI1 64 | MCF7|SMARCA4--CARM1 65 | 66 | # Asmann, 2011 67 | MCF7|ADAMTS19--SLC27A6 68 | MCF7|ARFGEF2--SULF2 69 | MCF7|ATXN7L3--FAM171A2 70 | MCF7|BCAS4--BCAS3 71 | MCF7|GCN1L1--MSI1 72 | MCF7|RPS6KB1--DIAPH3 73 | MCF7|SULF2--PRICKLE2 74 | MCF7|MYH9--EIF3D 75 | 76 | # Maher, PNAS, 2009 77 | MCF7|AHCYL1--RAD51C 78 | MCF7|ARFGEF2--SULF2 79 | MCF7|ARHGAP19--DRG1 80 | MCF7|BCAS4--BCAS3 81 | MCF7|PAPOLA--AK7 82 | MCF7|MYO9B--FCHO1 83 | 84 | 85 | ######################## 86 | ## SKBR3 ############### 87 | 88 | # Edgren 89 | SKBR3|ANKHD1--PCDH1 90 | SKBR3|CCDC85C--SETD3 91 | SKBR3|CSE1L--AL035685.1 92 | SKBR3|CYTH1--EIF3H 93 | SKBR3|DHX35--ITCH 94 | SKBR3|NFS1--PREX1 95 | SKBR3|PREX1--CPNE1 96 | SKBR3|RARA--PKIA 97 | SKBR3|SUMF1--LRRFIP2 98 | SKBR3|TATDN1--GSDMB 99 | SKBR3|WDR67--ZNF704 100 | 101 | 102 | # Asmann, 2011 103 | SKBR3|KLHDC2--SNTB1 104 | 105 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/edgren.truthset.raw: -------------------------------------------------------------------------------- 1 | BT474|ACACA--STAC2 2 | BT474|AHCTF1--NAAA 3 | BT474|CPNE1--PI3 4 | BT474|DIDO1--TTI1 5 | BT474|FAM102A--CIZ1 6 | BT474|GLB1--CMTM7 7 | BT474|LAMP1--MCF2L 8 | BT474|LIMA1--USP22 9 | BT474|MED1--ACSF2 10 | BT474|MED1--STXBP4 11 | BT474|MED13--BCAS3 12 | BT474|PIP4K2B--RAD51C 13 | BT474|RAB22A--MYO9B 14 | BT474|RPS6KB1--SNF8 15 | BT474|SKA2--MYO19 16 | BT474|STARD3--DOK5 17 | BT474|STX16--RAE1 18 | BT474|THRA--AC090627.1 19 | BT474|TOB1--SYNRG 20 | BT474|TRPC4AP--MRPL45 21 | BT474|USP32--MED1 22 | BT474|VAPB--IKZF3 23 | BT474|ZMYND8--CEP250 24 | KPL4|BSG--NFIX 25 | KPL4|NOTCH1--NUP214 26 | KPL4|PPP1R12A--SEPT10 27 | MCF7|AC099850.1--VMP1 28 | MCF7|ADAMTS19--SLC27A6 29 | MCF7|AHCYL1--RAD51C 30 | MCF7|ARFGEF2--SULF2 31 | MCF7|ARHGAP19--DRG1 32 | MCF7|ATXN7L3--FAM171A2 33 | MCF7|BCAS4--BCAS3 34 | MCF7|GCN1L1--MSI1 35 | MCF7|MYH9--EIF3D 36 | MCF7|MYO9B--FCHO1 37 | MCF7|PAPOLA--AK7 38 | MCF7|RPS6KB1--DIAPH3 39 | MCF7|RPS6KB1--VMP1 40 | MCF7|SMARCA4--CARM1 41 | MCF7|SULF2--PRICKLE2 42 | SKBR3|ANKHD1--PCDH1 43 | SKBR3|CCDC85C--SETD3 44 | SKBR3|CSE1L--AL035685.1 45 | SKBR3|CYTH1--EIF3H 46 | SKBR3|DHX35--ITCH 47 | SKBR3|KLHDC2--SNTB1 48 | SKBR3|NFS1--PREX1 49 | SKBR3|PREX1--CPNE1 50 | SKBR3|RARA--PKIA 51 | SKBR3|SUMF1--LRRFIP2 52 | SKBR3|TATDN1--GSDMB 53 | SKBR3|WDR67--ZNF704 54 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/eval_edgren_min_agree.consolidated.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "usage: $0 consolidated_edgren_predictions.dat min_agree\n\n"; 7 | 8 | my $input_file = $ARGV[0] or die $usage; 9 | my $min_agree = $ARGV[1] or die $usage; 10 | 11 | main: { 12 | 13 | my @progs_to_count = `cat ../progs_select.txt`; 14 | chomp @progs_to_count; 15 | 16 | my %progs_to_count = map { + $_ => 1 } @progs_to_count; 17 | 18 | 19 | open(my $fh, $input_file) or die $!; 20 | 21 | my %fusion_to_prog; 22 | my %orig_fusion_call; 23 | my %prognames; 24 | 25 | my $header = <$fh>; 26 | while(<$fh>) { 27 | chomp; 28 | my @x = split(/\t/); 29 | my $sample_name = $x[2]; 30 | my $progname = $x[3]; 31 | my $fusion_name = $x[6]; 32 | 33 | my @y = split(/\|/, $fusion_name); 34 | 35 | my ($left_entry, $right_entry) = split(/--/, $y[1]); 36 | 37 | $prognames{$progname}++; 38 | 39 | my $alt_fusion_name = "$sample_name|$left_entry--$right_entry"; 40 | $orig_fusion_call{$alt_fusion_name}->{$fusion_name}++; 41 | $fusion_to_prog{$alt_fusion_name}->{$progname}++; 42 | 43 | $alt_fusion_name = "$sample_name|$right_entry--$left_entry"; 44 | $orig_fusion_call{$alt_fusion_name}->{$fusion_name}++; 45 | $fusion_to_prog{$alt_fusion_name}->{$progname}++; 46 | 47 | 48 | } 49 | 50 | ## capture those fusions that meet the min prog criteria 51 | 52 | my %fusions_meet_min_prog_count; 53 | 54 | foreach my $fusion_name (keys %fusion_to_prog) { 55 | 56 | my $orig_fusion_names_href = $orig_fusion_call{$fusion_name}; 57 | my @orig_fusion_cand_names = sort {$orig_fusion_names_href->{$b}<=>$orig_fusion_names_href->{$a}} keys %$orig_fusion_names_href; 58 | 59 | my $orig_fusion_name = $orig_fusion_cand_names[0]; 60 | 61 | my $prog_count = scalar(grep { $progs_to_count{$_} } keys %{$fusion_to_prog{$fusion_name}}); 62 | 63 | #print "$fusion_name\t$orig_fusion_name\t$prog_count\n"; 64 | 65 | if ($prog_count >= $min_agree) { 66 | $fusions_meet_min_prog_count{$orig_fusion_name} = 1; 67 | } 68 | } 69 | 70 | 71 | ## generate report 72 | my @prognames = sort keys %prognames; 73 | 74 | print "\t" . join("\t", @prognames) . "\n"; 75 | 76 | my @final_fusions = sort keys %fusions_meet_min_prog_count; 77 | 78 | foreach my $fusion (@final_fusions) { 79 | 80 | my @vals = ($fusion); 81 | foreach my $progname (@prognames) { 82 | my $found = (exists $fusion_to_prog{$fusion}->{$progname}) ? 1 : 0; 83 | push (@vals, $found); 84 | } 85 | 86 | print join("\t", @vals) . "\n"; 87 | } 88 | 89 | exit(0); 90 | 91 | } 92 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/eval_edgren_min_agree.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "usage: $0 preds.collected.gencode_mapped.wAnnot.filt.edgren min_agree\n\n"; 7 | 8 | my $input_file = $ARGV[0] or die $usage; 9 | my $min_agree = $ARGV[1] or die $usage; 10 | 11 | main: { 12 | 13 | my @progs_to_count = `cat ../progs_select.txt`; 14 | chomp @progs_to_count; 15 | 16 | my %progs_to_count = map { + $_ => 1 } @progs_to_count; 17 | 18 | 19 | open(my $fh, $input_file) or die $!; 20 | 21 | my %fusion_to_prog; 22 | my %orig_fusion_call; 23 | my %prognames; 24 | 25 | my $header = <$fh>; 26 | while(<$fh>) { 27 | chomp; 28 | my @x = split(/\t/); 29 | my $sample_name = $x[0]; 30 | my $progname = $x[1]; 31 | my $fusion_name = $x[2]; 32 | 33 | my ($left_fusion_name, $right_fusion_name) = split(/--/, $fusion_name); 34 | 35 | $fusion_name = "$sample_name|$fusion_name"; 36 | 37 | $prognames{$progname}++; 38 | 39 | my $alt_fusion_names_left = $x[5]; 40 | my $alt_fusion_names_right = $x[6]; 41 | 42 | my @left_entries = split(/,/, $alt_fusion_names_left); 43 | my @right_entries = split(/,/, $alt_fusion_names_right); 44 | 45 | @left_entries = grep { defined($_) } @left_entries; 46 | @right_entries = grep { defined($_) } @right_entries; 47 | 48 | 49 | 50 | unless (grep {/^$left_fusion_name$/} @left_entries) { 51 | push (@left_entries, $left_fusion_name); 52 | } 53 | unless (grep {/^$right_fusion_name$/} @right_entries) { 54 | push (@right_entries, $right_fusion_name); 55 | } 56 | 57 | 58 | foreach my $left_entry (@left_entries) { 59 | foreach my $right_entry (@right_entries) { 60 | 61 | my $alt_fusion_name = "$sample_name|$left_entry--$right_entry"; 62 | $orig_fusion_call{$alt_fusion_name}->{$fusion_name}++; 63 | $fusion_to_prog{$alt_fusion_name}->{$progname}++; 64 | 65 | $alt_fusion_name = "$sample_name|$right_entry--$left_entry"; 66 | $orig_fusion_call{$alt_fusion_name}->{$fusion_name}++; 67 | $fusion_to_prog{$alt_fusion_name}->{$progname}++; 68 | 69 | } 70 | } 71 | 72 | } 73 | 74 | ## capture those fusions that meet the min prog criteria 75 | 76 | my %fusions_meet_min_prog_count; 77 | 78 | foreach my $fusion_name (keys %fusion_to_prog) { 79 | 80 | my $orig_fusion_names_href = $orig_fusion_call{$fusion_name}; 81 | my @orig_fusion_cand_names = sort {$orig_fusion_names_href->{$b}<=>$orig_fusion_names_href->{$a}} keys %$orig_fusion_names_href; 82 | 83 | my $orig_fusion_name = $orig_fusion_cand_names[0]; 84 | 85 | my $prog_count = scalar(grep { $progs_to_count{$_} } keys %{$fusion_to_prog{$fusion_name}}); 86 | 87 | #print "$fusion_name\t$orig_fusion_name\t$prog_count\n"; 88 | 89 | if ($prog_count >= $min_agree) { 90 | $fusions_meet_min_prog_count{$orig_fusion_name} = 1; 91 | } 92 | } 93 | 94 | 95 | 96 | ## generate report 97 | my @prognames = sort keys %prognames; 98 | 99 | print "\t" . join("\t", @prognames) . "\n"; 100 | 101 | my @final_fusions = sort keys %fusions_meet_min_prog_count; 102 | 103 | foreach my $fusion (@final_fusions) { 104 | 105 | my @vals = ($fusion); 106 | foreach my $progname (@prognames) { 107 | my $found = (exists $fusion_to_prog{$fusion}->{$progname}) ? 1 : 0; 108 | push (@vals, $found); 109 | } 110 | 111 | print join("\t", @vals) . "\n"; 112 | } 113 | 114 | exit(0); 115 | 116 | } 117 | -------------------------------------------------------------------------------- /cancer_cell_lines/Edgren_subset/runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | 6 | cat ../preds.collected.gencode_mapped.wAnnot.filt | egrep '^(sample|BT474|MCF7|KPL4|SKBR3)' > preds.collected.gencode_mapped.wAnnot.filt.edgren 7 | 8 | 9 | 10 | ## analyze accuracy 11 | ./analyze_Edgren_subset.pl 12 | 13 | 14 | ## examine enrichment for valid fusions among minProgs 15 | ./examine_validated_enrichment.R edgren.truthset.raw preds.collected.gencode_mapped.wAnnot.filt.edgren.scored 16 | 17 | 18 | ## examine min3 agree Venn 19 | 20 | ./eval_edgren_min_agree.consolidated.pl consolidated_edgren_predictions.dat 3 > edgren.min3 21 | 22 | ../../benchmarking/plotters/plot_upsetR.R edgren.min3 23 | 24 | 25 | 26 | 27 | ## run through standard analysis for curiosity sake 28 | ../../benchmarking/Venn_analysis_strategy.pl preds.collected.gencode_mapped.wAnnot.filt.edgren ../progs_select.txt 3 10 29 | -------------------------------------------------------------------------------- /cancer_cell_lines/SuppTable-cancer_cell_lines.csv: -------------------------------------------------------------------------------- 1 | Cancer cell line,Cancer type,Data source,accession or analysis_id 2 | BT474,breast,SRA,SRP003186 3 | G20476.DMS_454.2,lung,CCLE,1d3e9dae-b558-4187-b8a0-e79b2b307f3f 4 | G20495.786-O.2,kidney,CCLE,166efd97-7b71-4089-be92-d8d006f86c3b 5 | G20498.KYSE-180.2,head-neck,CCLE,a1c5e568-4169-48fe-9d84-7a70d61ceea0 6 | G20500.IGR-37.2,skin,CCLE,1138456d-ecc5-45f4-a169-9ba373d5d71a 7 | G25214.MKN7.1,stomach,CCLE,23056430-2489-4922-ac0b-299b7f43e74e 8 | G25225.NCI-H522.1,lung,CCLE,9da8f1f7-d606-4fe8-ab14-c9fe2a4b7afc 9 | G26175.A172.2,brain,CCLE,0534dd1a-0287-484d-939f-b2e5f37688c3 10 | G26182.KMS-12-BM.2,lymphoid,CCLE,04778047-db4d-4b8c-b77c-b910fa8c9e12 11 | G26199.LN-229.2,brain,CCLE,2dc62ec8-476f-4d4a-a51f-729673e63f80 12 | G26212.A-673.2,sarcomatoid,CCLE,ed412801-81e9-4777-942a-95079b7044e1 13 | G26216.KP-2.2,pancreas,CCLE,77c15c63-b42f-4194-a346-6e8bf98ac23b 14 | G26228.Hs_683.2,brain,CCLE,6af225a0-fcc0-471b-838f-2bf1c0ded8d3 15 | G26236.NCI-H716.2,colo-rectal,CCLE,95e1c652-a7f2-41f6-9a30-503b8c7c37a5 16 | G26249.KMS-26.2,lymphoid,CCLE,d54514e6-1825-46cd-a394-53886430033d 17 | G26253.KMS-34.2,lymphoid,CCLE,905fffad-e784-47b7-a617-6df5ec7e604c 18 | G26262.NCI-H889.2,lung,CCLE,a3e91ee2-aa54-4acf-8eb2-45d795ec5188 19 | G27214.PC-3.1,other,CCLE,811a96a5-a7f2-4082-9044-e6972c7316a9 20 | G27219.Panc_03.27.1,pancreas,CCLE,c5b5dba9-1eee-43ec-a097-dc72d45b20da 21 | G27233.A-498.1,kidney,CCLE,ce9f1b08-8a07-416a-ab43-90e011092b08 22 | G27259.AN3_CA.1,uterus,CCLE,3f1b9533-8773-4ca8-8c43-98101cbba096 23 | G27280.TC-71.1,sarcomatoid,CCLE,04ae78b3-1f09-4d55-ac56-8600d73ed8a5 24 | G27367.BFTC-909.1,kidney,CCLE,1c5f708f-252b-49d0-b16d-78d5a5344e7d 25 | G27376.COLO_792.1,skin,CCLE,5c9df572-dd10-420f-980a-17ae004dfc08 26 | G27453.SNU-398.2,liver,CCLE,15ea12f5-1702-48d7-8cd0-98659cddb7e2 27 | G27463.SK-MEL-1.2,skin,CCLE,8f1e5d67-c86c-4b0d-9e21-5803f7b79432 28 | G27476.PK-59.2,pacreas,CCLE,3051f39a-7881-4d7b-a13b-c5395e5b4ef3 29 | G27479.SK-MEL-3.2,skin,CCLE,f6870a19-fe07-4afb-95c4-204c35bfa95a 30 | G27488.SNU-620.2,stomach,CCLE,b146a903-8d81-4d13-a630-22c944087bf4 31 | G27516.SK-MEL-28.2,skin,CCLE,4a77f393-6c2f-47d7-b7e5-411982bf75a6 32 | G27544.SF268.2,brain,CCLE,f399f213-694c-4463-b855-132f433df92d 33 | G28011.KLE.1,uterus,CCLE,f43766e8-108d-4438-89ab-75d0dbe55997 34 | G28034.MDA-MB-361.1,breast,CCLE,a337c425-4314-40c6-a40a-a444781bd1b7 35 | G28045.KYSE-270.1,head-neck,CCLE,067cb65d-b578-4dd6-95db-60f2d2ae040e 36 | G28050.KMM-1.1,lymphoid,CCLE,1bae551f-db0a-4b96-92ca-2211d0d5265b 37 | G28054.KYSE-520.1,head-neck,CCLE,67d2c686-e66b-489f-a6b3-0ed36a56ba00 38 | G28070.LN-18.1,brain,CCLE,d62d67e3-da37-4046-8be5-07d2ac93f47f 39 | G28072.MDA-MB-175-VII.1,breast,CCLE,ea450165-4ec4-431f-a21b-ebdea26d9794 40 | G28077.MG-63.1,sarcomatoid,CCLE,8c7f184a-ddde-4510-90b6-99a9252b9128 41 | G28081.JHH-7.1,liver,CCLE,b6118100-0a8c-4153-ba64-5db6e1e820fa 42 | G28087.MDA-MB-436.1,breast,CCLE,cd1ad136-1e68-4f08-8341-216382301fd8 43 | G28535.OVTOKO.1,ovary,CCLE,24799388-008d-4011-a181-f4b2070b8bb0 44 | G28545.NUGC-2.1,stomach,CCLE,7cba5a73-b8a7-46c9-8175-eeca8bd483ac 45 | G28575.OUMS-23.1,colo-rectal,CCLE,75b57b0c-df5b-429b-a024-3777b6291bf0 46 | G28610.MHH-ES-1.1,sarcomatoid,CCLE,b1c89d5f-841b-41b6-87ac-d5407bbf8293 47 | G30594.UACC-893.1,breast,CCLE,52646ad2-d86d-4fc5-ba5c-a5beeed25f39 48 | G30631.SU-DHL-10.1,other,CCLE,fe8b3333-45eb-41bc-88b8-cfc3b9666591 49 | G41663.OVISE.5,ovary,CCLE,2d1f25d4-7245-4a2f-bd6f-6f5b6c3e5d0d 50 | G41682.KYSE-510.5,head-neck,CCLE,8a027ed1-8767-48a6-ae39-d4f29417c0e8 51 | G41706.RT4.5,urinary bladder,CCLE,55c315f0-6c8c-4a04-86e5-fc73f35b3619 52 | G41709.FaDu.5,head-neck,CCLE,192ac756-5175-4acd-92be-357cdf926d33 53 | G41710.SNU-16.5,stomach,CCLE,becae7d6-fdb7-4f63-8947-40b799a8d544 54 | G41724.HGC-27.5,stomach,CCLE,f1faef0c-e58c-4eb3-9e3b-e5dc3791fa14 55 | H2228,lung,SRA,DRR016705 56 | K562,lymphoid,SRA,SRR521460 57 | KPL4,breast,SRA,SRP003186 58 | LC2Ad,lung,SRA,DRR016709 59 | MCF7,breast,SRA,SRP003186 60 | SKBR3,breast,SRA,SRP003186 61 | VCaP_85,prostate,SRA,SRR1217085 62 | -------------------------------------------------------------------------------- /cancer_cell_lines/analyze_cancer_data.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | use FindBin; 7 | use Cwd; 8 | use File::Basename; 9 | use lib ("$FindBin::Bin/../PerlLib"); 10 | use Pipeliner; 11 | use Process_cmd; 12 | 13 | 14 | my $restricted_progs_file = $ARGV[0] || ""; 15 | 16 | unless ($ENV{FUSION_ANNOTATOR}) { 17 | 18 | if (-d "$ENV{HOME}/GITHUB/CTAT_FUSIONS/FusionAnnotator") { 19 | $ENV{FUSION_ANNOTATOR} = "~/GITHUB/CTAT_FUSIONS/FusionAnnotator"; 20 | } 21 | else { 22 | die "Error, must set env var FUSION_ANNOTATOR to point to base dir of\n" 23 | . " git clone https://github.com/FusionAnnotator/FusionAnnotator.git\n" 24 | . " (after having installed it) "; 25 | } 26 | } 27 | 28 | unless ($ENV{TRINITY_HOME}) { 29 | die "Error, must specify env var TRINITY_HOME to trinity base installation directory"; 30 | } 31 | 32 | 33 | if (basename(cwd()) ne "cancer_cell_lines") { 34 | die "Error, must run this while in the cancer_cell_lines/ directory."; 35 | } 36 | 37 | 38 | my $benchmark_data_basedir = "$FindBin::Bin/.."; 39 | my $benchmark_toolkit_basedir = "$FindBin::Bin/../benchmarking"; 40 | my $fusion_annotator_basedir = $ENV{FUSION_ANNOTATOR}; 41 | my $trinity_home = $ENV{TRINITY_HOME}; 42 | 43 | 44 | main: { 45 | 46 | my $pipeliner = &init_pipeliner(); 47 | 48 | ## create file listing 49 | my $cmd = "find ./samples -type f | $benchmark_data_basedir/util/make_file_listing_input_table.pl $restricted_progs_file > fusion_result_file_listing.dat"; 50 | $pipeliner->add_commands(new Command($cmd, "fusion_file_listing.ok")); 51 | 52 | # collect predictions 53 | $cmd = "$benchmark_toolkit_basedir/collect_preds.pl fusion_result_file_listing.dat > preds.collected"; 54 | $pipeliner->add_commands(new Command($cmd, "collect_preds.ok")); 55 | 56 | # map fusion predictions to gencode gene symbols based on identifiers or chromosomal coordinates. 57 | $cmd = "$benchmark_toolkit_basedir/map_gene_symbols_to_gencode.pl " 58 | . " preds.collected " 59 | . " $benchmark_data_basedir/resources/genes.coords.gz " 60 | . " $benchmark_data_basedir/resources/genes.aliases " 61 | . " > preds.collected.gencode_mapped "; 62 | 63 | $pipeliner->add_commands(new Command($cmd, "gencode_mapped.ok")); 64 | 65 | # annotate 66 | $cmd = "$fusion_annotator_basedir/FusionAnnotator --annotate preds.collected.gencode_mapped -C 2 > preds.collected.gencode_mapped.wAnnot"; 67 | $pipeliner->add_commands(new Command($cmd, "annotate_fusions.ok")); 68 | 69 | # filter HLA and mitochondrial features 70 | $cmd = "$benchmark_toolkit_basedir/filter_collected_preds.pl preds.collected.gencode_mapped.wAnnot > preds.collected.gencode_mapped.wAnnot.filt"; 71 | $pipeliner->add_commands(new Command($cmd, "filter_fusion_annot.ok")); 72 | 73 | # generate and plot correlation matrix for predicted fusions by prog 74 | $cmd = "$benchmark_toolkit_basedir/fusion_preds_to_matrix.pl preds.collected.gencode_mapped.wAnnot.filt > preds.collected.gencode_mapped.wAnnot.filt.matrix"; 75 | $pipeliner->add_commands(new Command($cmd, "pred_cor_matrix.ok")); 76 | 77 | $cmd = "$trinity_home/Analysis/DifferentialExpression/PtR -m preds.collected.gencode_mapped.wAnnot.filt.matrix --binary --sample_cor_matrix --heatmap_colorscheme 'black,yellow' "; 78 | $pipeliner->add_commands(new Command($cmd, "pred_cor_matrix_plot.ok")); 79 | 80 | 81 | 82 | ## remove edgren set: 83 | $cmd = "bash -c 'set -eou pipefail; cat preds.collected.gencode_mapped.wAnnot.filt | egrep -v \"^(BT474|KPL4|MCF7|SKBR3)\" > preds.collected.gencode_mapped.wAnnot.filt.noEdgren'"; 84 | $pipeliner->add_commands(new Command($cmd, "rmEdgren.ok")); 85 | 86 | ## run Venn-based accuracy analysis: 87 | 88 | $cmd = "$benchmark_toolkit_basedir/Venn_analysis_strategy.pl preds.collected.gencode_mapped.wAnnot.filt.noEdgren progs_select.txt 3 10"; 89 | $pipeliner->add_commands(new Command($cmd, "venn_analysis.ok")); 90 | 91 | 92 | $pipeliner->run(); 93 | 94 | exit(0); 95 | 96 | 97 | } 98 | 99 | 100 | #### 101 | sub init_pipeliner { 102 | 103 | my $pipeliner = new Pipeliner(-verbose => 2, -cmds_log => 'pipe.log'); 104 | my $checkpoint_dir = cwd() . "/_checkpoints"; 105 | unless (-d $checkpoint_dir) { 106 | mkdir $checkpoint_dir or die "Error, cannot mkdir $checkpoint_dir"; 107 | } 108 | $pipeliner->set_checkpoint_dir($checkpoint_dir); 109 | 110 | return($pipeliner); 111 | } 112 | 113 | -------------------------------------------------------------------------------- /cancer_cell_lines/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -f ./fusion_result_file_listing.dat ./preds.* ./pipe.log ./all* ./auc_files.list ./*.dat ./*.pdf 4 | rm -rf ./_* 5 | 6 | cd Edgren_subset && ./cleanMe.sh 7 | -------------------------------------------------------------------------------- /cancer_cell_lines/progs_select.txt: -------------------------------------------------------------------------------- 1 | ARRIBA 2 | ChimPipe 3 | ChimeraScan 4 | deFuse 5 | EricScript 6 | FUSIONCATCHER_v1.10_June192019 7 | FusionHunter 8 | InFusion 9 | JAFFA-Assembly 10 | JAFFA-Direct 11 | MapSplice 12 | nFuse 13 | PIZZLY 14 | PRADA 15 | SOAP-fuse 16 | STARCHIP_csm10_pGm2_May012019 17 | STARSEQR 18 | #STAR_FUSION_v1.5 19 | TopHat-Fusion 20 | TrinityFusion-C 21 | -------------------------------------------------------------------------------- /cancer_cell_lines/runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | if [ ! -d samples ]; then 6 | wget -r --no-parent https://data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_cancer_cell_lines/samples/ 7 | find data.broadinstitute.org/|grep html | xargs -n1 rm -f 8 | mv data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_cancer_cell_lines/samples . 9 | rm -rf ./data.broadinstitute.org 10 | fi 11 | 12 | 13 | 14 | ./analyze_cancer_data.pl $* 15 | 16 | 17 | ## Edgren subset study 18 | 19 | if [ $* ]; then 20 | cd Edgren_subset && ./runMe.sh 21 | fi 22 | -------------------------------------------------------------------------------- /cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | 6 | dirs=(simulated_data cancer_cell_lines runtime_analysis) 7 | 8 | for dir in ${dirs[*]} 9 | do 10 | cd $dir 11 | ./cleanMe.sh 12 | cd ../ 13 | done 14 | 15 | rm -rf ./figs_for_paper 16 | 17 | -------------------------------------------------------------------------------- /progs_restrict.txt: -------------------------------------------------------------------------------- 1 | ARRIBA 2 | CHIMERASCAN 3 | CHIMPIPE 4 | DEFUSE 5 | ERICSCRIPT 6 | FUSIONCATCHER_v1.10_June192019 7 | FUSIONHUNTER 8 | INFUSION_CF3 9 | JAFFA_ASSEMBLY 10 | JAFFA_DIRECT 11 | JAFFA_HYBRID 12 | MAPSPLICE 13 | NFUSE 14 | PIZZLY 15 | PRADA 16 | SOAP_FUSE 17 | STARCHIP_csm10_pGm2_May012019 18 | STAR_FUSION_v1.5_hg19_Apr042019 19 | STARSEQR_STAR-SEQR 20 | TOPHAT_FUSION 21 | TRINITY_FUSION_C_hg19 22 | TRINITY_FUSION_D_hg19 23 | TRINITY_FUSION_UC_hg19 24 | -------------------------------------------------------------------------------- /resources/genes.coords.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/resources/genes.coords.gz -------------------------------------------------------------------------------- /resources/notes: -------------------------------------------------------------------------------- 1 | 2 | ## hg19 coordinate liftovers and gene symbol aliases: 3 | 4 | using: /home/unix/bhaas/FUS/util/make_hg19_gene_coords_file.pl to extract liftover coords. 5 | 6 | ie. 7 | 8 | liftOver -gff ref_annot.gtf /home/unix/bhaas/utilities/hg38ToHg19.over.chain.gz mapped unmapped 9 | ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/gtf_to_gene_spans.pl mapped > mapped.coords 10 | 11 | then aggregate all coords across all hg19-transposed annotations to gene.coords.gz file. 12 | 13 | 14 | ## paralog clustering 15 | 16 | see: notes.paralog_clustering.2020.txt 17 | 18 | -------------------------------------------------------------------------------- /resources/notes.paralog_clustering.2020.txt: -------------------------------------------------------------------------------- 1 | ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/gtf_file_to_feature_seqs.pl --gtf_file gencode.v22.annotation.gtf --genome_fa GRCh38.primary_assembly.genome.fa --seqType CDSplus > gencode.v22.annotation.cdsplus.fa 2 | 3 | 4 | ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/gtf_file_to_feature_seqs.pl --gtf_file gencode.v19.annotation.gtf --genome_fa GRCh37.p13.genome.primary.fa --seqType CDSplus > gencode.v19.annotation.cdsplus.fa 5 | 6 | 7 | c GRCh37/gencode.v19.annotation.cdsplus.fa | perl -lane 's/>/>gv19./; print;' > GRCh37/gencode.v19.annotation.cdsplus.fa.mod 8 | 9 | c GRCh38/gencode.v22.annotation.cdsplus.fa | perl -lane 's/>/>gv22./; print;' > GRCh38/gencode.v22.annotation.cdsplus.fa.mod 10 | 11 | c GRCh37/gencode.v19.annotation.cdsplus.fa.mod GRCh38/gencode.v22.annotation.cdsplus.fa.mod > gencode.combined.cdsplus.fa 12 | 13 | makeblastdb -in gencode.combined.cdsplus.fa -dbtype nucl 14 | 15 | mkdir tmpdir; ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/dfam_repeat_masker.pl --dfam_hmm /seq/RNASEQ/TOOLS/DFAM/homo_sapiens_dfam.hmm --target_fa gencode.combined.cdsplus.fa --out_masked gencode.combined.cdsplus.dfam_masked.fa --CPU 10 --tmpdir ./tmpdir 2>&1 | tee run.dfam.log 16 | 17 | ## blast 18 | 19 | makeblastdb -in gencode.combined.cdsplus.dfam_masked.fa -dbtype nucl 20 | 21 | blastn -query gencode.combined.cdsplus.dfam_masked.fa -db gencode.combined.cdsplus.dfam_masked.fa -max_target_seqs 10000 -outfmt 6 -evalue 1e-3 -lcase_masking -num_threads 20 -word_size 11 > blast_pairs.outfmt6 22 | 23 | 24 | ## prep for paralog clustering 25 | 26 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/outfmt6_add_percent_match_length.group_segments.pl blast_pairs.outfmt6 gencode.combined.cdsplus.fa gencode.combined.cdsplus.fa > blast_pairs.outfmt6.grouped 27 | 28 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/blast_outfmt6_replace_trans_id_w_gene_symbol.pl gencode.combined.cdsplus.dfam_masked.fa blast_pairs.outfmt6.grouped > blast_pairs.outfmt6.grouped.genesym 29 | 30 | cat blast_pairs.outfmt6.grouped.genesym | sort -k4,4g -k3,3gr > blast_pairs.outfmt6.grouped.genesym.sorted 31 | 32 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/get_top_blast_pairs.pl blast_pairs.outfmt6.grouped.genesym.sorted > blast_pairs.outfmt6.grouped.genesym.sorted.top 33 | 34 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/outfmt6_add_percent_match_length.group_segments.to_Markov_Clustering.pl --outfmt6_grouped blast_pairs.outfmt6.grouped.genesym.sorted.top --min_pct_len 1 --min_per_id 90 --inflation_factor 3 35 | -------------------------------------------------------------------------------- /resources/paralog_clusters.dat: -------------------------------------------------------------------------------- 1 | paralog_clusters.2020.I3.dat -------------------------------------------------------------------------------- /runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | 6 | ## Run analyses first for the simulated data and then for the cancer cell line data. 7 | 8 | dirs=(simulated_data cancer_cell_lines runtime_analysis) 9 | 10 | for dir in ${dirs[*]} 11 | do 12 | cd $dir 13 | ./runMe.sh $* 14 | cd ../ 15 | done 16 | 17 | 18 | 19 | # gather main and supp. figures for paper 20 | 21 | ./util/__get_figs_for_paper.pl . 22 | 23 | -------------------------------------------------------------------------------- /runtime_analysis/STAR_F_multicore/runtimes.txt: -------------------------------------------------------------------------------- 1 | sample prog time_h 2 | G20476.DMS_454.2 1core 1.98 3 | G20476.DMS_454.2 2core 1.23 4 | G20476.DMS_454.2 5core 1.01 5 | G20476.DMS_454.2 10core 0.82 6 | G20495.786-O.2 1core 2.01 7 | G20495.786-O.2 2core 1.03 8 | G20495.786-O.2 5core 0.53 9 | G20495.786-O.2 10core 0.36 10 | G20498.KYSE-180.2 1core 1.75 11 | G20498.KYSE-180.2 2core 0.97 12 | G20498.KYSE-180.2 5core 0.54 13 | G20498.KYSE-180.2 10core 0.51 14 | G20500.IGR-37.2 1core 1.78 15 | G20500.IGR-37.2 2core 1.01 16 | G20500.IGR-37.2 5core 0.55 17 | G20500.IGR-37.2 10core 0.52 18 | G25214.MKN7.1 1core 1.83 19 | G25214.MKN7.1 2core 1.06 20 | G25214.MKN7.1 5core 0.63 21 | G25214.MKN7.1 10core 0.63 22 | G25225.NCI-H522.1 1core 1.81 23 | G25225.NCI-H522.1 2core 1.02 24 | G25225.NCI-H522.1 5core 0.60 25 | G25225.NCI-H522.1 10core 0.46 26 | G26175.A172.2 1core 1.90 27 | G26175.A172.2 2core 1.06 28 | G26175.A172.2 5core 0.56 29 | G26175.A172.2 10core 0.52 30 | G26182.KMS-12-BM.2 1core 1.95 31 | G26182.KMS-12-BM.2 2core 1.07 32 | G26182.KMS-12-BM.2 5core 0.67 33 | G26182.KMS-12-BM.2 10core 0.43 34 | G26199.LN-229.2 1core 1.88 35 | G26199.LN-229.2 2core 1.06 36 | G26199.LN-229.2 5core 0.55 37 | G26199.LN-229.2 10core 0.39 38 | G26212.A-673.2 1core 1.93 39 | G26212.A-673.2 2core 1.07 40 | G26212.A-673.2 5core 0.56 41 | G26212.A-673.2 10core 0.57 42 | -------------------------------------------------------------------------------- /runtime_analysis/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | rm -f ./STAR_F_multicore/runtimes.txt.boxplot.pdf ./all_progs_cancer/runtimes.txt.boxplot.pdf 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /runtime_analysis/runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | ../util/boxplot_runtimes.Rscript ./STAR_F_multicore/runtimes.txt 6 | 7 | ../util/boxplot_runtimes.Rscript ./all_progs_cancer/runtimes.txt 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /simulated_data/SuppTable-sim_reads.csv: -------------------------------------------------------------------------------- 1 | Simulated read length,simulated read data set name,modeled on original data,original data source 2 | sim_50,sim_adipose,ERR030880,ArrayExpress E-MTAB-513:ERR030880 3 | sim_50,sim_brain,ERR030882,ArrayExpress E-MTAB-513:ERR030882 4 | sim_50,sim_colon,ERR030884,ArrayExpress E-MTAB-513:ERR030884 5 | sim_50,sim_heart,ERR030886,ArrayExpress E-MTAB-513:ERR030886 6 | sim_50,sim_testis,ERR030873,ArrayExpress E-MTAB-513:ERR030873 7 | sim_101,sim1_reads,G27488.SNU-620.2,CCLE:b146a903-8d81-4d13-a630-22c944087bf4 8 | sim_101,sim2_reads,G28535.OVTOKO,CCLE:24799388-008d-4011-a181-f4b2070b8bb0 9 | sim_101,sim3_reads,G25214.MKN7,CCLE:23056430-2489-4922-ac0b-299b7f43e74e 10 | sim_101,sim4_reads,G30608.SW_780,CCLE:a3e56efd-459e-44a2-be92-1abf59ee7ff3 11 | sim_101,sim5_reads,G27376.COLO_792.1,CCLE:5c9df572-dd10-420f-980a-17ae004dfc08 12 | -------------------------------------------------------------------------------- /simulated_data/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dirs=(sim_101 sim_50) 4 | 5 | for dir in ${dirs[*]} 6 | do 7 | cd $dir 8 | ./cleanMe.sh 9 | cd .. 10 | done 11 | 12 | rm -f ./*.pdf 13 | 14 | 15 | -------------------------------------------------------------------------------- /simulated_data/runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | dirs=(sim_50 sim_101) 6 | 7 | # run analyses separately for the sim_50 and sim_101 data 8 | for dir in ${dirs[*]} 9 | do 10 | cd $dir 11 | ./runMe.sh $* 12 | cd ../ 13 | done 14 | 15 | #################################### 16 | # combine results into single figure 17 | 18 | ## allow rev 19 | ../benchmarking/plotters/plot_AUC_50_vs_101_boxplots.Rscript sim_50/__analyze_allow_reverse/all.AUC.dat sim_101/__analyze_allow_reverse/all.AUC.dat allow_rev.combined.pdf 20 | 21 | ## allow rev & paralogs-ok 22 | ../benchmarking/plotters/plot_AUC_50_vs_101_boxplots.Rscript sim_50/__analyze_allow_rev_and_paralogs/all.AUC.dat sim_101/__analyze_allow_rev_and_paralogs/all.AUC.dat allow_rev_and_paralogs.combined.pdf 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /simulated_data/sim_101/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -f ./fusion_result_file_listing.dat ./preds.* ./pipe.log 4 | rm -rf ./_* 5 | 6 | -------------------------------------------------------------------------------- /simulated_data/sim_101/runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | if [ ! -d samples ]; then 6 | wget -r --no-parent https://data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_101/samples/ 7 | find data.broadinstitute.org/|grep html | xargs -n1 rm -f 8 | mv data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_101/samples . 9 | rm -rf ./data.broadinstitute.org 10 | fi 11 | 12 | 13 | ../analyze_simulated_data.pl sim_101.truth_set.dat sim_101.fusion_TPM_values.dat $* 14 | -------------------------------------------------------------------------------- /simulated_data/sim_50/cleanMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | rm -f ./fusion_result_file_listing.dat ./preds.* ./pipe.log ./log ./all.AUC.dat 4 | rm -rf ./_* 5 | 6 | -------------------------------------------------------------------------------- /simulated_data/sim_50/runMe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ev 4 | 5 | if [ ! -d samples ]; then 6 | wget -r --no-parent https://data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_50/samples/ 7 | find data.broadinstitute.org/|grep html | xargs -n1 rm -f 8 | mv data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_50/samples . 9 | rm -rf ./data.broadinstitute.org 10 | fi 11 | 12 | 13 | ../analyze_simulated_data.pl sim_50.truth_set.dat sim_50.fusion_TPM_values.dat $* 14 | -------------------------------------------------------------------------------- /util/Terra/organize_FI_results_for_benchmarking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import subprocess 5 | import logging 6 | 7 | logging.basicConfig(stream=sys.stderr, level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def main(): 11 | 12 | usage = "\n\tusage: {} local.FI.files.list ANALYSIS_NAME\n\n".format(sys.argv[0]) 13 | if len(sys.argv) < 3: 14 | sys.stderr.write(usage) 15 | sys.exit(1) 16 | 17 | local_files_list = sys.argv[1] 18 | analysis_name = sys.argv[2] 19 | 20 | 21 | translation = get_sample_name_translation() 22 | 23 | with open(local_files_list) as fh: 24 | for filename in fh: 25 | filename = filename.rstrip() 26 | if filename[-3:] != ".gz": 27 | raise RuntimeError("Error, not identifying filename {} as gzipped") 28 | 29 | sample_name = os.path.basename(filename) 30 | sample_name, count = re.subn(".FusionInspector.tsv.gz", "", sample_name) # when FI run as follow-up to starF 31 | if count != 1: 32 | sample_name, count = re.subn(".finspector.FusionInspector.fusions.abridged.tsv.gz", "", sample_name) # when FI run separately 33 | if count != 1: 34 | raise RuntimeError("didn't find .FusionInspector.tsv.gz or .finspector.FusionInspector.fusions.abridged.tsv.gz in sample_name: {}".format(sample_name)) 35 | 36 | 37 | if sample_name in translation: 38 | sample_name = translation[sample_name] 39 | 40 | outdir = "/".join(["samples", sample_name, analysis_name]) 41 | if not os.path.exists(outdir): 42 | os.makedirs(outdir) 43 | 44 | outputfile = os.path.join(outdir, "finspector.fusion_predictions.abridged.tsv") 45 | 46 | logger.info("-writing {}".format(outputfile)) 47 | subprocess.check_call(" ".join(["gunzip", "-c", filename, ">", outputfile]), shell=True) 48 | 49 | 50 | 51 | sys.exit(0) 52 | 53 | 54 | def get_sample_name_translation(): 55 | 56 | translation = dict() 57 | 58 | pairs_txt = """G20476_DMS_454_2 G20476.DMS_454.2 59 | G20495_786-O_2 G20495.786-O.2 60 | G20498_KYSE-180_2 G20498.KYSE-180.2 61 | G20500_IGR-37_2 G20500.IGR-37.2 62 | G25214_MKN7_1 G25214.MKN7.1 63 | G25225_NCI-H522_1 G25225.NCI-H522.1 64 | G26175_A172_2 G26175.A172.2 65 | G26182_KMS-12-BM_2 G26182.KMS-12-BM.2 66 | G26199_LN-229_2 G26199.LN-229.2 67 | G26212_A-673_2 G26212.A-673.2 68 | G26216_KP-2_2 G26216.KP-2.2 69 | G26228_Hs_683_2 G26228.Hs_683.2 70 | G26236_NCI-H716_2 G26236.NCI-H716.2 71 | G26249_KMS-26_2 G26249.KMS-26.2 72 | G26253_KMS-34_2 G26253.KMS-34.2 73 | G26262_NCI-H889_2 G26262.NCI-H889.2 74 | G27214_PC-3_1 G27214.PC-3.1 75 | G27219_Panc_03_27_1 G27219.Panc_03.27.1 76 | G27233_A-498_1 G27233.A-498.1 77 | G27259_AN3_CA_1 G27259.AN3_CA.1 78 | G27280_TC-71_1 G27280.TC-71.1 79 | G27367_BFTC-909_1 G27367.BFTC-909.1 80 | G27376_COLO_792_1 G27376.COLO_792.1 81 | G27453_SNU-398_2 G27453.SNU-398.2 82 | G27463_SK-MEL-1_2 G27463.SK-MEL-1.2 83 | G27476_PK-59_2 G27476.PK-59.2 84 | G27479_SK-MEL-3_2 G27479.SK-MEL-3.2 85 | G27488_SNU-620_2 G27488.SNU-620.2 86 | G27516_SK-MEL-28_2 G27516.SK-MEL-28.2 87 | G27544_SF268_2 G27544.SF268.2 88 | G28011_KLE_1 G28011.KLE.1 89 | G28034_MDA-MB-361_1 G28034.MDA-MB-361.1 90 | G28045_KYSE-270_1 G28045.KYSE-270.1 91 | G28050_KMM-1_1 G28050.KMM-1.1 92 | G28054_KYSE-520_1 G28054.KYSE-520.1 93 | G28070_LN-18_1 G28070.LN-18.1 94 | G28072_MDA-MB-175-VII_1 G28072.MDA-MB-175-VII.1 95 | G28077_MG-63_1 G28077.MG-63.1 96 | G28081_JHH-7_1 G28081.JHH-7.1 97 | G28087_MDA-MB-436_1 G28087.MDA-MB-436.1 98 | G28535_OVTOKO_1 G28535.OVTOKO.1 99 | G28545_NUGC-2_1 G28545.NUGC-2.1 100 | G28575_OUMS-23_1 G28575.OUMS-23.1 101 | G28610_MHH-ES-1_1 G28610.MHH-ES-1.1 102 | G30594_UACC-893_1 G30594.UACC-893.1 103 | G30631_SU-DHL-10_1 G30631.SU-DHL-10.1 104 | G41663_OVISE_5 G41663.OVISE.5 105 | G41682_KYSE-510_5 G41682.KYSE-510.5 106 | G41706_RT4_5 G41706.RT4.5 107 | G41709_FaDu_5 G41709.FaDu.5 108 | G41710_SNU-16_5 G41710.SNU-16.5 109 | G41724_HGC-27_5 G41724.HGC-27.5""" 110 | 111 | for line in pairs_txt.split("\n"): 112 | (before, after) = re.split("\s+", line) 113 | translation[before] = after 114 | 115 | 116 | return translation 117 | 118 | 119 | 120 | if __name__=='__main__': 121 | main() 122 | -------------------------------------------------------------------------------- /util/Terra/organize_StarF_results_for_benchmarking.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import subprocess 5 | import logging 6 | 7 | logging.basicConfig(stream=sys.stderr, level=logging.INFO) 8 | logger = logging.getLogger(__name__) 9 | 10 | def main(): 11 | 12 | usage = "\n\tusage: {} local.starF.files.list ANALYSIS_NAME\n\n".format(sys.argv[0]) 13 | if len(sys.argv) < 3: 14 | sys.stderr.write(usage) 15 | sys.exit(1) 16 | 17 | local_files_list = sys.argv[1] 18 | analysis_name = sys.argv[2] 19 | 20 | translation = get_sample_name_translation() 21 | 22 | 23 | with open(local_files_list) as fh: 24 | for filename in fh: 25 | filename = filename.rstrip() 26 | if filename[-3:] != ".gz": 27 | raise RuntimeError("Error, not identifying filename {} as gzipped") 28 | 29 | sample_name = os.path.basename(filename) 30 | sample_name, count = re.subn(".STAR-Fusion.tsv.gz", "", sample_name) 31 | if count != 1: 32 | raise RuntimeError("didn't find .STAR-Fusion.tsv.gz in sample_name: {}".format(sample_name)) 33 | 34 | if sample_name in translation: 35 | sample_name = translation[sample_name] 36 | 37 | outdir = "/".join(["samples", sample_name, analysis_name]) 38 | if not os.path.exists(outdir): 39 | os.makedirs(outdir) 40 | 41 | outputfile = os.path.join(outdir, "star-fusion.fusion_predictions.abridged.tsv") 42 | 43 | logger.info("-writing {}".format(outputfile)) 44 | subprocess.check_call(" ".join(["gunzip", "-c", filename, ">", outputfile]), shell=True) 45 | 46 | 47 | 48 | sys.exit(0) 49 | 50 | 51 | 52 | def get_sample_name_translation(): 53 | 54 | translation = dict() 55 | 56 | pairs_txt = """G20476_DMS_454_2 G20476.DMS_454.2 57 | G20495_786-O_2 G20495.786-O.2 58 | G20498_KYSE-180_2 G20498.KYSE-180.2 59 | G20500_IGR-37_2 G20500.IGR-37.2 60 | G25214_MKN7_1 G25214.MKN7.1 61 | G25225_NCI-H522_1 G25225.NCI-H522.1 62 | G26175_A172_2 G26175.A172.2 63 | G26182_KMS-12-BM_2 G26182.KMS-12-BM.2 64 | G26199_LN-229_2 G26199.LN-229.2 65 | G26212_A-673_2 G26212.A-673.2 66 | G26216_KP-2_2 G26216.KP-2.2 67 | G26228_Hs_683_2 G26228.Hs_683.2 68 | G26236_NCI-H716_2 G26236.NCI-H716.2 69 | G26249_KMS-26_2 G26249.KMS-26.2 70 | G26253_KMS-34_2 G26253.KMS-34.2 71 | G26262_NCI-H889_2 G26262.NCI-H889.2 72 | G27214_PC-3_1 G27214.PC-3.1 73 | G27219_Panc_03_27_1 G27219.Panc_03.27.1 74 | G27233_A-498_1 G27233.A-498.1 75 | G27259_AN3_CA_1 G27259.AN3_CA.1 76 | G27280_TC-71_1 G27280.TC-71.1 77 | G27367_BFTC-909_1 G27367.BFTC-909.1 78 | G27376_COLO_792_1 G27376.COLO_792.1 79 | G27453_SNU-398_2 G27453.SNU-398.2 80 | G27463_SK-MEL-1_2 G27463.SK-MEL-1.2 81 | G27476_PK-59_2 G27476.PK-59.2 82 | G27479_SK-MEL-3_2 G27479.SK-MEL-3.2 83 | G27488_SNU-620_2 G27488.SNU-620.2 84 | G27516_SK-MEL-28_2 G27516.SK-MEL-28.2 85 | G27544_SF268_2 G27544.SF268.2 86 | G28011_KLE_1 G28011.KLE.1 87 | G28034_MDA-MB-361_1 G28034.MDA-MB-361.1 88 | G28045_KYSE-270_1 G28045.KYSE-270.1 89 | G28050_KMM-1_1 G28050.KMM-1.1 90 | G28054_KYSE-520_1 G28054.KYSE-520.1 91 | G28070_LN-18_1 G28070.LN-18.1 92 | G28072_MDA-MB-175-VII_1 G28072.MDA-MB-175-VII.1 93 | G28077_MG-63_1 G28077.MG-63.1 94 | G28081_JHH-7_1 G28081.JHH-7.1 95 | G28087_MDA-MB-436_1 G28087.MDA-MB-436.1 96 | G28535_OVTOKO_1 G28535.OVTOKO.1 97 | G28545_NUGC-2_1 G28545.NUGC-2.1 98 | G28575_OUMS-23_1 G28575.OUMS-23.1 99 | G28610_MHH-ES-1_1 G28610.MHH-ES-1.1 100 | G30594_UACC-893_1 G30594.UACC-893.1 101 | G30631_SU-DHL-10_1 G30631.SU-DHL-10.1 102 | G41663_OVISE_5 G41663.OVISE.5 103 | G41682_KYSE-510_5 G41682.KYSE-510.5 104 | G41706_RT4_5 G41706.RT4.5 105 | G41709_FaDu_5 G41709.FaDu.5 106 | G41710_SNU-16_5 G41710.SNU-16.5 107 | G41724_HGC-27_5 G41724.HGC-27.5""" 108 | 109 | for line in pairs_txt.split("\n"): 110 | (before, after) = re.split("\s+", line) 111 | translation[before] = after 112 | 113 | 114 | return translation 115 | 116 | 117 | if __name__=='__main__': 118 | main() 119 | -------------------------------------------------------------------------------- /util/__get_figs_for_paper.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "usage: $0 repo_basedir\n"; 7 | 8 | my $basedir = $ARGV[0] or die $usage; 9 | 10 | chdir $basedir or die "Error, cannot cd to $basedir"; 11 | 12 | # make the dir structure 13 | unless (-d "figs_for_paper") { 14 | &process_cmd("mkdir -p figs_for_paper"); 15 | } 16 | 17 | 18 | my @targets_and_dests = ( 19 | 20 | ## Figure 2 21 | 22 | # Fig 2-a 23 | ["simulated_data/allow_rev_and_paralogs.combined.pdf", 24 | "figs_for_paper/fig_2a.sim50_vs_101.boxplots.pdf"], 25 | 26 | # Fig 2-b_top, expression vs. sensitivity heatmap PE 50 27 | ["simulated_data/sim_50/__analyze_allow_rev_and_paralogs/all.scored.preds.sensitivity_vs_expr.dat.genes_vs_samples_heatmap.pdf", 28 | "figs_for_paper/fig_2b_top.sim_50.sens_vs_expr.heatmap.pdf"], 29 | 30 | # Fig 2-b_bottom, expression vs. sensitivity heatmap PE 101 31 | ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.sensitivity_vs_expr.dat.genes_vs_samples_heatmap.pdf", 32 | "figs_for_paper/fig_2b_bottom.sim_101.sens_vs_expr.heatmap.pdf"], 33 | 34 | 35 | ## Figure 3 36 | ["cancer_cell_lines/Edgren_subset/edgren.min3.UpSetR.pdf", 37 | "figs_for_paper/fig_3a.four_breast_cancer_cell_lines_UpSetR_plot_2nd_page.pdf"], 38 | 39 | ["cancer_cell_lines/Edgren_subset/preds.collected.gencode_mapped.wAnnot.filt.edgren.scored.enrich_stats.pdf", 40 | "figs_for_paper/fig_3b.valid_fusion_enrichment_2nd_page.pdf"], 41 | 42 | ## Figure 4 43 | ["cancer_cell_lines/all.auc.rankings.iu\=1.okp\=1.boxplot.pdf", 44 | "figs_for_paper/fig_4a_cancer_leaderboard_rankings.pdf"], 45 | 46 | ["cancer_cell_lines/__min_7_agree/min_7.okPara_ignoreUnsure.results.scored.ROC.tpr_ppv_at_maxF1_scatter.pdf", 47 | "figs_for_paper/fig_4d_peak_accuracy_min7progsagree.pdf"], 48 | 49 | 50 | 51 | #################### 52 | ## Supplementary Figures 53 | 54 | # supp fig 1 55 | ["simulated_data/sim_50/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.TP_and_FP_counts_vs_minFrags_eaProg.pdf", 56 | "figs_for_paper/supp_fig1.pe50_TP_FP_vs_minReads.pdf"], 57 | 58 | # supp fig 2 59 | ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.TP_and_FP_counts_vs_minFrags_eaProg.pdf", 60 | "figs_for_paper/supp_fig2.pe101_TP_FP_vs_minReads.pdf"], 61 | 62 | # supp fig 3a 63 | ["simulated_data/sim_50/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.tpr_ppv_at_maxF1_scatter.pdf", 64 | "figs_for_paper/supp_fig3a.pe50_max_F1_scatter.pdf"], 65 | 66 | # supp fig 3b 67 | ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.tpr_ppv_at_maxF1_scatter.pdf", 68 | "figs_for_paper/supp_fig3b.pe101_max_F1_scatter.pdf"], 69 | 70 | # supp fig 4 71 | ["cancer_cell_lines/okPara_ignoreUnsure.results.scored.ROC.tpr_ppv_at_maxF1.dat.consolidated.scatters.pdf", 72 | "figs_for_paper/supp_fig4.cancer_maxF1_ea_truthset.pdf"], 73 | 74 | # supp fig 5 75 | ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.best.dat.before_vs_after.pdf", 76 | "figs_for_paper/supp_fig5.before_vs_after_paralog_equiv_pe101.pdf"], 77 | 78 | # supp fig 6 79 | ["cancer_cell_lines/preds.collected.gencode_mapped.wAnnot.filt.matrix.binary.sample_cor_matrix.pdf", 80 | "figs_for_paper/supp_fig6.cancer_correlated_preds.pdf"], 81 | 82 | # supp fig 7 83 | ["cancer_cell_lines/all.auc.rankings_per_prog_adj.boxplot.pdf", 84 | "figs_for_paper/supp_fig7.effect_iu_okp_on_cancer_ranking_dist.pdf"], 85 | 86 | # supp fig 8 87 | ["cancer_cell_lines/all.auc.rankings.iu\=1.okp\=0.boxplot.pdf", 88 | "figs_for_paper/supp_fig8.cancer_rankings_equiv_para_off.pdf"], 89 | 90 | 91 | ############################ 92 | ## Supplementary data files 93 | 94 | # supp table 1 95 | ["simulated_data/sim_50/preds.collected.gencode_mapped.wAnnot.filt", 96 | "figs_for_paper/supp_table1.pe50_fusion_filtered_preds.tsv"], 97 | 98 | # supp table 2 99 | ["simulated_data/sim_101/preds.collected.gencode_mapped.wAnnot.filt", 100 | "figs_for_paper/supp_table2.pe101_fusion_filtered_preds.tsv"], 101 | 102 | # supp table 4 103 | ["cancer_cell_lines/preds.collected.gencode_mapped.wAnnot.filt", 104 | "figs_for_paper/supp_table4.cancer_fusion_filtered_preds.tsv"], 105 | 106 | ); 107 | 108 | 109 | 110 | foreach my $target_and_dest (@targets_and_dests) { 111 | 112 | my ($from, $to) = @$target_and_dest; 113 | 114 | &process_cmd("cp $from $to"); 115 | 116 | } 117 | 118 | 119 | 120 | 121 | 122 | exit(0); 123 | 124 | #### 125 | sub process_cmd { 126 | my ($cmd) = @_; 127 | 128 | print "CMD: $cmd\n"; 129 | my $ret = system($cmd); 130 | if ($ret) { 131 | die "Error, CMD: $cmd died with ret $ret"; 132 | } 133 | } 134 | 135 | -------------------------------------------------------------------------------- /util/boxplot_runtimes.Rscript: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | args<-commandArgs(TRUE) 4 | 5 | dat_filename = args[1] 6 | 7 | data = read.table(dat_filename, header=T, sep='\t') 8 | 9 | library('ggplot2') 10 | library('data.table') 11 | 12 | dt = data.table(data) 13 | dt_median_time = dt[,.(median_time_h=median(time_h, na.rm=T)), by=.(prog)][order(median_time_h)] 14 | 15 | data$prog = factor(data$prog, levels=factor(dt_median_time$prog)) 16 | 17 | write.table(dt_median_time, 'median_runtimes.txt', quote=F, sep="\t") 18 | 19 | pdf_filename = paste(dat_filename, '.boxplot.pdf', sep='') 20 | pdf(pdf_filename) 21 | 22 | p = ggplot(data, aes(factor(prog), time_h)) + 23 | geom_boxplot(outlier.shape=NA) + 24 | theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + ylim(c(0,120)) 25 | 26 | plot(p) 27 | 28 | dev.off() 29 | -------------------------------------------------------------------------------- /util/capture_PR_AUC_for_plotting.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "\n\n\tusage: $0 auc_list.files\n\n"; 7 | 8 | my $auc_files_filename = $ARGV[0] or die $usage; 9 | 10 | main: { 11 | 12 | my @files = `cat $auc_files_filename`; 13 | chomp @files; 14 | 15 | print join("\t", "progname", "min_thresh", "ignoreUnsure", "okpara", "auc") . "\n"; 16 | 17 | foreach my $file (@files) { 18 | 19 | $file =~ /min_(\d+)/ or die "Erorr, no min val extracted from $file"; 20 | 21 | my $min_thresh = $1; 22 | 23 | my $ignoreUnsure = 0; 24 | if ($file =~ /ignoreUnsure/) { 25 | $ignoreUnsure = 1; 26 | } 27 | 28 | my $okpara = 0; 29 | if ($file =~ /okPara/) { 30 | $okpara = 1; 31 | } 32 | 33 | my @data = `cat $file`; 34 | chomp @data; 35 | 36 | foreach my $line (@data) { 37 | my ($progname, $auc) = split(/\t/, $line); 38 | print join("\t", $progname, $min_thresh, $ignoreUnsure, $okpara, $auc) . "\n"; 39 | } 40 | 41 | } 42 | 43 | exit(0); 44 | 45 | } 46 | -------------------------------------------------------------------------------- /util/make_file_listing_input_table.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Carp; 6 | 7 | 8 | my %restrict_progs; 9 | if (@ARGV) { 10 | my $restrict_progs_file = $ARGV[0]; 11 | open(my $fh, $restrict_progs_file) or die "Error, cannot open file: $restrict_progs_file"; 12 | while(<$fh>) { 13 | chomp; 14 | unless (/\w/) { next; } 15 | my $progname = $_; 16 | $restrict_progs{$progname} = 1; 17 | } 18 | close $fh; 19 | } 20 | 21 | 22 | ## convert prog name tokens to names used in the data table. 23 | my %converter = (CHIMERASCAN => 'ChimeraScan', 24 | CHIMPIPE => 'ChimPipe', 25 | DEFUSE => 'deFuse', 26 | ERICSCRIPT => 'EricScript', 27 | FUSIONHUNTER => 'FusionHunter', 28 | #FUSION_CATCHER_V0994e => 'FusionCatcher', 29 | INFUSION_CF3 => 'InFusion', 30 | JAFFA_ASSEMBLY => 'JAFFA-Assembly', 31 | JAFFA_DIRECT => 'JAFFA-Direct', 32 | JAFFA_HYBRID => 'JAFFA-Hybrid', 33 | MAPSPLICE => 'MapSplice', 34 | NFUSE => 'nFuse', 35 | PRADA => 'PRADA', 36 | SOAP_FUSE => 'SOAP-fuse', 37 | 'STAR_FUSION_GRCh37v19_FL3_v51b3df4' => 'STAR_FUSION_old', 38 | TOPHAT_FUSION => 'TopHat-Fusion', 39 | ARRIBA => ['ARRIBA', 'ARRIBA_hc'], ## scoring regular and the hc subset separately 40 | PIZZLY => 'PIZZLY', 41 | STARCHIP => 'STARCHIP', 42 | 'STAR_FUSION_v1.5_hg19_Apr042019' => 'STAR_FUSION_v1.5', 43 | STARCHIP_csm10 => 'STARChip_csm10', 44 | TRINITY_FUSION_C_hg19 => 'TrinityFusion-C', 45 | TRINITY_FUSION_UC_hg19 => 'TrinityFusion-UC', 46 | TRINITY_FUSION_D_hg19 => 'TrinityFusion-D', 47 | STARSEQR => 'STARSEQR', 48 | 'STARSEQR_STAR-SEQR' => 'STARSEQR' 49 | ); 50 | 51 | 52 | while () { 53 | chomp; 54 | my $filename = $_; 55 | 56 | unless (-f $filename) { 57 | print STDERR "warning, $filename is not a file. Skipping...\n"; 58 | next; 59 | } 60 | 61 | if ($filename =~ m|/samples/([^/]+)/([^/]+)/|) { 62 | 63 | my $sample_name = $1; 64 | my $prog = $2; 65 | 66 | if (%restrict_progs && ! exists $restrict_progs{$prog}) { 67 | print STDERR "make_file_listing_input_table:: - skipping $filename, not in restricted list.\n"; 68 | next; 69 | } 70 | 71 | my $proper_progname = $converter{$prog}; 72 | 73 | if ($proper_progname) { 74 | ## In case we have multiple ways of parsing the file and filtering data for different assessements. 75 | if (ref $proper_progname) { 76 | foreach my $progname_adj (@$proper_progname) { 77 | print join("\t", $sample_name, $progname_adj, $filename) . "\n"; 78 | } 79 | } 80 | else { 81 | print join("\t", $sample_name, $proper_progname, $filename) . "\n"; 82 | } 83 | } 84 | else { 85 | # keep original name 86 | print join("\t", $sample_name, $prog, $filename) . "\n"; 87 | } 88 | } 89 | else { 90 | print STDERR "WARNING: not parsing filename as a target: $filename\n"; 91 | } 92 | } 93 | 94 | 95 | exit(0); 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /util/make_supp_AUC_table.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use File::Basename; 6 | 7 | my $usage = "usage: $0 search_dir token \n\n"; 8 | 9 | my $search_dir = $ARGV[0] or die $usage; 10 | my $token = $ARGV[1] or die $usage; 11 | 12 | my $cmd = "find $search_dir -regex \".\*fusion_preds.txt.scored.PR.AUC\" "; 13 | 14 | my @files = `$cmd`; 15 | chomp @files; 16 | 17 | print join("\t", "read_set", "data_set", "progname", "AUC") . "\n"; 18 | 19 | foreach my $file (@files) { 20 | 21 | print STDERR "-processing $file\n"; 22 | open (my $fh, $file) or die "Error, cannot open file: $file"; 23 | 24 | my $data_set_name = basename(dirname($file)); 25 | 26 | 27 | while (<$fh>) { 28 | print join("\t", $token, $data_set_name, $_); 29 | } 30 | 31 | close $fh; 32 | 33 | } 34 | 35 | exit(0); 36 | 37 | -------------------------------------------------------------------------------- /util/make_supp_ROC_table.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use File::Basename; 6 | 7 | 8 | my $usage = "usage: $0 search_dir token \n\n"; 9 | 10 | my $search_dir = $ARGV[0] or die $usage; 11 | my $token = $ARGV[1] or die $usage; 12 | 13 | my $cmd = "find $search_dir -regex \".\*fusion_preds.txt.scored.ROC\" "; 14 | 15 | my @files = `$cmd`; 16 | 17 | my $printed_header_flag = 0; 18 | 19 | foreach my $file (@files) { 20 | 21 | print STDERR "-processing $file\n"; 22 | open (my $fh, $file) or die "Error, cannot open file: $file"; 23 | 24 | my $data_set_name = basename(dirname($file)); 25 | 26 | 27 | my $header = <$fh>; 28 | unless ($printed_header_flag) { 29 | print join("\t", "read_set", "data_set", $header); 30 | $printed_header_flag = 1; 31 | } 32 | while (<$fh>) { 33 | print join("\t", $token, $data_set_name, $_); 34 | } 35 | 36 | close $fh; 37 | 38 | } 39 | 40 | exit(0); 41 | 42 | -------------------------------------------------------------------------------- /util/paralog_clustering_util/README.md: -------------------------------------------------------------------------------- 1 | # Instructions for computing approximate paralog clusters and simpler blast match clusters for annotating suspicious fusion calls. 2 | 3 | ## blastn 4 | 5 | blastn -query ref_annot.cdna -db ref_annot.cdna -max_target_seqs 1000 -outfmt 6 -evalue 1e-10 -num_threads 20 -word_size 11 > blast_pairs.outfmt6 6 | 7 | 8 | ## group segments 9 | 10 | outfmt6_add_percent_match_length.group_segments.pl blast_pairs.outfmt6 ref_annot.cdna ref_annot.cdna > blast_pairs.outfmt6.grouped 11 | 12 | ## replace with gene symbols 13 | 14 | blast_outfmt6_replace_trans_id_w_gene_symbol.pl ref_annot.cdna blast_pairs.outfmt6.grouped > blast_pairs.outfmt6.grouped.genesym 15 | 16 | 17 | # sort by Evalue asc, per_id desc 18 | 19 | cat blast_pairs.outfmt6.grouped.genesym | sort -k4,4g -k3,3gr > blast_pairs.outfmt6.grouped.genesym.sorted 20 | 21 | 22 | # get top match for each 23 | 24 | get_top_blast_pairs.pl blast_pairs.outfmt6.grouped.genesym.sorted > blast_pairs.outfmt6.grouped.genesym.sorted.top 25 | 26 | # perform Markov clustering 27 | 28 | outfmt6_add_percent_match_length.group_segments.to_Markov_Clustering.pl --outfmt6_grouped blast_pairs.outfmt6.grouped.genesym.sorted.top --min_pct_len 1 --min_per_id 90 --inflation_factor 5 29 | 30 | ln -s dump.out.blast_pairs.outfmt6.grouped.genesym.sorted.top.minLEN_1_pct_len.minPID_90.abc.mci.I50 paralog_clusters.txt 31 | 32 | 33 | -------------------------------------------------------------------------------- /util/paralog_clustering_util/blast_outfmt6_replace_trans_id_w_gene_symbol.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "\n\n\tusage: $0 search_db.fasta blast_results.outfmt6\n\n"; 7 | 8 | my $search_db = $ARGV[0] or die $usage; 9 | my $blast_outfmt6 = $ARGV[1] or die $usage; 10 | 11 | 12 | main: { 13 | 14 | my %trans_to_gene_symbol = &parse_headers($search_db); 15 | 16 | 17 | open (my $fh, $blast_outfmt6) or die "Error, cannot open file $blast_outfmt6"; 18 | while (<$fh>) { 19 | if (/^\#/) { next; } 20 | chomp; 21 | my @x = split(/\t/); 22 | my $transA = $x[0]; 23 | my $geneA = $trans_to_gene_symbol{$transA} or die "Error, no gene for $transA"; 24 | my $transB = $x[1]; 25 | my $geneB = $trans_to_gene_symbol{$transB} or die "Error, no gene for $transB"; 26 | 27 | $x[0] = $geneA; 28 | $x[1] = $geneB; 29 | 30 | if ($geneA ne $geneB) { 31 | print join("\t", @x) . "\n"; 32 | } 33 | } 34 | close $fh; 35 | 36 | exit(0); 37 | 38 | 39 | } 40 | 41 | 42 | #### 43 | sub parse_headers { 44 | my ($search_db) = @_; 45 | 46 | my %trans_to_sym; 47 | 48 | open (my $fh, $search_db) or die "Error, cannot open file $search_db"; 49 | while (<$fh>) { 50 | chomp; 51 | if (/^>/) { 52 | s/>//; 53 | my ($trans_id, $gene_id, $gene_sym) = split(/\s+/); 54 | 55 | unless (defined $gene_sym) { 56 | $gene_sym = $gene_id; 57 | } 58 | 59 | $trans_to_sym{$trans_id} = $gene_sym; 60 | } 61 | } 62 | 63 | close $fh; 64 | 65 | 66 | return(%trans_to_sym); 67 | } 68 | -------------------------------------------------------------------------------- /util/paralog_clustering_util/get_top_blast_pairs.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | 7 | my $usage = "\n\n\tusage: $0 blastn.outfmt6.grouped.geneSym.sorted\n\n** NOTE, MUST BE PRE-SORTED like so:\n" 8 | . " cat blastn.outfmt6.grouped.geneSym | sort -k4,4g -k3,3gr > blastn.outfmt6.grouped.geneSym.sorted \n\n\n"; 9 | 10 | my $input_file = $ARGV[0] or die $usage; 11 | 12 | 13 | main: { 14 | 15 | 16 | # m blastn.outfmt6.grouped.geneSym | sort -k4,4g -k3,3gr > blastn.outfmt6.grouped.geneSym.sorted 17 | 18 | 19 | my %data; 20 | 21 | open (my $fh, $input_file) or die "Error, cannot open file: $input_file"; 22 | while (<$fh>) { 23 | my $line = $_; 24 | chomp; 25 | my @x = split(/\t/); 26 | my $geneA = $x[0]; 27 | my $geneB = $x[1]; 28 | 29 | my $token = join("$;", sort ($geneA, $geneB) ); 30 | 31 | unless ($data{$token}) { 32 | $data{$token} = 1; 33 | print $line; 34 | } 35 | 36 | } 37 | 38 | exit(0); 39 | } 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /util/paralog_clustering_util/outfmt6_add_percent_match_length.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use FindBin; 6 | use lib ("$FindBin::Bin/../../lib"); 7 | use Fasta_reader; 8 | use List::Util qw(min max); 9 | 10 | my $usage = "\n\n\tusage: $0 blast.outfmt6 query_fasta target_fasta\n"; 11 | 12 | my $blast_file = $ARGV[0] or die $usage; 13 | my $query_fasta = $ARGV[1] or die $usage; 14 | my $target_fasta = $ARGV[2] or die $usage; 15 | 16 | main: { 17 | 18 | my %query_seq_lens = &get_seq_lengths($query_fasta); 19 | 20 | my %target_seq_lens; 21 | if ($query_fasta eq $target_fasta) { 22 | %target_seq_lens = %query_seq_lens; 23 | } 24 | else { 25 | %target_seq_lens = &get_seq_lengths($target_fasta); 26 | } 27 | 28 | open (my $fh, $blast_file) or die "Error, cannot open file $blast_file"; 29 | while (<$fh>) { 30 | chomp; 31 | my @x = split(/\t/); 32 | my $query_acc = $x[0]; 33 | my $target_acc = $x[1]; 34 | 35 | my $query_len = $query_seq_lens{$query_acc} or die "Error, cannot find seq length for query: $query_acc"; 36 | my $target_len = $target_seq_lens{$target_acc} or die "Error, cannot find seq length for target: $target_acc"; 37 | 38 | my $query_hit_len = abs($x[7]-$x[6]); 39 | my $db_hit_len = abs($x[9]-$x[8]); 40 | 41 | my $pct_query_len = sprintf("%.2f", $query_hit_len / $query_len * 100); 42 | my $pct_target_len = sprintf("%.2f", $db_hit_len / $target_len * 100); 43 | 44 | push (@x, $query_len, $pct_query_len, $target_len, $pct_target_len, max($pct_query_len, $pct_target_len)); 45 | 46 | print join("\t", @x) . "\n"; 47 | } 48 | 49 | exit(0); 50 | } 51 | 52 | #### 53 | sub get_seq_lengths { 54 | my ($fasta_file) = @_; 55 | 56 | my %seq_lens; 57 | 58 | my $fasta_reader = new Fasta_reader($fasta_file); 59 | while (my $seq_obj = $fasta_reader->next()) { 60 | 61 | my $acc = $seq_obj->get_accession(); 62 | my $seq_len = length($seq_obj->get_sequence()); 63 | 64 | $seq_lens{$acc} = $seq_len; 65 | } 66 | 67 | return(%seq_lens); 68 | } 69 | 70 | -------------------------------------------------------------------------------- /util/terra_partition_to_sample_dirs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, os, re 4 | import subprocess 5 | 6 | def main(): 7 | usage = "\n\tusage: {} files.list.file progname_token\n\n".format(sys.argv[0]) 8 | 9 | if len(sys.argv) < 3: 10 | print(usage, file=sys.stderr) 11 | sys.exit(1) 12 | 13 | files_list_file = sys.argv[1] 14 | progname_token = sys.argv[2] 15 | 16 | if not os.path.exists("samples"): 17 | os.makedirs("samples") 18 | 19 | with open(files_list_file) as fh: 20 | for filename in fh: 21 | orig_filename = filename.rstrip() 22 | filename = os.path.basename(orig_filename) 23 | filename_pts = filename.split(".") 24 | 25 | samplename = filename_pts[0] 26 | if samplename in sample_conversions: 27 | samplename = sample_conversions[samplename] 28 | print("-renamed samplename to: {}".format(samplename)) 29 | 30 | dest_dir = "samples/{}/{}".format(samplename, progname_token) 31 | if not os.path.exists(dest_dir): 32 | os.makedirs(dest_dir) 33 | 34 | cmd = "cp {} {}".format(orig_filename, dest_dir) 35 | subprocess.check_call(cmd, shell=True) 36 | print(cmd, file=sys.stderr) 37 | 38 | sys.exit(0) 39 | 40 | 41 | sample_conversions = { 42 | 'G20476_DMS_454_2' : "G20476.DMS_454.2", 43 | "G20495_786-O_2" : "G20495.786-O.2", 44 | "G20498_KYSE-180_2" : "G20498.KYSE-180.2", 45 | "G20500_IGR-37_2" : "G20500.IGR-37.2", 46 | "G25214_MKN7_1" : "G25214.MKN7.1", 47 | "G25225_NCI-H522_1" : "G25225.NCI-H522.1", 48 | "G26175_A172_2" : "G26175.A172.2", 49 | "G26182_KMS-12-BM_2" : "G26182.KMS-12-BM.2", 50 | "G26199_LN-229_2" : "G26199.LN-229.2", 51 | "G26212_A-673_2" : "G26212.A-673.2", 52 | "G26216_KP-2_2" : "G26216.KP-2.2", 53 | "G26228_Hs_683_2" : "G26228.Hs_683.2", 54 | "G26236_NCI-H716_2" : "G26236.NCI-H716.2", 55 | "G26249_KMS-26_2" : "G26249.KMS-26.2", 56 | "G26253_KMS-34_2" : "G26253.KMS-34.2", 57 | "G26262_NCI-H889_2" : "G26262.NCI-H889.2", 58 | "G27214_PC-3_1" : "G27214.PC-3.1", 59 | "G27219_Panc_03_27_1" : "G27219.Panc_03.27.1", 60 | "G27233_A-498_1" : "G27233.A-498.1", 61 | "G27259_AN3_CA_1" : "G27259.AN3_CA.1", 62 | "G27280_TC-71_1" : "G27280.TC-71.1", 63 | "G27367_BFTC-909_1" : "G27367.BFTC-909.1", 64 | "G27376_COLO_792_1" : "G27376.COLO_792.1", 65 | "G27453_SNU-398_2" : "G27453.SNU-398.2", 66 | "G27463_SK-MEL-1_2" : "G27463.SK-MEL-1.2", 67 | "G27476_PK-59_2" : "G27476.PK-59.2", 68 | "G27479_SK-MEL-3_2" : "G27479.SK-MEL-3.2", 69 | "G27488_SNU-620_2" : "G27488.SNU-620.2", 70 | "G27516_SK-MEL-28_2" : "G27516.SK-MEL-28.2", 71 | "G27544_SF268_2" : "G27544.SF268.2", 72 | "G28011_KLE_1" : "G28011.KLE.1", 73 | "G28034_MDA-MB-361_1" : "G28034.MDA-MB-361.1", 74 | "G28045_KYSE-270_1" : "G28045.KYSE-270.1", 75 | "G28050_KMM-1_1" : "G28050.KMM-1.1", 76 | "G28054_KYSE-520_1" : "G28054.KYSE-520.1", 77 | "G28070_LN-18_1" : "G28070.LN-18.1", 78 | "G28072_MDA-MB-175-VII_1" : "G28072.MDA-MB-175-VII.1", 79 | "G28077_MG-63_1" : "G28077.MG-63.1", 80 | "G28081_JHH-7_1" : "G28081.JHH-7.1", 81 | "G28087_MDA-MB-436_1" : "G28087.MDA-MB-436.1", 82 | "G28535_OVTOKO_1" : "G28535.OVTOKO.1", 83 | "G28545_NUGC-2_1" : "G28545.NUGC-2.1", 84 | "G28575_OUMS-23_1" : "G28575.OUMS-23.1", 85 | "G28610_MHH-ES-1_1" : "G28610.MHH-ES-1.1", 86 | "G30594_UACC-893_1" : "G30594.UACC-893.1", 87 | "G30631_SU-DHL-10_1" : "G30631.SU-DHL-10.1", 88 | "G41663_OVISE_5" : "G41663.OVISE.5", 89 | "G41682_KYSE-510_5" : "G41682.KYSE-510.5", 90 | "G41706_RT4_5" : "G41706.RT4.5", 91 | "G41709_FaDu_5" : "G41709.FaDu.5", 92 | "G41710_SNU-16_5" : "G41710.SNU-16.5", 93 | "G41724_HGC-27_5" : "G41724.HGC-27.5" } 94 | 95 | 96 | if __name__=='__main__': 97 | main() 98 | 99 | 100 | --------------------------------------------------------------------------------