├── Docker
    ├── Dockerfile
    ├── build_docker.sh
    ├── make_simg.sh
    ├── push_docker.sh
    ├── run_docker.sh
    ├── run_eval.sh
    └── run_singularity.sh
├── PerlLib
    ├── DelimParser.pm
    ├── Fasta_reader.pm
    ├── Overlap_piler.pm
    ├── Pipeliner.pm
    └── Process_cmd.pm
├── README.md
├── alt_methods
    ├── STAR-Fusion
    │   └── uger
    │   │   ├── starF_v1.5_hg19.cmd
    │   │   └── starF_v1.5_hg19.conf
    ├── TrinityFusion
    │   ├── uger
    │   │   ├── TrinityFusion-C
    │   │   │   ├── TrinityFusion-C.hg19.cmd
    │   │   │   └── TrinityFusion-C.hg19.conf
    │   │   ├── TrinityFusion-D
    │   │   │   ├── TrinityFusion-D.hg19.cmd
    │   │   │   └── TrinityFusion-D.hg19.conf
    │   │   └── TrinityFusion-UC
    │   │   │   ├── TrinityFusion-UC.hg19.cmd
    │   │   │   └── TrinityFusion-UC.hg19.conf
    │   └── wdl
    │   │   ├── TrinityFusion.wdl
    │   │   ├── inputs.json
    │   │   ├── make_wdl_input_template.sh
    │   │   └── run.sh
    ├── arriba
    │   ├── README.md
    │   ├── arriba_wrapper.pl
    │   └── uger
    │   │   ├── arriba.uger.cmd
    │   │   └── arriba.uger.conf
    ├── prada
    │   ├── Dockerfile
    │   ├── VERSION.txt
    │   └── build_docker.sh
    ├── star-seqr
    │   ├── docker
    │   │   ├── make_simg.sh
    │   │   └── run_test.sh
    │   └── uger
    │   │   ├── star-seqr.uger.cmd
    │   │   └── star-seqr.uger.conf
    └── starchip
    │   ├── Docker
    │       ├── Dockerfile
    │       ├── Pipeliner.pm
    │       ├── VERSION.txt
    │       ├── build_docker.sh
    │       ├── make_simg.sh
    │       ├── push_docker.sh
    │       └── starchip_wrapper.pl
    │   ├── README.md
    │   ├── cleanMe.sh
    │   ├── run_test.sh
    │   ├── test_data
    │       ├── reads_1.fq.gz
    │       └── reads_2.fq.gz
    │   ├── test_outdir
    │       ├── Aligned.out.bam
    │       ├── Chimeric.out.junction
    │       ├── Chimeric.out.sam
    │       ├── Log.final.out
    │       ├── Log.out
    │       ├── Log.progress.out
    │       ├── ReadsPerGene.out.tab
    │       ├── SJ.out.tab
    │       ├── Unmapped.out.mate1
    │       ├── Unmapped.out.mate2
    │       ├── __starchip_chkpts
    │       │   ├── pipeliner.2350.cmds
    │       │   ├── pipeliner.2900.cmds
    │       │   ├── star_align.ok
    │       │   └── starchip.ok
    │       ├── starchip.summary
    │       └── starchip.summary.annotated
    │   └── uger
    │       ├── starchip.uger.cmd
    │       └── starchip.uger.conf
├── benchmarking
    ├── FusionProgParsers
    │   ├── ARRIBA_hc_parser.pm
    │   ├── ARRIBA_parser.pm
    │   ├── ChimPipe_parser.pm
    │   ├── ChimeraScan_parser.pm
    │   ├── DEFUSE_parser.pm
    │   ├── EricScript_parser.pm
    │   ├── FusionCatcher_KP_parser.pm
    │   ├── FusionCatcher_parser.pm
    │   ├── FusionHunter_parser.pm
    │   ├── FusionInspector_parser.pm
    │   ├── InFusion_parser.pm
    │   ├── JAFFA_parser.pm
    │   ├── MapSplice_parser.pm
    │   ├── NFuse_parser.pm
    │   ├── PIZZLY_parser.pm
    │   ├── PRADA_parser.pm
    │   ├── SOAPfuse_parser.pm
    │   ├── STARCHIP_parser.pm
    │   ├── STARFusion_parser.pm
    │   ├── STARSEQR_parser.pm
    │   ├── TopHatFusion_parser.pm
    │   └── TrinityFusion_parser.pm
    ├── Venn_analysis_strategy.pl
    ├── aggregate_peak_F1_stats.R
    ├── all_TP_FP_FN_to_ROC.pl
    ├── all_TP_FP_FN_to_ROC.vary_minF_minS.pl
    ├── all_TP_FP_FN_to_ROC.vary_minF_minS.plot.Rscript
    ├── calc_PR.py
    ├── collect_preds.pl
    ├── collected_preds_to_fusion_prog_support_listing.pl
    ├── compare_A_vs_B_scored_preds.pl
    ├── define_truth_n_unsure_set.pl
    ├── examine_FPs.pl
    ├── filter_collected_preds.pl
    ├── fusion_preds_sensitivity_vs_expr.avg_replicates.pl
    ├── fusion_preds_sensitivity_vs_expr.pl
    ├── fusion_preds_to_TP_FP_FN.pl
    ├── fusion_preds_to_matrix.pl
    ├── fusion_progs_agree_to_matrix.pl
    ├── fusion_sample_TPs_to_matrix.pl
    ├── map_gene_symbols_to_gencode.pl
    ├── notes
    ├── plotters
    │   ├── AUC_barplot.Rscript
    │   ├── AUC_boxplot.from_separate_auc_files.Rscript
    │   ├── AUC_boxplot.from_single_summary_AUC_file.Rscript
    │   ├── plotPRcurves.R
    │   ├── plot_AUC_50_vs_101_boxplots.Rscript
    │   ├── plot_F1_vs_min_frags.R
    │   ├── plot_ROC.Rscript
    │   ├── plot_TP_FP_vs_minSum_per_prog.R
    │   ├── plot_all_auc_barplots.Rscript
    │   ├── plot_before_vs_after_filt_TP_FP_compare.Rscript
    │   ├── plot_median_accuracy_ranking_vs_median_runtime.R
    │   ├── plot_peak_F1_scatter.R
    │   └── plot_upsetR.R
    └── run_prediction_accuracy_assessment_pipeline.pl
├── cancer_cell_lines
    ├── Edgren_subset
    │   ├── analyze_Edgren_subset.pl
    │   ├── cleanMe.sh
    │   ├── edgren.truthset
    │   ├── edgren.truthset.raw
    │   ├── eval_edgren_min_agree.consolidated.pl
    │   ├── eval_edgren_min_agree.pl
    │   ├── examine_validated_enrichment.R
    │   └── runMe.sh
    ├── SuppTable-cancer_cell_lines.csv
    ├── analyze_cancer_data.pl
    ├── cleanMe.sh
    ├── progs_select.txt
    └── runMe.sh
├── cleanMe.sh
├── progs_restrict.txt
├── resources
    ├── genes.aliases
    ├── genes.coords.gz
    ├── notes
    ├── notes.paralog_clustering.2020.txt
    ├── paralog_clusters.2020.I3.dat
    ├── paralog_clusters.2020.I5.dat
    ├── paralog_clusters.dat
    └── paralog_clusters.dat.2019
├── runMe.sh
├── runtime_analysis
    ├── STAR_F_multicore
    │   └── runtimes.txt
    ├── all_progs_cancer
    │   ├── __origfmt
    │   │   └── runtimes.orignames.dat
    │   └── runtimes.txt
    ├── cleanMe.sh
    └── runMe.sh
├── simulated_data
    ├── SuppTable-sim_reads.csv
    ├── analyze_simulated_data.pl
    ├── cleanMe.sh
    ├── runMe.sh
    ├── sim_101
    │   ├── cleanMe.sh
    │   ├── runMe.sh
    │   ├── sim_101.fusion_TPM_values.dat
    │   └── sim_101.truth_set.dat
    └── sim_50
    │   ├── cleanMe.sh
    │   ├── runMe.sh
    │   ├── sim_50.fusion_TPM_values.dat
    │   └── sim_50.truth_set.dat
└── util
    ├── Terra
        ├── organize_FI_results_for_benchmarking.py
        └── organize_StarF_results_for_benchmarking.py
    ├── __get_figs_for_paper.pl
    ├── basic_accuracy_analysis.pl
    ├── boxplot_runtimes.Rscript
    ├── capture_PR_AUC_for_plotting.pl
    ├── make_file_listing_input_table.pl
    ├── make_supp_AUC_table.pl
    ├── make_supp_ROC_table.pl
    ├── paralog_clustering_util
        ├── README.md
        ├── blast_outfmt6_replace_trans_id_w_gene_symbol.pl
        ├── get_top_blast_pairs.pl
        ├── outfmt6_add_percent_match_length.group_segments.pl
        ├── outfmt6_add_percent_match_length.group_segments.to_Markov_Clustering.pl
        └── outfmt6_add_percent_match_length.pl
    └── terra_partition_to_sample_dirs.py


/Docker/Dockerfile:
--------------------------------------------------------------------------------
  1 | FROM ubuntu:18.04
  2 | LABEL maintainer="bhaas@broadinstitute.org"
  3 | ## DEBIAN_FRONTEND=noninteractive keeps any package configuration prompt from stalling the build.
  4 | RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y gcc g++ perl python automake make \
  5 |  wget git curl libdb-dev \
  6 |  zlib1g-dev bzip2 libncurses5-dev \
  7 |  texlive-latex-base \
  8 |  default-jre \
  9 |  python-pip python-dev \
 10 |  gfortran \
 11 |  build-essential libghc-zlib-dev libncurses-dev libbz2-dev liblzma-dev libpcre3-dev libxml2-dev \
 12 |  libblas-dev gfortran git unzip ftp libzmq3-dev nano ftp fort77 libreadline-dev \
 13 |  libcurl4-openssl-dev libx11-dev libxt-dev \
 14 |  x11-common libcairo2-dev libpng-dev libreadline-dev libjpeg-dev pkg-config libtbb-dev \
 15 |  && apt-get clean
 16 | 
 17 | RUN curl -L https://cpanmin.us | perl - App::cpanminus
 18 | 
 19 | RUN cpanm install DB_File
 20 | RUN cpanm install URI::Escape
 21 | RUN cpanm install JSON::XS
 22 | 
 23 | 
 24 | ## set up tool config and deployment area:
 25 | 
 26 | ENV SRC /usr/local/src
 27 | ENV BIN /usr/local/bin
 28 | 
 29 | 
 30 | #####
 31 | # Install R
 32 | 
 33 | WORKDIR $SRC
 34 | 
 35 | ENV R_VERSION=R-3.5.2
 36 | ## -f makes curl fail on an HTTP error instead of saving the error page; -L follows redirects.
 37 | RUN curl -fL https://cran.r-project.org/src/base/R-3/$R_VERSION.tar.gz -o $R_VERSION.tar.gz && \
 38 |  tar xvf $R_VERSION.tar.gz && \
 39 |  cd $R_VERSION && \
 40 |  ./configure && make && make install
 41 | 
 42 | 
 43 | 
 44 | ## cpanminus and DB_File were already installed near the top of this file,
 45 | ## so only the module that is still missing gets installed here.
 46 | 
 47 | RUN cpanm install Set::IntervalTree
 48 | 
 49 | 
 50 | 
 51 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("ggplot2", dep = TRUE)'
 52 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("Biobase", dep = TRUE)'
 53 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("qvalue", dep = TRUE)'
 54 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("fastcluster", dep = TRUE)'
 55 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("tidyr", dep = TRUE)'
 56 | RUN Rscript -e 'source("http://bioconductor.org/biocLite.R");library(BiocInstaller); biocLite("devtools", dep = TRUE)'
 57 | 
 58 | ## get my hacked version of upsetR
 59 | WORKDIR $SRC
 60 | RUN git clone https://github.com/brianjohnhaas/UpSetR.git && \
 61 |    cd UpSetR && \
 62 |    git checkout d72b0b5 && \
 63 |    Rscript -e 'install.packages("./", repos=NULL, type="source", INSTALL_opts = "--with-keep.source")'
 64 | 
 65 | 
 66 | #########
 67 | # Install FusionAnnotator
 68 | 
 69 | WORKDIR $SRC
 70 | RUN git clone https://github.com/FusionAnnotator/FusionAnnotator.git && \
 71 |  cd FusionAnnotator && \
 72 |  git checkout 0dc2edc25f7881fd552236c5e12b302cef6eea7a
 73 | 
 74 | ENV FUSION_ANNOTATOR ${SRC}/FusionAnnotator
 75 | 
 76 | 
 77 | ########
 78 | # Install Trinity (just for plotting utilities)
 79 | WORKDIR $SRC
 80 | RUN git clone https://github.com/trinityrnaseq/trinityrnaseq.git && \
 81 |  cd trinityrnaseq && \
 82 |  git checkout 514756d12c614046a4ad50fd63b34e59cdec4c9a
 83 | 
 84 | ENV TRINITY_HOME ${SRC}/trinityrnaseq
 85 | 
 86 | ##############
 87 | # Install fusion benchmarking
 88 | WORKDIR $SRC
 89 | RUN git clone https://github.com/fusiontranscripts/FusionBenchmarking.git && \
 90 |  cd FusionBenchmarking && \
 91 |  git checkout fa3d7bc0ef3757a3c5c65c2f80e216128cfc9f8e
 92 | 
 93 | 
 94 | ENV LC_ALL=C
 95 | 
 96 | ## mini ctat genome lib used by fusion annotator:
 97 | COPY ctat_genome_lib_dir $SRC/ctat_genome_lib_dir/
 98 | 
 99 | ENV CTAT_GENOME_LIB $SRC/ctat_genome_lib_dir
100 | 
101 | COPY run_eval.sh /
102 | 
103 | CMD ["/run_eval.sh"]
104 | 
105 | 


--------------------------------------------------------------------------------
/Docker/build_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Build the benchmarking image from the Dockerfile in the current directory.
4 | IMAGE_TAG="trinityctat/fusionbenchmarking"
5 | docker build -t "${IMAGE_TAG}" .


--------------------------------------------------------------------------------
/Docker/make_simg.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Abort on the first failing command (-e) and echo each command as it runs (-x).
4 | set -e -x
5 | 
6 | # Build a local Singularity image file from the published Docker image.
7 | singularity build trinityctat.fusionbenchmarking.simg docker://trinityctat/fusionbenchmarking


--------------------------------------------------------------------------------
/Docker/push_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Push the locally built benchmarking image to its registry repository.
4 | IMAGE_TAG="trinityctat/fusionbenchmarking"
5 | docker push "${IMAGE_TAG}"
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/Docker/run_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Run the benchmarking image with the current directory mounted at /data.
4 | # "$@" (not $*) keeps arguments containing spaces intact; "$(pwd)" tolerates spaces in the path.
5 | docker run --rm -it -v "$(pwd)":/data trinityctat/fusionbenchmarking "$@"
6 | 


--------------------------------------------------------------------------------
/Docker/run_eval.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Container entry point: copy the benchmarking code into /data and run both analyses there.
 3 | set -ev
 4 | 
 5 | cd /data
 6 | cp -r /usr/local/src/FusionBenchmarking FusionBenchmarkingWorkspace
 7 | PROGS_RESTRICT="$(pwd)/FusionBenchmarkingWorkspace/progs_restrict.txt"
 8 | 
 9 | # 'cd' gets its own line: under 'set -e' a failing 'cd' on the left of '&&' would not stop the script.
10 | cd FusionBenchmarkingWorkspace/cancer_cell_lines
11 | ./runMe.sh "${PROGS_RESTRICT}"
12 | 
13 | cd ../simulated_data
14 | ./runMe.sh "${PROGS_RESTRICT}"
15 | echo done
16 | 
17 | 
18 | 


--------------------------------------------------------------------------------
/Docker/run_singularity.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Run /run_eval.sh inside the Singularity image with the current directory bound to /data.
3 | set -ex
4 | # "$(pwd)" is quoted so a working directory containing spaces is passed as a single bind argument.
5 | singularity exec -e -B "$(pwd)":/data trinityctat.fusionbenchmarking.simg /run_eval.sh
6 | 


--------------------------------------------------------------------------------
/PerlLib/Fasta_reader.pm:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/perl -w
  2 | 
  3 | # lightweight fasta reader capabilities:
  4 | package Fasta_reader;
  5 | 
  6 | use strict;
  7 | use warnings;
  8 | use Carp;
  9 | use Scalar::Util qw(blessed);
 10 | 
 11 | #### new() builds a reader. $fastaFile is a filename (a '.gz' name is read through
 12 | #### 'gunzip -c'), an IO::Handle-derived object, or a GLOB ref to an open handle.
 13 | sub new {
 14 |     my ($packagename, $fastaFile) = @_;
 15 | 
 16 |     my $self = { fastaFile => undef,
 17 |                  fileHandle => undef };
 18 | 
 19 |     bless ($self, $packagename);
 20 | 
 21 |     ## create filehandle
 22 |     my $filehandle = undef;
 23 | 
 24 |     if (ref($fastaFile) eq 'GLOB' || (blessed($fastaFile) && $fastaFile->isa('IO::Handle'))) {
 25 |         ## already-open handle supplied by the caller: read from it directly.
 26 |         $filehandle = $fastaFile;
 27 |     }
 28 |     elsif ($fastaFile =~ /\.gz$/) {
 29 |         ## list-form pipe open, so the filename is never interpreted by a shell.
 30 |         open ($filehandle, '-|', 'gunzip', '-c', $fastaFile) or confess "Error, cannot open file $fastaFile using 'gunzip -c'";
 31 |         $self->{fastaFile} = $fastaFile;
 32 |     }
 33 |     else {
 34 |         open ($filehandle, '<', $fastaFile) or die "Error: Couldn't open $fastaFile\n";
 35 |         $self->{fastaFile} = $fastaFile;
 36 |     }
 37 | 
 38 |     $self->{fileHandle} = $filehandle;
 39 | 
 40 |     return ($self);
 41 | }
 42 | 
 43 | 
 44 | #### next() fetches next Sequence object (undef once the input is exhausted).
 45 | sub next {
 46 |     my $self = shift;
 47 |     my $filehandle = $self->{fileHandle};
 48 | 
 49 |     ## records are delimited by "\n>"; 'local' restores $/ on every exit path.
 50 |     local $/ = "\n>";
 51 |     my $next_text_input = <$filehandle>;
 52 | 
 53 |     if (defined($next_text_input) && $next_text_input !~ /\w/) {
 54 |         ## must have been some whitespace at start of fasta file, before first entry.
 55 |         ## try again:
 56 |         $next_text_input = <$filehandle>;
 57 |     }
 58 | 
 59 |     my $seqobj = undef;
 60 | 
 61 |     if ($next_text_input) {
 62 |         $next_text_input =~ s/^>|>$//g; # drop the leading '>' and the trailing '>' left by the separator
 63 |         $next_text_input =~ tr/\t\n\000-\037\177-\377/\t\n/d; #remove cntrl chars
 64 |         my ($header, @seqlines) = split (/\n/, $next_text_input);
 65 |         my $sequence = join ("", @seqlines);
 66 |         $sequence =~ s/\s//g;
 67 | 
 68 |         $seqobj = Sequence->new($header, $sequence);
 69 |     }
 70 | 
 71 |     return ($seqobj);
 72 | }
 73 | 
 74 | 
 75 | 
 76 | #### finish() closes the open filehandle to the query database.
 77 | sub finish {
 78 |     my $self = shift;
 79 |     my $filehandle = $self->{fileHandle};
 80 |     close $filehandle if $filehandle; # tolerate finish() being called more than once
 81 |     $self->{fileHandle} = undef;
 82 | }
 83 | 
 84 | #### retrieve_all_seqs_hash() reads all remaining records; returns (accession => sequence) pairs.
 85 | sub retrieve_all_seqs_hash {
 86 |     my $self = shift;
 87 | 
 88 |     my %acc_to_seq;
 89 | 
 90 |     while (my $seq_obj = $self->next()) {
 91 |         my $acc = $seq_obj->get_accession();
 92 |         my $sequence = $seq_obj->get_sequence();
 93 | 
 94 |         $acc_to_seq{$acc} = $sequence;
 95 |     }
 96 | 
 97 |     return(%acc_to_seq);
 98 | }
 99 | 
100 | 
101 | 
102 | ##############################################
103 | package Sequence;
104 | use strict;
105 | 
106 | #### new($header, $sequence): the accession is the first whitespace-delimited token of $header.
107 | sub new {
108 |     my ($packagename, $header, $sequence) = @_;
109 |     my ($acc) = split (/\s+/, $header, 2);
110 | 
111 |     my $self = { accession => $acc,
112 |                  header => $header,
113 |                  sequence => $sequence,
114 |                  filename => undef };
115 |     bless ($self, $packagename);
116 |     return ($self);
117 | }
118 | 
119 | 
120 | ####
121 | sub get_accession {
122 |     my $self = shift;
123 |     return ($self->{accession});
124 | }
125 | 
126 | ####
127 | sub get_header {
128 |     my $self = shift;
129 |     return ($self->{header});
130 | }
131 | 
132 | ####
133 | sub get_sequence {
134 |     my $self = shift;
135 |     return ($self->{sequence});
136 | }
137 | 
138 | #### get_FASTA_format(fasta_line_len => N): returns ">header\nsequence\n" with the sequence
139 | #### wrapped every N non-space chars (default 60); N <= 0 disables wrapping.
140 | sub get_FASTA_format {
141 |     my $self = shift;
142 |     my %settings = @_;
143 |     ## '|| 60' would turn an explicit 0 into 60; only fall back when no value was given.
144 |     my $fasta_line_len = $settings{fasta_line_len};
145 |     $fasta_line_len = 60 unless (defined($fasta_line_len) && $fasta_line_len ne '');
146 |     my $header = $self->get_header();
147 |     my $sequence = $self->get_sequence();
148 |     if ($fasta_line_len > 0) {
149 |         $sequence =~ s/(\S{$fasta_line_len})/$1\n/g;
150 |         chomp $sequence;
151 |     }
152 |     my $fasta_entry = ">$header\n$sequence\n";
153 |     return ($fasta_entry);
154 | }
155 | 
156 | #### write_fasta_file($filename): writes this entry to $filename, or to "<accession>.fasta"
157 | #### (non-word chars replaced by '_') when no name is given; returns the path written.
158 | sub write_fasta_file {
159 |     my $self = shift;
160 |     my $filename = shift;
161 | 
162 |     my $fasta_entry = $self->get_FASTA_format();
163 | 
164 |     my $tempfile = $filename;
165 |     unless ($tempfile) {
166 |         my $acc = $self->{accession};
167 |         $acc =~ s/\W/_/g;
168 |         $tempfile = "$acc.fasta";
169 |     }
170 | 
171 |     ## lexical handle + 3-arg open: no global TMP handle, and the name is never parsed for a mode.
172 |     open (my $ofh, '>', $tempfile) or die "ERROR! Couldn't write a temporary file in current directory.\n";
173 |     print $ofh $fasta_entry;
174 |     close $ofh or die "ERROR! Couldn't finish writing $tempfile: $!\n";
175 |     return ($tempfile);
176 | }
177 | 
178 | 
179 | 
180 | #### get_core_read_name(): accession with a trailing "/1" or "/2" removed.
181 | sub get_core_read_name {
182 |     my $self = shift;
183 | 
184 |     my $acc = $self->get_accession();
185 |     $acc =~ s|/[12]$||;
186 |     return($acc);
187 | }
188 | 
189 | 
190 | 1; #EOM
191 | 
192 | 
193 | 


--------------------------------------------------------------------------------
/PerlLib/Process_cmd.pm:
--------------------------------------------------------------------------------
 1 | package Process_cmd;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | use Cwd;
 7 | 
 8 | require Exporter;
 9 | our @ISA = qw(Exporter);
10 | our @EXPORT = qw(process_cmd ensure_full_path);
11 | 
12 | ## process_cmd($cmd): echo $cmd to STDERR, run it via system(), confess on any failure.
13 | sub process_cmd {
14 |     my ($cmd) = @_;
15 | 
16 |     print STDERR "CMD: $cmd\n";
17 | 
18 |     my $ret = system($cmd);
19 |     if ($ret) {
20 |         ## system() returns a wait status: -1 = could not start, low 7 bits = signal, high byte = exit code.
21 |         my $why = ($ret == -1) ? "failed to start: $!" : ($ret & 127) ? "killed by signal " . ($ret & 127) : "exit code " . ($ret >> 8);
22 |         confess "Error, cmd:\n$cmd\n died with ret ($ret), $why";
23 |     }
24 |     return;
25 | }
26 | 
27 | ## ensure_full_path($path): return $path unchanged if it starts with '/', else prefix it with cwd().
28 | sub ensure_full_path {
29 |     my ($path) = @_;
30 | 
31 |     unless ($path =~ m|^/|) {
32 |         $path = cwd() . "/$path";
33 |     }
34 |     return($path);
35 | }
36 | 
37 | 
38 | 
39 | 1; #EOM
40 | 
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fusion Transcript Benchmarking
2 | 
3 | See [wiki](https://github.com/fusiontranscripts/FusionBenchmarking/wiki) for documentation.
4 | 
5 | 
6 | 
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/alt_methods/STAR-Fusion/uger/starF_v1.5_hg19.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf starF_v1.5_hg19.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7 --name StarF_v1.5_hg19
2 | 


--------------------------------------------------------------------------------
/alt_methods/STAR-Fusion/uger/starF_v1.5_hg19.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=STAR_FUSION_v1.5_hg19_Apr042019
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/STAR-Fusion/SINGULARITY/star-fusion.v1.5.0.simg  /usr/local/src/STAR-Fusion/STAR-Fusion  --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} -O {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir
11 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/uger/TrinityFusion-C/TrinityFusion-C.hg19.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 20 --queue broad --run_conf TrinityFusion-C.hg19.conf --h_rt 72:00:00 --project_name regevlab --os RedHat7 --name TrinF_C_hg19
2 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/uger/TrinityFusion-C/TrinityFusion-C.hg19.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=TRINITY_FUSION_C_hg19
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/SINGULARITY/ctat-trinityfusion/TrinityFusion.v0.2.0.simg  /usr/local/src/TrinityFusion/TrinityFusion  --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir --chimeric_junctions {__LOCAL_ANALYSIS_DIR__}/STAR_FUSION_v1.5_hg19_Apr042019/Chimeric.out.junction
11 | 
12 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/uger/TrinityFusion-D/TrinityFusion-D.hg19.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 20 --queue broad --run_conf TrinityFusion-D.hg19.conf --h_rt 72:00:00 --project_name regevlab --os RedHat7 --name TrinF_D_hg19
2 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/uger/TrinityFusion-D/TrinityFusion-D.hg19.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=TRINITY_FUSION_D_hg19
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/SINGULARITY/ctat-trinityfusion/TrinityFusion.v0.2.0.simg  /usr/local/src/TrinityFusion/TrinityFusion  --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir  --max_memory 20G
11 | 
12 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/uger/TrinityFusion-UC/TrinityFusion-UC.hg19.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 20 --queue broad --run_conf TrinityFusion-UC.hg19.conf --h_rt 72:00:00 --project_name regevlab --os RedHat7 --name TrinF_UC_hg19
2 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/uger/TrinityFusion-UC/TrinityFusion-UC.hg19.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=TRINITY_FUSION_UC_hg19
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/SINGULARITY/ctat-trinityfusion/TrinityFusion.v0.2.0.simg  /usr/local/src/TrinityFusion/TrinityFusion  --left_fq {__LEFT_FQ__} --right_fq {__RIGHT_FQ__} --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --CPU 1 --genome_lib_dir /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir --chimeric_junctions {__LOCAL_ANALYSIS_DIR__}/STAR_FUSION_v1.5_hg19_Apr042019/Chimeric.out.junction --aligned_bam {__LOCAL_ANALYSIS_DIR__}/STAR_FUSION_v1.5_hg19_Apr042019/Aligned.out.bam
11 | 
12 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/wdl/TrinityFusion.wdl:
--------------------------------------------------------------------------------
  1 | 
  2 | task TRINITY_FUSION_UC_TASK {
  3 | 	# Runs TrinityFusion with the chimeric junctions file and aligned BAM; outputs the gzipped TrinityFusion-UC predictions.
  4 | 	String sample_name
  5 | 	File left_fq
  6 |     File right_fq
  7 | 	File genome_lib_tar
  8 | 	File chimeric_junctions_file
  9 | 	File aligned_bam
 10 | 	# NOTE(review): genome_lib_tar is expected to unpack to ./ctat_genome_lib_build_dir, the path given to --genome_lib_dir.
 11 | 	command <<<
 12 | 
 13 |     set -e
 14 | 
 15 |     # untar the genome lib
 16 |     tar xvf ${genome_lib_tar}
 17 | 	rm ${genome_lib_tar}
 18 | 	
 19 | 	# TrinityFusion
 20 | 
 21 |     /usr/local/src/TrinityFusion/TrinityFusion \
 22 |          --left_fq ${left_fq} \
 23 |          --right_fq ${right_fq} \
 24 |          --chimeric_junctions ${chimeric_junctions_file} \
 25 |          --aligned_bam ${aligned_bam} \
 26 |          --CPU 10 \
 27 |          --genome_lib_dir ctat_genome_lib_build_dir \
 28 |          --output_dir ${sample_name}
 29 |      
 30 |      
 31 |     cp ${sample_name}/TrinityFusion-UC.fusion_predictions.tsv ${sample_name}.TrinityFusion-UC.fusion_predictions.tsv
 32 | 
 33 |     gzip ${sample_name}.TrinityFusion-UC.fusion_predictions.tsv
 34 |     
 35 |     >>>
 36 |     
 37 |     output {
 38 |       File TrinityFusion_UC="${sample_name}.TrinityFusion-UC.fusion_predictions.tsv.gz"
 39 |     }
 40 |     
 41 |     # The cpu value requested here (10) matches the --CPU 10 passed in the command.
 42 |     runtime {
 43 |             docker: "trinityctat/trinityfusion:0.2.0"
 44 |             disks: "local-disk 500 SSD"
 45 |             memory: "30G"
 46 |             cpu: "10"
 47 |             preemptible: 0
 48 |             maxRetries: 0
 49 |     }
 50 | }
 51 | 
 52 | 
 53 | task TRINITY_FUSION_D_TASK {
 54 | 	# Runs TrinityFusion from the reads alone (no junctions/BAM inputs); outputs the gzipped TrinityFusion-D predictions.
 55 | 	String sample_name
 56 | 	File left_fq
 57 |     File right_fq
 58 | 	File genome_lib_tar
 59 | 
 60 | 	# NOTE(review): genome_lib_tar is expected to unpack to ./ctat_genome_lib_build_dir, the path given to --genome_lib_dir.
 61 | 	command <<<
 62 | 
 63 |     set -e
 64 | 
 65 |     # untar the genome lib
 66 |     tar xvf ${genome_lib_tar}
 67 | 	rm ${genome_lib_tar}
 68 | 	
 69 | 	# TrinityFusion
 70 | 
 71 |     /usr/local/src/TrinityFusion/TrinityFusion \
 72 |          --left_fq ${left_fq} \
 73 |          --right_fq ${right_fq} \
 74 |          --CPU 10 \
 75 |          --genome_lib_dir ctat_genome_lib_build_dir \
 76 |          --output_dir ${sample_name}
 77 |      
 78 |      
 79 |     cp ${sample_name}/TrinityFusion-D.fusion_predictions.tsv ${sample_name}.TrinityFusion-D.fusion_predictions.tsv
 80 | 
 81 |     gzip ${sample_name}.TrinityFusion-D.fusion_predictions.tsv
 82 |     
 83 |     >>>
 84 |     
 85 |     output {
 86 |       File TrinityFusion_D="${sample_name}.TrinityFusion-D.fusion_predictions.tsv.gz"
 87 |     }
 88 |     
 89 |     # The cpu value requested here (10) matches the --CPU 10 passed in the command.
 90 |     runtime {
 91 |             docker: "trinityctat/trinityfusion:0.2.0"
 92 |             disks: "local-disk 500 SSD"
 93 |             memory: "30G"
 94 |             cpu: "10"
 95 |             preemptible: 0
 96 |             maxRetries: 0
 97 |     }
 98 | }
 99 | 
100 | 
101 | 
102 | workflow trinity_fusion_wf {
103 |     # Mode switches. A mode runs only when its flag is supplied AND true.
104 |     # NOTE(review): TrinityFusion_C is accepted but no task in this file implements it.
105 |     Boolean? TrinityFusion_C
106 |     Boolean? TrinityFusion_UC
107 |     Boolean? TrinityFusion_D
108 | 
109 |     String sample_name
110 |     File left_fq
111 |     File right_fq
112 |     File genome_lib_tar
113 | 
114 |     File? chimeric_junctions_file
115 |     File? star_aligned_bam
116 | 
117 |     # select_first falls back to false for an unset flag; defined() alone would also run a mode set to false.
118 |     if (select_first([TrinityFusion_UC, false])) {
119 |         call TRINITY_FUSION_UC_TASK {
120 |           input:
121 |             sample_name=sample_name,
122 |             left_fq=left_fq,
123 |             right_fq=right_fq,
124 |             genome_lib_tar=genome_lib_tar,
125 |             chimeric_junctions_file=chimeric_junctions_file,
126 |             aligned_bam=star_aligned_bam
127 |         }
128 |     }
129 | 
130 |     if (select_first([TrinityFusion_D, false])) {
131 |         call TRINITY_FUSION_D_TASK {
132 |           input:
133 |             sample_name=sample_name,
134 |             left_fq=left_fq,
135 |             right_fq=right_fq,
136 |             genome_lib_tar=genome_lib_tar
137 |         }
138 |     }
139 | 
140 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/wdl/inputs.json:
--------------------------------------------------------------------------------
1 | {
2 |   "trinity_fusion_wf.genome_lib_tar": "inputs/ctat_testkit_genome_lib_dir.star1.5.tar",
3 |   "trinity_fusion_wf.left_fq": "inputs/rnaseq_1.fastq.gz",
4 |   "trinity_fusion_wf.right_fq": "inputs/rnaseq_2.fastq.gz",
5 |   "trinity_fusion_wf.TrinityFusion_D": "true",
6 |   "trinity_fusion_wf.sample_name": "mysample"
7 | }
8 | 


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/wdl/make_wdl_input_template.sh:
--------------------------------------------------------------------------------
1 | java -jar ~/utilities/wdltool-0.12.jar  inputs TrinityFusion.wdl 
2 | # Runs wdltool's 'inputs' subcommand on TrinityFusion.wdl; presumably prints an inputs template (per this script's name).


--------------------------------------------------------------------------------
/alt_methods/TrinityFusion/wdl/run.sh:
--------------------------------------------------------------------------------
1 | java -jar ~/utilities/cromwell-39.jar run TrinityFusion.wdl --inputs inputs.json
2 | # Runs TrinityFusion.wdl through Cromwell's 'run' subcommand with inputs.json from the current directory.


--------------------------------------------------------------------------------
/alt_methods/arriba/arriba_wrapper.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
 7 | use FindBin;
 8 | use lib ("$FindBin::Bin/../../PerlLib");
 9 | use Process_cmd;
10 | # Usage text; printed through die() when -h is given or a required option is missing.
11 | # (Fixed the "dirctory" typo in the --mount description.)
12 | my $usage = <<__EOUSAGE__;
13 | 
14 | #########################################################################
15 | #
16 | # Required:
17 | #
18 | # --left_reads <string>               left reads file (reads1.fastq.gz)
19 | # --right_reads <string>              right reads file (reads2.fastq.gz)
20 | # --arriba_singularity_img <string>   arriba singularity img
21 | # --arriba_references_dir <string>    arriba references directory
22 | # --output_dir <string>               output directory
23 | #
24 | # Optional:
25 | #
26 | # --mount <string>                    directory to mount
27 | #
28 | ########################################################################
29 | 
30 | __EOUSAGE__
31 | 
32 | 
33 |     ;
34 | 
35 | 
36 |     
37 | # Option storage: file-scoped lexicals that the main block further down reads.
38 | my ($help_flag, $left_reads, $right_reads);
39 | my ($arriba_singularity_img, $arriba_references_dir, $output_dir);
40 | my $mount = "";   # optional; stays empty (adds nothing to the command) when --mount is absent
41 | 
42 | &GetOptions (
43 |     'h' => \$help_flag,
44 | 
45 |     ## all required
46 |     'left_reads=s' => \$left_reads,
47 |     'right_reads=s' => \$right_reads,
48 |     'arriba_singularity_img=s' => \$arriba_singularity_img,
49 |     'arriba_references_dir=s' => \$arriba_references_dir,
50 |     'output_dir=s' => \$output_dir,
51 | 
52 |     # optional
53 |     'mount=s' => \$mount,
54 | );
55 | 
56 | # -h and any missing (false-valued) required option both end in the usage text via die().
57 | die $usage if ($help_flag);
58 | 
59 | my @required_values = ($left_reads, $right_reads, $arriba_singularity_img,
60 |                        $arriba_references_dir, $output_dir);
61 | die $usage if (grep { ! $_ } @required_values);
62 | 
63 | # Turn the optional mount path into a ready-to-splice " -B <path> " fragment.
64 | if ($mount) {
65 |     $mount = " -B " . &ensure_full_path($mount) . " ";
66 | }
67 | 
68 | # Pass each remaining path through ensure_full_path (presumably exported by Process_cmd), same order as before.
69 | foreach my $path_ref (\$left_reads, \$right_reads, \$arriba_singularity_img,
70 |                       \$arriba_references_dir, \$output_dir) {
71 |     $$path_ref = &ensure_full_path($$path_ref);
72 | }
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 
79 | main: {
80 | 
81 |     # Make sure the output directory exists before it is bind-mounted into the container.
82 |     &process_cmd("mkdir -p $output_dir") if (! -d $output_dir);
83 | 
84 |     # Bind the output dir, references and both read files at fixed container paths, then run arriba.sh.
85 |     my @cmd_parts = ("singularity exec -e $mount",
86 |                      " -B $output_dir:/output ",
87 |                      " -B $arriba_references_dir:/references:ro ",
88 |                      " -B $left_reads:/read1.fastq.gz:ro ",
89 |                      " -B $right_reads:/read2.fastq.gz:ro ",
90 |                      " $arriba_singularity_img arriba.sh ");
91 |     
92 |     &process_cmd(join("", @cmd_parts));
93 |     
94 | }
95 | 
96 | 


--------------------------------------------------------------------------------
/alt_methods/arriba/uger/arriba.uger.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf arriba.uger.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7  --name arriba
2 | 


--------------------------------------------------------------------------------
/alt_methods/arriba/uger/arriba.uger.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=ARRIBA
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=/home/unix/bhaas/GITHUB/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/arriba/arriba_wrapper.pl --left_reads {__LEFT_FQ__} --right_reads {__RIGHT_FQ__} --arriba_singularity_img /seq/RNASEQ/TOOLS/ARRIBA/SINGULARITY/arriba-1.1.0.simg --arriba_references_dir /seq/RNASEQ/TOOLS/ARRIBA/references --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --mount /seq/RNASEQ
11 | 


--------------------------------------------------------------------------------
/alt_methods/prada/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:16.04
 2 | MAINTAINER bhaas@broadinstitute.org
 3 | # Base build environment: compilers, perl/python, a JRE and assorted -dev libraries in a single layer.
 4 | RUN apt-get update && apt-get install -y gcc g++ perl python automake make \
 5 |                                        wget git curl libdb-dev \
 6 |                                        zlib1g-dev bzip2 libncurses5-dev \
 7 |                                        texlive-latex-base \
 8 |                                        default-jre \
 9 |                                        python-pip python-dev \
10 |                                        gfortran \
11 |                                        build-essential libghc-zlib-dev libncurses-dev libbz2-dev liblzma-dev libpcre3-dev libxml2-dev \
12 |                                        libblas-dev gfortran git unzip ftp libzmq3-dev nano ftp fort77 libreadline-dev \
13 |                                        libcurl4-openssl-dev libx11-dev libxt-dev \
14 |                                        x11-common libcairo2-dev libpng12-dev libreadline6-dev libjpeg8-dev pkg-config libtbb-dev \
15 |                    && apt-get clean
16 | # NOTE(review): sed uses basic regular expressions here (no -E/-r), where a bare "(" or ")" is a
17 | # literal character; the pattern below therefore probably never matches sources.list -- confirm intent.
18 | RUN sed -i -e 's/:\/\/(archive.ubuntu.com\|security.ubuntu.com)/old-releases.ubuntu.com/g' /etc/apt/sources.list
19 | 
20 | 
21 | ## install old java-7
22 | RUN apt-get update && apt-get install -y python3-software-properties software-properties-common
23 | # openjdk-7-jdk is installed from the openjdk-r PPA added in the same step.
24 | RUN add-apt-repository ppa:openjdk-r/ppa && \
25 |     apt-get update && apt-get install -y openjdk-7-jdk
26 | 
27 | # Rename the existing /usr/bin/java to java8, then point /usr/bin/java at the OpenJDK 7 binary via java7.
28 | RUN mv /usr/bin/java /usr/bin/java8 && \
29 |     ln -s /usr/lib/jvm/java-7-openjdk-amd64/bin/java /usr/bin/java7 && \
30 |     ln -s /usr/bin/java7 /usr/bin/java
31 | 
32 | 
33 | 
34 | ## install prada
35 | # This step only downloads and unpacks the pyPRADA 1.2 tarball under /usr/local/src.
36 | WORKDIR /usr/local/src
37 | RUN wget https://downloads.sourceforge.net/project/prada/pyPRADA/pyPRADA_1.2.tar.gz  && \
38 |     tar xvf pyPRADA_1.2.tar.gz
39 | 
40 | 


--------------------------------------------------------------------------------
/alt_methods/prada/VERSION.txt:
--------------------------------------------------------------------------------
1 | 1.2
2 | 


--------------------------------------------------------------------------------
/alt_methods/prada/build_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -ev
4 | 
5 | VERSION=`cat VERSION.txt`
6 | 
7 | docker build -t fusiontranscripts/prada:${VERSION} .
8 | docker build -t fusiontranscripts/prada:latest .
9 | 


--------------------------------------------------------------------------------
/alt_methods/star-seqr/docker/make_simg.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Build a Singularity image file from the eagenomics/starseqr Docker image at the pinned version.
3 | VERSION=0.6.7
4 | IMAGE_FILE="star-seqr.v${VERSION}.simg"
5 | singularity build "${IMAGE_FILE}" "docker://eagenomics/starseqr:${VERSION}"
6 | 
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/alt_methods/star-seqr/docker/run_test.sh:
--------------------------------------------------------------------------------
1 | singularity run -e -B /home/bhaas star-seqr.v0.6.7.simg  starseqr.py -1 ~/garb/reads_1.fq.gz -2 ~/garb/reads_2.fq.gz -i /home/bhaas/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa.star.idx -g /home/bhaas/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_annot.gtf -r /home/bhaas/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa -m 1 -vv -p `pwd`/seqrout
2 | 
3 | 


--------------------------------------------------------------------------------
/alt_methods/star-seqr/uger/star-seqr.uger.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf star-seqr.uger.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7  --name starseqr
2 | 


--------------------------------------------------------------------------------
/alt_methods/star-seqr/uger/star-seqr.uger.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=STARSEQR
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=singularity exec -e -B /seq/RNASEQ /seq/RNASEQ/TOOLS/STAR-SEQR/SINGULARITY/star-seqr.v0.6.7.simg starseqr.py  -1 {__LEFT_FQ__}  -2 {__RIGHT_FQ__} -i /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa.star.idx -g /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_annot.gtf -r /seq/RNASEQ/CTAT_GENOME_LIB/GRCh37_gencode_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa -m 1 -vv -p {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} -t 1
11 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/Docker/VERSION.txt:
--------------------------------------------------------------------------------
1 | 1.3ec
2 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/Docker/build_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -ev
4 | 
5 | VERSION=`cat VERSION.txt`
6 | 
7 | docker build -t fusiontranscripts/starchip:${VERSION} .
8 | docker build -t fusiontranscripts/starchip:latest .
9 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/Docker/make_simg.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e   # like build_docker.sh/push_docker.sh: abort if VERSION.txt cannot be read instead of building with an empty version
3 | VERSION=$(cat VERSION.txt)
4 | # Convert the versioned fusiontranscripts/starchip Docker image into starchip.v<VERSION>.simg
5 | singularity build "starchip.v${VERSION}.simg" "docker://fusiontranscripts/starchip:${VERSION}"
6 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/Docker/push_docker.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -ev
4 | 
5 | VERSION=`cat VERSION.txt`
6 | 
7 | docker push fusiontranscripts/starchip:${VERSION}
8 | docker push fusiontranscripts/starchip:latest
9 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/Docker/starchip_wrapper.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
  7 | use File::Basename;
  8 | use FindBin;
  9 | use lib ("$FindBin::Bin");
 10 | use Pipeliner;
 11 | 
 12 | 
 13 | my $help_flag;
 14 | # Defaults for the optional settings; both values are interpolated into the usage text below.
 15 | my $output_token = "starchip";
 16 | my $chim_seg_min = 15;
 17 | # Usage text; printed through die() when -h is given or a required option is missing.
 18 | my $usage = <<__EOUSAGE;
 19 | 
 20 | ######################################################################################################
 21 | #
 22 | # Required:
 23 | 
 24 | # --left_fq <string>   reads_1.fq.gz
 25 | #
 26 | # --right_fq <string>  reads_2.fq.gz
 27 | #
 28 | # --starchip_parameters_file <string>   the starchip parameters file (indicates where the star index is)
 29 | #
 30 | # --output_dir <string>  output directory
 31 | #
 32 | # Optional:
 33 | #
 34 | #  --output_token <string>  token for output files (default: $output_token)
 35 | #
 36 | #  --chim_seg_min <int>    value for STAR --chimSegmentMin and --chimJunctionOverhangMin  (default: $chim_seg_min)
 37 | #
 38 | #######################################################################################################
 39 | 
 40 | __EOUSAGE
 41 | 
 42 |     ;
 43 | 
 44 | 
 45 | my $left_fq;
 46 | my $right_fq;
 47 | my $starchip_parameters_file;
 48 | my $output_dir;
 49 | 
 50 | # Parse options. --output_token is now registered: the usage text documents it, but it was never parsed before.
 51 | &GetOptions ( 'h' => \$help_flag,
 52 |               'left_fq=s' => \$left_fq,
 53 |               'right_fq=s' => \$right_fq,
 54 |               'starchip_parameters_file=s' => \$starchip_parameters_file,
 55 |               'chim_seg_min=i' => \$chim_seg_min,
 56 |               'output_token=s' => \$output_token,
 57 |               'output_dir=s' => \$output_dir);
 58 | 
 59 | if ($help_flag) {
 60 |     die $usage;
 61 | }
 62 | # All four required options must carry a true value; otherwise show usage and stop.
 63 | unless ($left_fq && $right_fq && $starchip_parameters_file && $output_dir) {
 64 |     die $usage;
 65 | }
 66 | # Resolve paths up front: main chdir()s into the output dir (ensure_full_path presumably makes them absolute).
 67 | $left_fq = Pipeliner::ensure_full_path($left_fq);
 68 | $right_fq = Pipeliner::ensure_full_path($right_fq);
 69 | $starchip_parameters_file = Pipeliner::ensure_full_path($starchip_parameters_file);
 70 | $output_dir = Pipeliner::ensure_full_path($output_dir);
 71 | 
 72 | 
 73 | 
 74 | main: {
 75 |     
 76 |     # Only the directory of --starchip_parameters_file is used: the STAR index and
 77 |     # hg19.parameters.txt are both looked up inside that directory.
 78 |     my $reference_dir = dirname($starchip_parameters_file);
 79 |     my $genome_index_dir = "$reference_dir/ref_genome.fa.star.idx";
 80 |     
 81 |     unless (-d $output_dir) {
 82 |         &Pipeliner::process_cmd("mkdir -p $output_dir");
 83 |     }
 84 |     chdir($output_dir) or die "Error, cannot cd to $output_dir";
 85 |     
 86 |     # Symlink the reference directory into the current (output) directory.
 87 |     &Pipeliner::process_cmd("ln -sf $reference_dir");
 88 |     
 89 |     ## Step 1: STAR alignment with chimeric read detection.
 90 |     my $star_cmd = join("",
 91 |         "STAR --genomeDir $genome_index_dir ",
 92 |         " --readFilesIn $left_fq $right_fq ",
 93 |         " --outReadsUnmapped Fastx ",
 94 |         " --quantMode GeneCounts ",
 95 |         " --chimSegmentMin $chim_seg_min ",
 96 |         " --chimJunctionOverhangMin $chim_seg_min ",
 97 |         " --outSAMstrandField intronMotif ",
 98 |         " --readFilesCommand zcat ",
 99 |         " --outSAMtype BAM Unsorted ");
100 |     
101 |     ## Step 2: STARChip fusion calling on the Chimeric.out.junction file.
102 |     my $starchip_cmd = "/usr/local/src/starchip-1.3e/starchip-fusions.pl $output_token Chimeric.out.junction $reference_dir/hg19.parameters.txt";
103 |     
104 |     # Queue both steps, each with its own checkpoint file, and run them in order.
105 |     my $pipeliner = Pipeliner->new( '-checkpoint_dir' => "__starchip_chkpts", '-verbose' => 2 );
106 |     $pipeliner->add_commands( Command->new($star_cmd, "star_align.ok") );
107 |     $pipeliner->add_commands( Command->new($starchip_cmd, "starchip.ok") );
108 |     
109 |     $pipeliner->run();
110 |     
111 |     exit(0);
112 |     
113 | }
114 | 
115 | 
116 |     
117 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/README.md:
--------------------------------------------------------------------------------
 1 | instructions found at: https://github.com/LosicLab/starchip/tree/master/example
 2 | 
 3 | ## starchip setup
 4 | cp ~/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_genome.fa .
 5 | 
 6 | cp ~/CTAT_GENOMICS/genome_libs_StarF1.5/GRCh37_v19_CTAT_lib_Feb092018/ctat_genome_lib_build_dir/ref_annot.gtf .
 7 | 
 8 | singularity shell -e starchip.v1.3e.simg
 9 | 
10 | /usr/local/src/starchip-1.3e/setup.sh ref_annot.gtf ref_genome.fa references/
11 | 
12 | cd references
13 | wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz
14 | gunzip rmsk.txt.gz
15 | cut -f6-8 rmsk.txt > hg19.repeats.bed
16 | 
17 | 
18 | 
19 | ## run test
20 | 
21 | singularity shell -e starchip.v1.3e.simg
22 | 
23 | - star alignment, using STAR v2.5.3a
24 | 
25 | STAR --genomeDir ../CTAT_GENOMICS/genome_libs_StarFpre-v1.3/GRCh37_gencode_v19_CTAT_lib_Nov012017/ctat_genome_lib_build_dir/ref_genome.fa.star.idx --readFilesIn ../GITHUB/CTAT_FUSIONS/STAR-Fusion/testing/reads_1.fq.gz ../GITHUB/CTAT_FUSIONS/STAR-Fusion/testing/reads_2.fq.gz --outReadsUnmapped Fastx --quantMode GeneCounts --chimSegmentMin 15 --chimJunctionOverhangMin 15 --outSAMstrandField intronMotif --readFilesCommand zcat --outSAMtype BAM Unsorted 
26 | 
27 | - run example
28 | 
29 | /usr/local/src/starchip-1.3e/starchip-fusions.pl  ladeda2 reference/example/Chimeric.out.junction  reference/hg19.parameters.txt  
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/cleanMe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Delete test_outdir/, the output directory that run_test.sh writes to.
3 | rm -rf test_outdir/
4 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/run_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | ## Before running: download and unpack the starchip reference bundle, then edit the two site-specific paths below:
 6 | ##   /seq/RNASEQ/TOOLS/STARCHIP/reference                        -> the location of your unpacked reference bundle
 7 | ##   /seq/RNASEQ/TOOLS/STARCHIP/SINGULARITY/starchip.v1.3e.simg  -> the location of your simg
 8 | ## NOTE(review): --starchip_parameters_file below ends in "hg19.parameters", while the recorded pipeline
 9 | ## command in test_outdir uses "hg19.parameters.txt" -- confirm which file name actually exists.
10 | singularity exec -e -B `pwd` \
11 |     -B /seq/RNASEQ/TOOLS/STARCHIP/reference:/usr/local/src/reference \
12 |        /seq/RNASEQ/TOOLS/STARCHIP/SINGULARITY/starchip.v1.3e.simg \
13 |        /usr/local/bin/starchip_wrapper.pl \
14 |             --left_fq test_data/reads_1.fq.gz \
15 |             --right_fq test_data/reads_2.fq.gz \
16 |             --starchip_parameters_file /usr/local/src/reference/hg19.parameters \
17 |             --output_dir `pwd`/test_outdir
18 | 
19 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/test_data/reads_1.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_data/reads_1.fq.gz


--------------------------------------------------------------------------------
/alt_methods/starchip/test_data/reads_2.fq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_data/reads_2.fq.gz


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/Aligned.out.bam:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_outdir/Aligned.out.bam


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/Log.final.out:
--------------------------------------------------------------------------------
 1 |                                  Started job on |	Apr 05 15:37:29
 2 |                              Started mapping on |	Apr 05 15:37:55
 3 |                                     Finished on |	Apr 05 15:37:58
 4 |        Mapping speed, Million of reads per hour |	6.03
 5 | 
 6 |                           Number of input reads |	5026
 7 |                       Average input read length |	100
 8 |                                     UNIQUE READS:
 9 |                    Uniquely mapped reads number |	1522
10 |                         Uniquely mapped reads % |	30.28%
11 |                           Average mapped length |	91.19
12 |                        Number of splices: Total |	725
13 |             Number of splices: Annotated (sjdb) |	682
14 |                        Number of splices: GT/AG |	715
15 |                        Number of splices: GC/AG |	7
16 |                        Number of splices: AT/AC |	0
17 |                Number of splices: Non-canonical |	3
18 |                       Mismatch rate per base, % |	0.69%
19 |                          Deletion rate per base |	0.01%
20 |                         Deletion average length |	1.17
21 |                         Insertion rate per base |	0.01%
22 |                        Insertion average length |	1.00
23 |                              MULTI-MAPPING READS:
24 |         Number of reads mapped to multiple loci |	280
25 |              % of reads mapped to multiple loci |	5.57%
26 |         Number of reads mapped to too many loci |	0
27 |              % of reads mapped to too many loci |	0.00%
28 |                                   UNMAPPED READS:
29 |        % of reads unmapped: too many mismatches |	0.00%
30 |                  % of reads unmapped: too short |	64.15%
31 |                      % of reads unmapped: other |	0.00%
32 |                                   CHIMERIC READS:
33 |                        Number of chimeric reads |	2207
34 |                             % of chimeric reads |	43.91%
35 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/Log.progress.out:
--------------------------------------------------------------------------------
1 |            Time    Speed        Read     Read   Mapped   Mapped   Mapped   Mapped Unmapped Unmapped Unmapped Unmapped
2 |                     M/hr      number   length   unique   length   MMrate    multi   multi+       MM    short    other
3 | ALL DONE!
4 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/__starchip_chkpts/pipeliner.2350.cmds:
--------------------------------------------------------------------------------
1 | STAR --genomeDir /usr/local/src/reference/ref_genome.fa.star.idx  --readFilesIn /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_1.fq.gz /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_2.fq.gz  --outReadsUnmapped Fastx  --quantMode GeneCounts  --chimSegmentMin 15  --chimJunctionOverhangMin 15  --outSAMstrandField intronMotif  --readFilesCommand zcat  --outSAMtype BAM Unsorted 
2 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/__starchip_chkpts/pipeliner.2900.cmds:
--------------------------------------------------------------------------------
1 | STAR --genomeDir /usr/local/src/reference/ref_genome.fa.star.idx  --readFilesIn /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_1.fq.gz /ahg/regev/users/bhaas/seq/bhaas/GIT/CTAT_FUSIONS/STAR-Fusion_benchmarking_data/alt_methods/starchip/test_data/reads_2.fq.gz  --outReadsUnmapped Fastx  --quantMode GeneCounts  --chimSegmentMin 15  --chimJunctionOverhangMin 15  --outSAMstrandField intronMotif  --readFilesCommand zcat  --outSAMtype BAM Unsorted 
2 | /usr/local/src/starchip-1.3e/starchip-fusions.pl starchip Chimeric.out.junction /usr/local/src/reference/hg19.parameters.txt
3 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/__starchip_chkpts/star_align.ok:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_outdir/__starchip_chkpts/star_align.ok


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/__starchip_chkpts/starchip.ok:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/alt_methods/starchip/test_outdir/__starchip_chkpts/starchip.ok


--------------------------------------------------------------------------------
/alt_methods/starchip/test_outdir/starchip.summary:
--------------------------------------------------------------------------------
 1 | Partner1	Partner2	SpanningReads	SplitReads	AvgAS	NearGene1	Distance1	NearGene2	Distance2	ConsensusSeq
 2 | chr17:48943419:-	chr17:35880751:-	26	8	0	TOB1	0	SYNRG	0	.
 3 | chr17:38243106:+	chr17:46371709:+	72	25	0	THRA	0	AC090627.1	0	.
 4 | chr20:56886178:+	chr19:17256207:+	8	5	0	RAB22A	0	MYO9B	0	.
 5 | chr4:76846964:+	chr1:247094880:+	22	2	0	NAAA	0	AHCTF1	0	.
 6 | chr17:57970686:+	chr17:47021337:-	17	9	0	RPS6KB1	0	SNF8	0	.
 7 | chr17:48548389:-	chr17:37595418:+	5	4	0	ACSF2	0	MED1	0	.
 8 | chr17:37374426:+	chr17:35479453:+	40	4	0	STAC2	0	ACACA	0	.
 9 | chr17:46384693:-	chr17:38243106:-	18	5	0	AC090627.1	0	THRA	0	.
10 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/uger/starchip.uger.cmd:
--------------------------------------------------------------------------------
1 | /home/unix/bhaas/GITHUB/broad_uge/util/run_RNASEQ_pipeline_many_samples_UGER_array.pl --annot_conf /seq/regev_genome_portal/RESOURCES/human/Hg19/Config/Gencode_v19.config --reads_list_file samples.txt --project_base_dir PROCESSING_DIR --num_threads_each 1 --memory 50 --queue broad --run_conf starchip.uger.conf --h_rt 20:00:00 --project_name regevlab --os RedHat7  --name starchip
2 | 


--------------------------------------------------------------------------------
/alt_methods/starchip/uger/starchip.uger.conf:
--------------------------------------------------------------------------------
 1 | ## Template variables appear in '{__TEMPLATE__}' format and are derived from other configuration files, and should not be edited.
 2 | 
 3 | [GLOBALS]
 4 | USE_QTRIM_READS=F
 5 | 
 6 | [CUSTOM_050]
 7 | RUN=T
 8 | CUSTOM_DIR=STARCHIP_csm10
 9 | USE_GZIP_FIFO=FALSE
10 | CMD=singularity exec -e -B /seq/RNASEQ -B /seq/RNASEQ/TOOLS/STARCHIP/reference:/usr/local/src/reference /seq/RNASEQ/TOOLS/STARCHIP/SINGULARITY/starchip.v1.3eb.simg /usr/local/bin/starchip_wrapper.pl --left_fq {__LEFT_FQ__}  --right_fq {__RIGHT_FQ__} --starchip_parameters_file /usr/local/src/reference/hg19.parameters --output_dir {__LOCAL_ANALYSIS_DIR__}/{__CUSTOM_DIR__} --chim_seg_min 10
11 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/ARRIBA_hc_parser.pm:
--------------------------------------------------------------------------------
  1 | package ARRIBA_hc_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =ARRIBA_format
  9 | 
 10 | 
 11 | 0       #gene1
 12 | 1       gene2
 13 | 2       strand1(gene/fusion)
 14 | 3       strand2(gene/fusion)
 15 | 4       breakpoint1
 16 | 5       breakpoint2
 17 | 6       site1
 18 | 7       site2
 19 | 8       type
 20 | 9       direction1
 21 | 10      direction2
 22 | 11      split_reads1
 23 | 12      split_reads2
 24 | 13      discordant_mates
 25 | 14      coverage1
 26 | 15      coverage2
 27 | 16      confidence
 28 | 17      closest_genomic_breakpoint1
 29 | 18      closest_genomic_breakpoint2
 30 | 19      filters
 31 | 20      fusion_transcript
 32 | 21      reading_frame
 33 | 22      peptide_sequence
 34 | 23      read_identifiers
 35 | 
 36 | 
 37 | 0       PID1
 38 | 1       DAP
 39 | 2       -/-
 40 | 3       -/-
 41 | 4       2:230020534
 42 | 5       5:10681281
 43 | 6       splice-site
 44 | 7       splice-site
 45 | 8       translocation
 46 | 9       upstream
 47 | 10      downstream
 48 | 11      305
 49 | 12      304
 50 | 13      300
 51 | 14      8156
 52 | 15      6135
 53 | 16      high
 54 | 17      .
 55 | 18      .
 56 | 19      duplicates(79),mismatches(9)
 57 | 20      ACACCGACCCCAGATGTAAAGCGGGACCCCAGCCCCTCGCCCCCCGGCGCGATCGACAGTCTCGCCAGCGTCTCCTCTGCCAAAACCCAGGGCTGGAAGATGTGGCAGCCGGCCACGGAGCGCCTGCAG___CACTTTCAGACCATGCTGAAGTCTAAATTGAATGTCTTAACACTGAAAAAGGAACCTCTCCCAGCGGTCATCTTCCATGAGCCGGAGGCCATTGAGCTGTGCACGACCACACCGCTGATGAAGACAAGGACTCACAGTGGCTGCAAG|GGTGACAAAGATTTCCCCCCGGCGGCTGCGCAGGTGGCTCACCAGAAGCCGCATGCCTCCATGGACAAGCATCCTTCCCCAAGAACCCAGCACATCCAGCAGCCACGCAAGTGAGCCTGGAGTCCACCAGCCTGCCCCATGGCCCCGGCTCTGCTGCACTTGGTATTTCCCTGACAGAGAGAACCAGCAGTTTCGCCCAAATCCTACTCTGCTGGGAAATCTAAGGCAAAACCAAGTGCTCTGTCCTTTGCCTTACATTTCCATATTTAAAACTAGAAACAGCTCCAGC
 58 | 21      in-frame
 59 | 22      MWQPATERLQHFQTMLKSKLNVLTLKKEPLPAVIFHEPEAIELCTTTPLMKTRTHSGCK|GDKDFPPAAAQVAHQKPHASMDKHPSPRTQHIQQPRK*
 60 | 23      .
 61 | 
 62 | 
 63 | =cut
 64 | 
 65 | # Parse an arriba fusions table (tab-delimited, one header line; columns as listed above) and
 66 | # return a list of hashrefs, keeping only rows whose confidence column (16) matches /high/i.
 67 | sub parse_fusion_result_file {
 68 |     my ($file) = @_;
 69 | 
 70 |     my @fusions;
 71 |     # 3-arg open so the file name is never interpreted as a mode or pipe; include the OS error text.
 72 |     open (my $fh, '<', $file) or die "Error, cannot open file $file: $!";
 73 |     my $header = <$fh>;  # discard the column-header line
 74 |     while (<$fh>) {
 75 |         chomp;
 76 |         my @x = split(/\t/);
 77 |         if (scalar(@x) < 17) { next; }  # blank or truncated row: the confidence column is missing
 78 |         my $conf_level = $x[16];
 79 |         unless ($conf_level =~ /high/i) { next; }  ### ARRIBA hc requires high confidence predictions only
 80 |         
 81 |         my $geneA = $x[0];
 82 |         my $geneB = $x[1];
 83 |         # drop any "(<digits>)" suffixes attached to the gene names
 84 |         $geneA =~ s/\(\d+\)//g;
 85 |         $geneB =~ s/\(\d+\)//g;
 86 |         # breakpoints are given as "<chr>:<coord>"
 87 |         my $coord_info_A = $x[4];
 88 |         my ($chrA, $coordA) = split(/:/, $coord_info_A);
 89 | 
 90 |         my $coord_info_B = $x[5];
 91 |         my ($chrB, $coordB) = split(/:/, $coord_info_B);
 92 |         # junction support = split_reads1 + split_reads2; spanning support = discordant_mates
 93 |         my $junction_read_count = $x[11] + $x[12];
 94 |         my $spanning_frags = $x[13];
 95 | 
 96 |         
 97 |         
 98 |         my $struct = {
 99 |             geneA => $geneA,
100 |             chrA => $chrA,
101 |             coordA => $coordA,
102 | 
103 |             geneB => $geneB,
104 |             chrB => $chrB,
105 |             coordB => $coordB,
106 | 
107 |             span_reads => $spanning_frags,
108 |             junc_reads => $junction_read_count,
109 |         };
110 | 
111 |         push (@fusions, $struct);
112 | 
113 |     }
114 | 
115 |     close $fh;
116 | 
117 |     return(@fusions);
118 | }
119 | 
120 | 
121 | 1; #EOM
122 | 
123 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/ARRIBA_parser.pm:
--------------------------------------------------------------------------------
  1 | package ARRIBA_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =ARRIBA_format
  9 | 
 10 | 
 11 | 0       #gene1
 12 | 1       gene2
 13 | 2       strand1(gene/fusion)
 14 | 3       strand2(gene/fusion)
 15 | 4       breakpoint1
 16 | 5       breakpoint2
 17 | 6       site1
 18 | 7       site2
 19 | 8       type
 20 | 9       direction1
 21 | 10      direction2
 22 | 11      split_reads1
 23 | 12      split_reads2
 24 | 13      discordant_mates
 25 | 14      coverage1
 26 | 15      coverage2
 27 | 16      confidence
 28 | 17      closest_genomic_breakpoint1
 29 | 18      closest_genomic_breakpoint2
 30 | 19      filters
 31 | 20      fusion_transcript
 32 | 21      reading_frame
 33 | 22      peptide_sequence
 34 | 23      read_identifiers
 35 | 
 36 | 
 37 | 0       PID1
 38 | 1       DAP
 39 | 2       -/-
 40 | 3       -/-
 41 | 4       2:230020534
 42 | 5       5:10681281
 43 | 6       splice-site
 44 | 7       splice-site
 45 | 8       translocation
 46 | 9       upstream
 47 | 10      downstream
 48 | 11      305
 49 | 12      304
 50 | 13      300
 51 | 14      8156
 52 | 15      6135
 53 | 16      high
 54 | 17      .
 55 | 18      .
 56 | 19      duplicates(79),mismatches(9)
 57 | 20      ACACCGACCCCAGATGTAAAGCGGGACCCCAGCCCCTCGCCCCCCGGCGCGATCGACAGTCTCGCCAGCGTCTCCTCTGCCAAAACCCAGGGCTGGAAGATGTGGCAGCCGGCCACGGAGCGCCTGCAG___CACTTTCAGACCATGCTGAAGTCTAAATTGAATGTCTTAACACTGAAAAAGGAACCTCTCCCAGCGGTCATCTTCCATGAGCCGGAGGCCATTGAGCTGTGCACGACCACACCGCTGATGAAGACAAGGACTCACAGTGGCTGCAAG|GGTGACAAAGATTTCCCCCCGGCGGCTGCGCAGGTGGCTCACCAGAAGCCGCATGCCTCCATGGACAAGCATCCTTCCCCAAGAACCCAGCACATCCAGCAGCCACGCAAGTGAGCCTGGAGTCCACCAGCCTGCCCCATGGCCCCGGCTCTGCTGCACTTGGTATTTCCCTGACAGAGAGAACCAGCAGTTTCGCCCAAATCCTACTCTGCTGGGAAATCTAAGGCAAAACCAAGTGCTCTGTCCTTTGCCTTACATTTCCATATTTAAAACTAGAAACAGCTCCAGC
 58 | 21      in-frame
 59 | 22      MWQPATERLQHFQTMLKSKLNVLTLKKEPLPAVIFHEPEAIELCTTTPLMKTRTHSGCK|GDKDFPPAAAQVAHQKPHASMDKHPSPRTQHIQQPRK*
 60 | 23      .
 61 | 
 62 | 
 63 | =cut
 64 | 
 65 | # Parse an arriba fusions table (tab-delimited, one header line; columns as listed above) and
 66 | # return one hashref per data row (geneA/chrA/coordA, geneB/chrB/coordB, span_reads, junc_reads).
 67 | sub parse_fusion_result_file {
 68 |     my ($file) = @_;
 69 | 
 70 |     my @fusions;
 71 |     # 3-arg open so the file name is never interpreted as a mode or pipe; include the OS error text.
 72 |     open (my $fh, '<', $file) or die "Error, cannot open file $file: $!";
 73 |     my $header = <$fh>;  # discard the column-header line
 74 |     while (<$fh>) {
 75 |         chomp;
 76 |         my @x = split(/\t/);
 77 |         if (scalar(@x) < 14) { next; }  # blank or truncated row: would otherwise yield an all-undef record
 78 |         my $geneA = $x[0];
 79 |         my $geneB = $x[1];
 80 |         # drop any "(<digits>)" suffixes attached to the gene names
 81 |         $geneA =~ s/\(\d+\)//g;
 82 |         $geneB =~ s/\(\d+\)//g;
 83 |         # breakpoints are given as "<chr>:<coord>"
 84 |         my $coord_info_A = $x[4];
 85 |         my ($chrA, $coordA) = split(/:/, $coord_info_A);
 86 | 
 87 |         my $coord_info_B = $x[5];
 88 |         my ($chrB, $coordB) = split(/:/, $coord_info_B);
 89 |         # junction support = split_reads1 + split_reads2; spanning support = discordant_mates
 90 |         my $junction_read_count = $x[11] + $x[12];
 91 |         my $spanning_frags = $x[13];
 92 | 
 93 |         
 94 |         
 95 |         my $struct = {
 96 |             geneA => $geneA,
 97 |             chrA => $chrA,
 98 |             coordA => $coordA,
 99 | 
100 |             geneB => $geneB,
101 |             chrB => $chrB,
102 |             coordB => $coordB,
103 | 
104 |             span_reads => $spanning_frags,
105 |             junc_reads => $junction_read_count,
106 |         };
107 | 
108 |         push (@fusions, $struct);
109 | 
110 |     }
111 | 
112 |     close $fh;
113 | 
114 |     return(@fusions);
115 | }
116 | 
117 | 
118 | 1; #EOM
119 | 
120 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/ChimeraScan_parser.pm:
--------------------------------------------------------------------------------
  1 | package ChimeraScan_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | =chimerascan_format
  8 | 
  9 | 0       #chrom5p
 10 | 1       start5p
 11 | 2       end5p
 12 | 3       chrom3p
 13 | 4       start3p
 14 | 5       end3p
 15 | 6       chimera_cluster_id
 16 | 7       score
 17 | 8       strand5p
 18 | 9       strand3p
 19 | 10      transcript_ids_5p
 20 | 11      transcript_ids_3p
 21 | 12      genes5p
 22 | 13      genes3p
 23 | 14      type
 24 | 15      distance
 25 | 16      total_frags
 26 | 17      spanning_frags
 27 | 18      unique_alignment_positions
 28 | 19      isoform_fraction_5p
 29 | 20      isoform_fraction_3p
 30 | 21      breakpoint_spanning_reads
 31 | 22      chimera_ids
 32 | 
 33 | 0       chr17
 34 | 1       38219062
 35 | 2       38243105
 36 | 3       chr17
 37 | 4       46371708
 38 | 5       46385190
 39 | 6       CLUSTER41
 40 | 7       138
 41 | 8       +
 42 | 9       +
 43 | 10      ENST00000450525.2:0-1066,ENST00000450525.2:0-1213,ENST00000584985.1:0-1155,ENST00000546243.1:0-977,ENST00000394121.4:0-1131,ENST00000264637.4:0-1155,ENST00000546243.1:0-1124,ENST00000584985.1:0-1302,ENST00000394121.4:0-1278,ENST00000264637.4:0-1302
 44 | 11      ENST00000421610.2:169-667,ENST00000604191.1:0-570,ENST00000421610.2:0-667
 45 | 12      THRA
 46 | 13      AC090627.1
 47 | 14      Intrachromosomal
 48 | 15      8121588
 49 | 16      138
 50 | 17      71
 51 | 18      129
 52 | 19      0.896103896104
 53 | 20      1.0
 54 | 21      >4910959/2;pos=4;strand=-,AAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCA,>2034127/1;pos=0;strand=-,CAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGT,>774891/2;pos=2;strand=-,AAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGT,>19849147/1;pos=8;strand=-,TGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCTG,>10432608/1;pos=7;strand=-,CTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCA,>820632/1;pos=5;strand=-,AACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAG,>5743073/2;pos=3;strand=-,AAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTC,>9753659/1;pos=7;strand=-,CTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCT,>20253102/2;pos=13;strand=-,ATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCGCAGTGTCAGCTAAAGAA,>15409265/1;pos=6;strand=-,ACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGC,>7246100/2;pos=1;strand=-,AAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTG,>13967970/1;pos=15;strand=-,GTTCTCCGAGCAATTTCGAGTGCAAGTGCCACAGTGTCAGCTAAAGAAAC,>12125678/2;pos=1257;strand=+,CACCCGTGTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAAT,>17295693/1;pos=1259;strand=+,CCCGTGTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTT,>563772/1;pos=1260;strand=+,CCGTGTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTC,>3811711/2;pos=1264;strand=+,GTGGTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGT,>12114145/1;pos=1267;strand=+,GTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCA,>4357847/2;pos=1268;strand=+,TGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAA,>8496244/2;pos=1270;strand=+,GACTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGT,>20818261/1;pos=1272;strand=+,CTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGC,>10659063/2;pos=1272;strand=+,CTTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGAGCAAGTGC,>7520969/1;pos=1273;strand=+,TTTGCCAAAAAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCC,>10039588/1;pos=1282;strand=+,AAACTGCCCATGTTCTCCGAGCAATTTCGAGTGCAAGTGCAACAGTGTCA
 55 | 22      C5666846,C5666847,C6157374,C4569163,C3378772,C4569165,C3378770,C3378771,C5666883,C0478383,C0478385,C0478384,C5666848,C6157506,C0478515,C6157375,C6157376,C4569164,C4569205,C3378807
 56 | 
 57 | =cut
 58 |     
 59 | 
 60 | 
 61 | sub parse_fusion_result_file {
 62 |     my ($chimeraScan_file) = @_;
 63 | 
 64 |     my @fusions;
 65 | 
 66 |     open (my $fh, $chimeraScan_file) or die "Error, cannot open file $chimeraScan_file";
 67 |     my $header = <$fh>;
 68 |     while (<$fh>) {
 69 |         if (/^\#/) { next; }
 70 |         chomp;
 71 |         my @x = split(/\t/);
 72 | 
 73 |         my $chrA = $x[0];
 74 |         my $chrA_start = $x[1];
 75 |         my $chrA_end = $x[2];
 76 | 
 77 |         my $chrB = $x[3];
 78 |         my $chrB_start = $x[4];
 79 |         my $chrB_end = $x[5];
 80 | 
 81 |         my $chrA_strand = $x[8];
 82 |         my $chrB_strand = $x[9];
 83 | 
 84 | 
 85 |         my $geneA = $x[12];
 86 |         my $geneB = $x[13];
 87 | 
 88 | 
 89 |         my $brkpt_A = ($chrA_strand eq '+') ? $chrA_end : $chrA_start;
 90 |         my $brkpt_B = ($chrB_strand eq '+') ? $chrB_end : $chrB_start;
 91 | 
 92 | 
 93 |         my $total_frags = $x[16];
 94 | 
 95 |         my $junction_count = $x[17];
 96 |         my $spanning_count = $total_frags - $junction_count;
 97 | 
 98 | 
 99 |         my $struct = {
100 | 
101 |             geneA => $geneA,
102 |             chrA => $chrA,
103 |             coordA => $brkpt_A,
104 | 
105 |             geneB => $geneB,
106 |             chrB => $chrB,
107 |             coordB => $brkpt_B,
108 | 
109 |             span_reads => $spanning_count,
110 |             junc_reads => $junction_count,
111 |         };
112 | 
113 |         push (@fusions, $struct);
114 |     }
115 | 
116 |     close $fh;
117 | 
118 |     return(@fusions);
119 | }
120 | 
121 | 1; #EOM
122 | 
123 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/DEFUSE_parser.pm:
--------------------------------------------------------------------------------
  1 | package DEFUSE_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =defuse_format
  9 | 
 10 | 0       cluster_id
 11 | 1       splitr_sequence
 12 | 2       splitr_count
 13 | 3       splitr_span_pvalue
 14 | 4       splitr_pos_pvalue
 15 | 5       splitr_min_pvalue
 16 | 6       adjacent
 17 | 7       altsplice
 18 | 8       break_adj_entropy1
 19 | 9       break_adj_entropy2
 20 | 10      break_adj_entropy_min
 21 | 11      breakpoint_homology
 22 | 12      breakseqs_estislands_percident
 23 | 13      cdna_breakseqs_percident
 24 | 14      deletion
 25 | 15      est_breakseqs_percident
 26 | 16      eversion
 27 | 17      exonboundaries
 28 | 18      expression1
 29 | 19      expression2
 30 | 20      gene1
 31 | 21      gene2
 32 | 22      gene_align_strand1
 33 | 23      gene_align_strand2
 34 | 24      gene_chromosome1
 35 | 25      gene_chromosome2
 36 | 26      gene_end1
 37 | 27      gene_end2
 38 | 28      gene_location1
 39 | 29      gene_location2
 40 | 30      gene_name1
 41 | 31      gene_name2
 42 | 32      gene_start1
 43 | 33      gene_start2
 44 | 34      gene_strand1
 45 | 35      gene_strand2
 46 | 36      genome_breakseqs_percident
 47 | 37      genomic_break_pos1
 48 | 38      genomic_break_pos2
 49 | 39      genomic_strand1
 50 | 40      genomic_strand2
 51 | 41      interchromosomal
 52 | 42      interrupted_index1
 53 | 43      interrupted_index2
 54 | 44      inversion
 55 | 45      library_name
 56 | 46      max_map_count
 57 | 47      max_repeat_proportion
 58 | 48      mean_map_count
 59 | 49      min_map_count
 60 | 50      num_multi_map
 61 | 51      num_splice_variants
 62 | 52      orf
 63 | 53      read_through
 64 | 54      repeat_proportion1
 65 | 55      repeat_proportion2
 66 | 56      span_count
 67 | 57      span_coverage1
 68 | 58      span_coverage2
 69 | 59      span_coverage_max
 70 | 60      span_coverage_min
 71 | 61      splice_score
 72 | 62      splicing_index1
 73 | 63      splicing_index2
 74 | 64      probability
 75 | 
 76 | 0       3247
 77 | 1       GCGCACTTCCCTGAGGACACTGTGGAGCAGAAGGCAGAAAGCGTGGGCAGAATTATGCCTCACACGGAGGTGAGCCCCTGACCAAGACTCCAAAGTCCCACCTCCCGTCACCCAGCTGGGGTGCACCCAGCTGGGACATCGGTTGCTTTCAGTGAGAGAGTCAAATGGCTCAC|CCAGGGCTCTCCCCAGATACCATTTCAAATTCCTGTTAATTTTATTTTAATCCTGAATTCTGAGTTTGAATGTATACCCAGATCAGCCCTGTCTTTGTTTTCACTCACTGGTGTGGATGTAGCATGCCTCCATTAAGCTTTTTATTAACTTGCCTTGTTTTTGTCTCTGGCCTCGTTACCT
 78 | 2       6
 79 | 3       0.0891263313789634
 80 | 4       0.854900607940091
 81 | 5       0.564814798312889
 82 | 6       N
 83 | 7       N
 84 | 8       3.57167229721571
 85 | 9       3.4325554405491
 86 | 10      3.4325554405491
 87 | 11      0
 88 | 12      0
 89 | 13      0
 90 | 14      Y
 91 | 15      0
 92 | 16      N
 93 | 17      N
 94 | 18      2876
 95 | 19      0
 96 | 20      ENSG00000167107
 97 | 21      ENSG00000227011
 98 | 22      +
 99 | 23      -
100 | 24      17
101 | 25      17
102 | 26      48552206
103 | 27      51065012
104 | 28      intron
105 | 29      downstream
106 | 30      ACSF2
107 | 31      C17orf112
108 | 32      48503519
109 | 33      51062880
110 | 34      +
111 | 35      +
112 | 36      0
113 | 37      48548600
114 | 38      51089613
115 | 39      +
116 | 40      -
117 | 41      N
118 | 42      -
119 | 43      -
120 | 44      N
121 | 45      defuse_outdir
122 | 46      1
123 | 47      0
124 | 48      1
125 | 49      1
126 | 50      0
127 | 51      1
128 | 52      N
129 | 53      N
130 | 54      0
131 | 55      0
132 | 56      8
133 | 57      1.23121459994238
134 | 58      1.40196698971541
135 | 59      1.40196698971541
136 | 60      1.23121459994238
137 | 61      2
138 | 62      -
139 | 63      -
140 | 64      0.510155154639065
141 | 
142 | =cut
143 | 
144 |     
145 | 
146 | 
# Parse a deFuse tab-delimited results file into a list of fusion records.
#
# The first line is discarded as the header.  Columns are read by 0-based
# position as listed in the defuse_format POD section of this module:
# gene_name1/2 (30, 31), gene_chromosome1/2 (24, 25),
# genomic_break_pos1/2 (37, 38), splitr_count (2) and span_count (56).
#
# Args:    $defuse_out_file - path to the deFuse results file.
# Returns: list of hashrefs with keys geneA, chrA, coordA, geneB, chrB,
#          coordB, span_reads, junc_reads.
# Dies:    if the file cannot be opened.
sub parse_fusion_result_file {
    my ($defuse_out_file) = @_;

    my @fusions;

    # Three-argument open so the path is never parsed for mode characters.
    open (my $fh, '<', $defuse_out_file) or die "Error, cannot open file $defuse_out_file: $!";
    my $header = <$fh>; # column-name line, not used
    while (my $line = <$fh>) {
        chomp $line;
        my @x = split(/\t/, $line);

        my ($geneA, $geneB) = @x[30, 31];

        # counts that are missing or hold no word character are treated as zero
        my $junction_count = $x[2]; # splitr_count
        unless (defined $junction_count && $junction_count =~ /\w/) {
            $junction_count = 0;
        }

        my $spanning_count = $x[56]; # span_count
        unless (defined $spanning_count && $spanning_count =~ /\w/) {
            $spanning_count = 0;
        }

        push (@fusions, {
            geneA => $geneA,
            chrA => $x[24],
            coordA => $x[37],

            geneB => $geneB,
            chrB => $x[25],
            coordB => $x[38],

            span_reads => $spanning_count,
            junc_reads => $junction_count,
        });
    }

    # release the handle explicitly, as the sibling parsers do
    close $fh;

    return(@fusions);
}
195 | 
196 | 1; #EOM
197 | 
198 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/EricScript_parser.pm:
--------------------------------------------------------------------------------
  1 | package EricScript_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =ericscript_format
  9 | 
 10 | 0       GeneName1
 11 | 1       GeneName2
 12 | 2       chr1
 13 | 3       Breakpoint1
 14 | 4       strand1
 15 | 5       chr2
 16 | 6       Breakpoint2
 17 | 7       strand2
 18 | 8       EnsemblGene1
 19 | 9       EnsemblGene2
 20 | 10      crossingreads
 21 | 11      spanningreads
 22 | 12      mean.insertsize
 23 | 13      homology
 24 | 14      fusiontype
 25 | 15      InfoGene1
 26 | 16      InfoGene2
 27 | 17      JunctionSequence
 28 | 18      GeneExpr1
 29 | 19      GeneExpr2
 30 | 20      GeneExpr_Fused
 31 | 21      ES
 32 | 22      GJS
 33 | 23      US
 34 | 24      EricScore
 35 | 
 36 | 0       PPP1CB
 37 | 1       SPDYA
 38 | 2       2
 39 | 3       28781842
 40 | 4       +
 41 | 5       2
 42 | 6       28783907
 43 | 7       +
 44 | 8       ENSG00000213639
 45 | 9       ENSG00000163806
 46 | 10      42
 47 | 11      31
 48 | 12      210.54
 49 | 13      ENSG00000186298 (93%)
 50 | 14      Read-Through
 51 | 15      protein phosphatase 1, catalytic subunit, beta isozyme [Source:HGNC Symbol;Acc:HGNC:9282]
 52 | 16      speedy/RINGO cell cycle regulator family member A [Source:HGNC Symbol;Acc:HGNC:30613]
 53 | 17      tctgcctatagcagccattgtggatgagaagatcttctgttgtcatggagGATTGTCACCAGACCTGCAATCTATGGAGCAGATTCGGAGAATTATGAGA
 54 | 18      15.81
 55 | 19      0.03
 56 | 20      29.76
 57 | 21      0.6967
 58 | 22      0.667
 59 | 23      0.738095238095238
 60 | 24      0.981308796513694
 61 | 
 62 | =cut
 63 |     
 64 | 
 65 | sub parse_fusion_result_file {
 66 |     my ($ericscript_out_file) = @_;
 67 | 
 68 |     my @fusions;
 69 | 
 70 |     open (my $fh, $ericscript_out_file) or die "Error, cannot open file $ericscript_out_file";
 71 |     my $header = <$fh>;
 72 |     while (<$fh>) {
 73 |         chomp;
 74 |         my @x = split(/\t/);
 75 | 
 76 |         my $geneA = $x[0];
 77 |         my $geneB = $x[1];
 78 | 
 79 |         my $junction_count = $x[10];
 80 |         my $spanning_count = $x[11];
 81 | 
 82 |         my $chrA = $x[2];
 83 |         my $brkpt_A = $x[3];
 84 |         my $chrB = $x[5];
 85 |         my $brkpt_B = $x[6];
 86 | 
 87 |         my $struct = {
 88 | 
 89 |             geneA => $geneA,
 90 |             chrA => $chrA,
 91 |             coordA => $brkpt_A,
 92 | 
 93 |             geneB => $geneB,
 94 |             chrB => $chrB,
 95 |             coordB => $brkpt_B,
 96 | 
 97 |             span_reads => $spanning_count,
 98 |             junc_reads => $junction_count,
 99 |         };
100 | 
101 |         push (@fusions, $struct);
102 |     }
103 | 
104 |     return(@fusions);
105 | }
106 | 
107 | 
108 | 1; #EOM
109 | 
110 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/FusionCatcher_KP_parser.pm:
--------------------------------------------------------------------------------
 1 | package FusionCatcher_KP_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | 
 8 | 
 9 | =FusionCatcher_format
10 | 
11 | ## described at: https://github.com/ndaniel/fusioncatcher/blob/master/doc/manual.md
12 | 
13 | 0       Fusion_gene_1
14 | 1       Fusion_gene_2
15 | 2       Count_paired-end_reads
16 | 3       Fusion_gene_symbol_1
17 | 4       Fusion_gene_symbol_2
18 | 5       Fusion_description
19 | 6       Analysis_status
20 | 7       Counts_of_common_mapping_reads
21 | 
22 | 0       ENSG00000175121
23 | 1       ENSG00000178053
24 | 2       9465
25 | 3       WFDC5
26 | 4       MLF1
27 | 5       
28 | 6       further_analysis
29 | 7       0
30 | 
31 | 
32 | 
33 | =cut
34 | 
35 | 
36 | 
# Parse a FusionCatcher candidate table (the 8-column layout shown in the
# FusionCatcher_format POD section of this module) into fusion records.
#
# The first line is discarded as the header.  Gene symbols come from columns
# 3 and 4; rows where either symbol is missing or has no word character are
# skipped.  This layout carries no breakpoint columns, so chromosome and
# coordinate fields are reported as "NA".  The paired-end read count
# (column 2) is reported as junc_reads and span_reads is fixed at 0.
#
# Args:    $fusionCatcher_file - path to the FusionCatcher table.
# Returns: list of hashrefs with keys geneA, chrA, coordA, geneB, chrB,
#          coordB, span_reads, junc_reads.
# Dies:    if the file cannot be opened.
sub parse_fusion_result_file {

    my ($fusionCatcher_file) = @_;

    my @fusions;

    # Three-argument open so the path is never parsed for mode characters.
    open (my $fh, '<', $fusionCatcher_file) or die "Error, cannot open file $fusionCatcher_file: $!";
    my $header = <$fh>; # column-name line, not used
    while (my $line = <$fh>) {
        chomp $line;
        my @x = split(/\t/, $line);

        my ($geneA, $geneB) = @x[3, 4];

        # not scoring fusions not tied to genes.
        next unless (defined $geneA && $geneA =~ /\w/
                     && defined $geneB && $geneB =~ /\w/);

        my $junction_count = $x[2];

        push (@fusions, {
            geneA => $geneA,
            chrA => "NA",
            coordA => "NA",

            geneB => $geneB,
            chrB => "NA",
            coordB => "NA",

            span_reads => 0, # treat total count as junction here.
            junc_reads => $junction_count,
        });
    }

    close $fh;

    return(@fusions);
}
80 | 
81 | 1; #EOM
82 | 
83 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/FusionCatcher_parser.pm:
--------------------------------------------------------------------------------
  1 | package FusionCatcher_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | 
  9 | =FusionCatcher_format
 10 | 
 11 | ## described at: https://github.com/ndaniel/fusioncatcher/blob/master/doc/manual.md
 12 | 
 13 | 
 14 | 0       Gene_1_symbol(5end_fusion_partner)
 15 | 1       Gene_2_symbol(3end_fusion_partner)
 16 | 2       Fusion_description
 17 | 3       Counts_of_common_mapping_reads
 18 | 4       Spanning_pairs         # Count of pair-end reads supporting the fusion
 19 | 5       Spanning_unique_reads  # Count of unique reads (i.e. unique mapping positions) mapping on the fusion junction. Shortly, here are counted all the reads which map on fusion junction minus the PCR duplicated reads.
 20 | 6       Longest_anchor_found
 21 | 7       Fusion_finding_method
 22 | 8       Fusion_point_for_gene_1(5end_fusion_partner)
 23 | 9       Fusion_point_for_gene_2(3end_fusion_partner)
 24 | 10      Gene_1_id(5end_fusion_partner)
 25 | 11      Gene_2_id(3end_fusion_partner)
 26 | 12      Exon_1_id(5end_fusion_partner)
 27 | 13      Exon_2_id(3end_fusion_partner)
 28 | 14      Fusion_sequence
 29 | 15      Predicted_effect
 30 | 16      Predicted_fused_transcripts
 31 | 17      Predicted_fused_proteins
 32 | 
 33 | 0       THRA
 34 | 1       THRA1/BTR
 35 | 2       no_protein,antisense,known_fusion
 36 | 3       0
 37 | 4       74
 38 | 5       20
 39 | 6       25
 40 | 7       BOWTIE
 41 | 8       17:40086853:+
 42 | 9       17:48294347:+
 43 | 10      ENSG00000126351
 44 | 11      ENSG00000235300
 45 | 12      ENSE00000863335
 46 | 13      ENSE00001677074
 47 | 14      GTGGACTTTGCCAAAAAACTGCCCATGTTCTCCGAG*CAATTTCGAGTGCAAGTGCCACAGTGTCAGCTAAAG
 48 | 15      CDS(truncated)/exonic(no-known-CDS)
 49 | 
 50 | =cut
 51 | 
 52 | 
 53 | 
 54 | sub parse_fusion_result_file {
 55 |     
 56 |     my ($fusionCatcher_file) = @_;
 57 |     
 58 |     my @fusions;
 59 |     
 60 |     open (my $fh, $fusionCatcher_file) or die "Error, cannot open file $fusionCatcher_file";
 61 |     my $header = <$fh>;
 62 |     while (<$fh>) {
 63 |         chomp;
 64 |         my @x = split(/\t/);
 65 |         
 66 |         my $geneA = $x[0];
 67 |         my $geneB = $x[1];
 68 |         
 69 |         unless ($geneA =~ /\w/ && $geneB =~ /\w/) { next; } # not scoring fusions not tied to genes.
 70 |         
 71 |         my $brkpt_A = $x[8];
 72 |         my $brkpt_B = $x[9];
 73 |         
 74 |         my ($chrA, $coordA, $orientA) = split(/:/, $brkpt_A);
 75 |         my ($chrB, $coordB, $orientB) = split(/:/, $brkpt_B);
 76 |         
 77 |         my $spanning_count = $x[4];
 78 |         my $junction_count = $x[5];
 79 |         
 80 |         my $struct = {
 81 |             
 82 |             geneA => $geneA,
 83 |             chrA => $chrA,
 84 |             coordA => $coordA,
 85 |             
 86 |             geneB => $geneB,
 87 |             chrB => $chrB,
 88 |             coordB => $coordB,
 89 |             
 90 |             span_reads => $spanning_count,
 91 |             junc_reads => $junction_count,
 92 |         };
 93 |         
 94 |         push (@fusions, $struct);
 95 |     }
 96 |     
 97 |     close $fh;
 98 |     
 99 |     return(@fusions);
100 | }
101 | 
102 | 1; #EOM
103 | 
104 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/FusionInspector_parser.pm:
--------------------------------------------------------------------------------
 1 | package FusionInspector_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | sub parse_fusion_result_file {
 8 |     my ($FI_file) = @_;
 9 | 
10 |     my @fusions;
11 | 
12 |     my $fh;
13 | 
14 |     if ($FI_file =~ /\.gz$/) {
15 |         open ($fh, "gunzip -c $FI_file | ") or die "Error, cannot open file $FI_file";
16 |     }
17 |     else {
18 |         open ($fh, $FI_file) or die "Error, cannot open file $FI_file";
19 |     }
20 |     
21 |     my $header = <$fh>;
22 | 
23 |     my @x = split(/\t/, $header);
24 |     my %idx;
25 |     for (my $i = 0; $i <= $#x; $i++) {
26 |         $idx{$x[$i]} = $i;
27 |     }
28 | 
29 |     while (<$fh>) {
30 |         chomp;
31 |         my @x = split(/\t/);
32 | 
33 |         my $fusion = $x[ $idx{'#FusionName'} ];
34 |         my $junction_reads = $x[ $idx{'JunctionReadCount'} ];
35 |         my $spanning_reads = $x[ $idx{'SpanningFragCount'} ];
36 |         my $fusion_gene_A = $x[ $idx{'LeftGene'} ];
37 |         my $chr_coords_A = $x[ $idx{'LeftBreakpoint'} ];
38 |         my $fusion_gene_B = $x[ $idx{'RightGene'} ];
39 |         my $chr_coords_B = $x[ $idx{'RightBreakpoint'} ];
40 |         
41 |         my $LeftBreakDinuc = uc $x[ $idx{'LeftBreakDinuc'} ];
42 |         my $RightBreakDinuc = uc $x[ $idx{'RightBreakDinuc'} ];
43 |         
44 |         my $splice_combo = "${LeftBreakDinuc}-${RightBreakDinuc}";
45 |         if ($splice_combo !~ /^(GT\-AG|GC\-AG|CT\-AC)$/) { next; } # require canonical splice breakpoints, eliminate RT-artifacts
46 |         
47 |         my $rest;
48 |         ($fusion_gene_A, $rest) = split(/\^/, $fusion_gene_A);
49 |         ($fusion_gene_B, $rest) = split(/\^/, $fusion_gene_B);
50 | 
51 |         if ($junction_reads < 1) { next; } # require at least one junction read
52 |         
53 |         if ($fusion_gene_A eq $fusion_gene_B) { next; } # no self-fusions
54 | 
55 |         my ($chrA, $coordA, $orientA) = split(/:/, $chr_coords_A);
56 |         my ($chrB, $coordB, $orientB) = split(/:/, $chr_coords_B);
57 | 
58 | 
59 |         my $struct = {
60 |             geneA => $fusion_gene_A,
61 |             chrA => $chrA || ".",
62 |             coordA => $coordA || ".",
63 | 
64 |             geneB => $fusion_gene_B,
65 |             chrB => $chrB || ".",
66 |             coordB => $coordB || ".",
67 | 
68 |             span_reads => $spanning_reads,
69 |             junc_reads => $junction_reads,
70 |         };
71 | 
72 |         push (@fusions, $struct);
73 | 
74 |     }
75 | 
76 |     close $fh;
77 | 
78 |     
79 |     return(@fusions);
80 | }
81 | 
82 | 1; #EOM
83 | 
84 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/InFusion_parser.pm:
--------------------------------------------------------------------------------
 1 | package InFusion_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | 
 8 | =infusion_format
 9 | 
10 | 0       #id
11 | 1       ref1
12 | 2       break_pos1
13 | 3       region1
14 | 4       ref2
15 | 5       break_pos2
16 | 6       region2
17 | 7       num_span  # should be num_split
18 | 8       num_paired
19 | 9       genes_1
20 | 10      genes_2
21 | 11      fusion_class
22 | 
23 | 0       5591
24 | 1       20
25 | 2       35689536
26 | 3       [35689535,35689672]
27 | 4       1
28 | 5       84946639
29 | 6       [84946634,84946695]
30 | 7       5
31 | 8       8210
32 | 9       RBL1
33 | 10      RPF1
34 | 11      inter-chromosomal
35 | 
36 | =cut
37 | 
38 | 
# Parse an InFusion tab-delimited predictions file into a list of fusion records.
#
# The first line is discarded as the header.  Columns are read by 0-based
# position as listed in the infusion_format POD section of this module:
# ref1/break_pos1 (1, 2), ref2/break_pos2 (4, 5), num_span (7, reported as
# junc_reads), num_paired (8, reported as span_reads), genes_1/2 (9, 10).
# Semicolons inside the gene fields are rewritten to commas.
#
# Args:    $preds_file - path to the InFusion predictions file.
# Returns: list of hashrefs with keys geneA, chrA, coordA, geneB, chrB,
#          coordB, span_reads, junc_reads.
# Dies:    if the file cannot be opened.
sub parse_fusion_result_file {
    my ($preds_file) = @_;

    my @fusions;

    # Three-argument open so the path is never parsed for mode characters.
    open (my $fh, '<', $preds_file) or die "Error, cannot open file $preds_file: $!";
    my $header = <$fh>; # column-name line, not used
    while (my $line = <$fh>) {
        chomp $line;
        my @x = split(/\t/, $line);

        my ($geneA, $geneB) = @x[9, 10];

        # others use commas instead of semicolons, so lets be consistent here.
        $geneA =~ s/;/,/g;
        $geneB =~ s/;/,/g;

        my ($chrA, $brkpt_A) = @x[1, 2];
        my ($chrB, $brkpt_B) = @x[4, 5];
        my ($junction_count, $spanning_count) = @x[7, 8];

        push (@fusions, {
            geneA => $geneA,
            chrA => $chrA,
            coordA => $brkpt_A,

            geneB => $geneB,
            chrB => $chrB,
            coordB => $brkpt_B,

            span_reads => $spanning_count,
            junc_reads => $junction_count,
        });
    }

    close $fh;

    return(@fusions);
}
86 | 
87 | 
88 | 
89 | 1; #EOM
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/JAFFA_parser.pm:
--------------------------------------------------------------------------------
  1 | package JAFFA_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =JAFFA_format
  9 | 
 10 | ## described here: https://github.com/Oshlack/JAFFA/wiki/OutputDescription
 11 | 
 12 | 0       "sample"
 13 | 1       "fusion genes"
 14 | 2       "chrom1"
 15 | 3       "base1"
 16 | 4       "chrom2"
 17 | 5       "base2"
 18 | 6       "gap (kb)"
 19 | 7       "spanning pairs"    # spanning: The number of read-pairs, where each read in the pair aligns entirely on either side of the breakpoint. You might see a "-" in some of these. This indicates that no spanning pairs were found, but that the contig had only a small amount of flanking sequence to align reads to. i.e. the spanning pairs results may not be indicative of the true support for the fusion event.
 20 | 8       "spanning reads"    # junction: The number of reads aligning to the breakpoint, with at least 15 bases of flanking sequence either side (by default).
 21 | 9       "inframe"
 22 | 10      "aligns"
 23 | 11      "rearrangement"
 24 | 12      "contig"
 25 | 13      "contig break"
 26 | 14      "classification"
 27 | 15      "known"
 28 | 
 29 | 0       "jaffa-direct"
 30 | 1       "PROP1:FLRT1"
 31 | 2       "chr5"
 32 | 3       177421107
 33 | 4       "chr11"
 34 | 5       63883691
 35 | 6       Inf
 36 | 7       "13674"
 37 | 8       3261
 38 | 9       TRUE
 39 | 10      TRUE
 40 | 11      TRUE
 41 | 12      "Locus_1_Transcript_2940/6203_Confidence_0.001_Length_3574"
 42 | 13      3017
 43 | 14      "HighConfidence"
 44 | 15      "-"
 45 | 
 46 | =cut
 47 |     
 48 | sub parse_fusion_result_file {
 49 |     my ($jaffa_out_file) = @_;
 50 | 
 51 |     my @fusions;
 52 | 
 53 |     open (my $fh, $jaffa_out_file) or die "Error, cannot open file $jaffa_out_file";
 54 |     my $header = <$fh>;
 55 |     while (<$fh>) {
 56 |         chomp;
 57 |         s/\"//g;
 58 | 
 59 |         my @x = split(/,/);
 60 | 
 61 |         my $fusion = $x[1];
 62 |         my ($geneA, $geneB) = split(/:/, $fusion);
 63 | 
 64 |         my $junction_count = $x[8];
 65 |         unless ($junction_count =~ /\w/) {
 66 |             $junction_count = 0;
 67 |         }
 68 | 
 69 |         my $spanning_count = $x[7];
 70 |         unless ($spanning_count =~ /\w/) {
 71 |             $spanning_count = 0;
 72 |         }
 73 | 
 74 |         my $chrA = $x[2];
 75 |         my $brkpt_A = $x[3];
 76 |         my $chrB = $x[4];
 77 |         my $brkpt_B = $x[5];
 78 | 
 79 |         my $struct = {
 80 | 
 81 |             geneA => $geneA,
 82 |             chrA => $chrA,
 83 |             coordA => $brkpt_A,
 84 | 
 85 |             geneB => $geneB,
 86 |             chrB => $chrB,
 87 |             coordB => $brkpt_B,
 88 | 
 89 |             span_reads => $spanning_count,
 90 |             junc_reads => $junction_count,
 91 |         };
 92 | 
 93 |         push (@fusions, $struct);
 94 |     }
 95 | 
 96 |     return(@fusions);
 97 | }
 98 | 
 99 | 
100 | 1; #EOM
101 | 
102 | 
103 | 
104 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/MapSplice_parser.pm:
--------------------------------------------------------------------------------
  1 | package MapSplice_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | =mapsplice_format
  8 | 
  9 |     Go here: http://www.netlab.uky.edu/p/bioinfo/MapSplice2FusionJunctionFormat
 10 | 
 11 | 0       chr1~chr1
 12 | 1       8926354
 13 | 2       236647038
 14 | 3       FUSIONJUNC_827
 15 | 4       2     # junction count  (coverage: number of reads aligned to the fusion junction)
 16 | 5       -+
 17 | 6       255,0,0
 18 | 7       2
 19 | 8       20,36,169,36,
 20 | 9       0,227720721,
 21 | 10      0.693147
 22 | 11      3
 23 | 12      CTGC
 24 | 13      0
 25 | 14      1
 26 | 15      0.500000
 27 | 16      20
 28 | 17      0
 29 | 18      10
 30 | 19      2
 31 | 20      0
 32 | 21      2
 33 | 22      2
 34 | 23      0
 35 | 24      0
 36 | 25      2
 37 | 26      0
 38 | 27      8   # spanning count  (encompassing_read pair_count: Number of reads pairs surround the fusion(but not cross the fusion))
 39 | 28      8926374
 40 | 29      236647074
 41 | 30      8926354,64M54P50M|
 42 | 31      236647038,133M|
 43 | 32      0
 44 | 33      0
 45 | 34      0.553333
 46 | 35      0.413333
 47 | 36      114
 48 | 37      114
 49 | 38      133
 50 | 39      133
 51 | 40      1.79176
 52 | 41      0.01
 53 | 42      1
 54 | 43      114
 55 | 44      133
 56 | 45      194
 57 | 46      100
 58 | 47      162.5
 59 | 48      4
 60 | 49      0
 61 | 50      4
 62 | 51      0
 63 | 52      not_matched
 64 | 53      not_matched
 65 | 54      GCGGGTTTGCTCCCAACATC
 66 | 55      ATTTCTCCTTGATGACATTCTTCAG
 67 | 56      1
 68 | 57      from_fusion
 69 | 58      fusion
 70 | 59      -,+
 71 | 60      ENO1,
 72 | 61      EDARADD,
 73 | 
 74 | 
 75 | 
 76 | 
 77 | 
 78 | 
 79 | =cut
 80 | 
 81 | 
 82 | sub parse_fusion_result_file {
 83 |     my ($mapsplice_out_file) = @_;
 84 | 
 85 |     # Parse a MapSplice fusion report (tab-delimited; every line is read as a data row)
 86 |     # into a list of hashrefs with keys geneA/chrA/coordA, geneB/chrB/coordB,
 87 |     # span_reads and junc_reads.  Dies if the file cannot be opened and confesses on
 88 |     # a row whose first field does not split on '~' into two names containing 'chr'.
 89 |     my @fusions;
 90 | 
 91 |     # Reduce a comma-delimited gene field (e.g. "ENO1,") to its distinct non-empty
 92 |     # names.  First-seen order is kept so the result is reproducible between runs
 93 |     # (the order in which a hash returns its keys is not).
 94 |     my $get_unique_gene_list_sref = sub {
 95 |         my ($gene_txt) = @_;
 96 |         my %seen;
 97 |         my @genes = grep { $_ && ! $seen{$_}++ } split(/,/, defined($gene_txt) ? $gene_txt : "");
 98 |         return(join(",", @genes));
 99 |     };
100 | 
101 |     open (my $fh, "<", $mapsplice_out_file) or die "Error, cannot open file $mapsplice_out_file: $!";
102 |     while (<$fh>) {
103 |         chomp;
104 |         my @x = split(/\t/);
105 | 
106 |         # fields 60 and 61 carry the gene names of the two fusion partners
107 |         my $geneA = $get_unique_gene_list_sref->($x[60]);
108 |         my $geneB = $get_unique_gene_list_sref->($x[61]);
109 | 
110 |         my $junction_count = $x[4];
111 |         my $spanning_count = $x[27];
112 | 
113 |         # field 0 is expected to read <chrA>~<chrB>
114 |         my $chr_pair = defined($x[0]) ? $x[0] : "";
115 |         my ($chrA, $chrB) = split(/\~/, $chr_pair);
116 |         unless (defined($chrA) && defined($chrB) && $chrA =~ /chr/ && $chrB =~ /chr/) {
117 |             confess "Error, didn't parse chr vals from $chr_pair of $_";
118 |         }
119 | 
120 |         # the first 'chr' is removed from each chromosome name
121 |         $chrA =~ s/chr//;
122 |         $chrB =~ s/chr//;
123 | 
124 |         my $brkpt_A = $x[1];
125 |         my $brkpt_B = $x[2];
126 | 
127 |         push (@fusions, {
128 |             geneA => $geneA,
129 |             chrA => $chrA,
130 |             coordA => $brkpt_A,
131 | 
132 |             geneB => $geneB,
133 |             chrB => $chrB,
134 |             coordB => $brkpt_B,
135 | 
136 |             span_reads => $spanning_count,
137 |             junc_reads => $junction_count,
138 |         });
139 |     }
140 | 
141 |     close $fh;
142 | 
143 |     return(@fusions);
144 | 
145 | }
146 | 
147 | 
148 | 1; #EOM
149 | 
150 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/NFuse_parser.pm:
--------------------------------------------------------------------------------
  1 | package NFuse_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =nFUSE_format
  9 | 
 10 | 0       cluster_id
 11 | 1       adjacent
 12 | 2       altsplice
 13 | 3       break_adj_entropy1
 14 | 4       break_adj_entropy2
 15 | 5       break_adj_entropy_min
 16 | 6       breakpoint_homology
 17 | 7       breakpos1
 18 | 8       breakpos2
 19 | 9       cdna_breakseqs_percident
 20 | 10      chromosome1
 21 | 11      chromosome2
 22 | 12      defuse_probability
 23 | 13      deletion
 24 | 14      est_breakseqs_percident
 25 | 15      estisland_breakseqs_percident
 26 | 16      eversion
 27 | 17      exonboundaries
 28 | 18      gene1
 29 | 19      gene2
 30 | 20      gene_align_strand1
 31 | 21      gene_align_strand2
 32 | 22      gene_chromosome1
 33 | 23      gene_chromosome2
 34 | 24      gene_end1
 35 | 25      gene_end2
 36 | 26      gene_location1
 37 | 27      gene_location2
 38 | 28      gene_name1
 39 | 29      gene_name2
 40 | 30      gene_start1
 41 | 31      gene_start2
 42 | 32      gene_strand1
 43 | 33      gene_strand2
 44 | 34      genome_breakseqs_percident
 45 | 35      genomic_break_pos1
 46 | 36      genomic_break_pos2
 47 | 37      genomic_strand1
 48 | 38      genomic_strand2
 49 | 39      interchromosomal
 50 | 40      inversion
 51 | 41      library_name
 52 | 42      max_map_count
 53 | 43      mean_map_count
 54 | 44      min_map_count
 55 | 45      num_multi_map
 56 | 46      num_splice_variants
 57 | 47      orf
 58 | 48      readthrough
 59 | 49      reference1
 60 | 50      reference2
 61 | 51      repeat_list1
 62 | 52      repeat_list2
 63 | 53      repeat_proportion1
 64 | 54      repeat_proportion2
 65 | 55      repeat_proportion_max
 66 | 56      sequence
 67 | 57      span_count        # spanning count
 68 | 58      span_coverage1
 69 | 59      span_coverage2
 70 | 60      span_coverage_max
 71 | 61      span_coverage_min
 72 | 62      splice_score
 73 | 63      splitreads_count    # junction count
 74 | 64      splitreads_min_pvalue
 75 | 65      splitreads_pos_pvalue
 76 | 66      splitreads_span_pvalue
 77 | 67      strand1
 78 | 68      strand2
 79 | 
 80 | 0       16780
 81 | 1       N
 82 | 2       N
 83 | 3       3.46807512488172
 84 | 4       3.60632674564779
 85 | 5       3.46807512488172
 86 | 6       0
 87 | 7       393
 88 | 8       150364072
 89 | 9       0
 90 | 10      3
 91 | 11      6
 92 | 12      0.798023102489623
 93 | 13      N
 94 | 14      0
 95 | 15      0
 96 | 16      N
 97 | 17      N
 98 | 18      ENSG00000183396
 99 | 19      ENSG00000213091
100 | 20      -
101 | 21      +
102 | 22      3
103 | 23      6
104 | 24      48659288
105 | 25      150364489
106 | 26      coding
107 | 27      intron
108 | 28      TMEM89
109 | 29      PHBP1
110 | 30      48658192
111 | 31      150363682
112 | 32      -
113 | 33      +
114 | 34      0
115 | 35      48658896
116 | 36      150364072
117 | 37      +
118 | 38      +
119 | 39      Y
120 | 40      N
121 | 41      tmp.defuse_outdir
122 | 42      2
123 | 43      1.33333333333333
124 | 44      1
125 | 45      2
126 | 46      1
127 | 47      N
128 | 48      N
129 | 49      ENSG00000183396|ENST00000330862
130 | 50      6
131 | 51      -
132 | 52      -
133 | 53      0
134 | 54      0
135 | 55      0
136 | 56      CCCACTCTGGGTGGAAGTCCCCTTTATTTGGATTTGCCGCTGGGTGGCTAGATGACGTAGGTGGCCTTCGATGTGGACCAGGAGGGCATCCAGCATGTGCAGGACCCCACGGAGCAGGGTGTGGTCTGAGATTGGGGCCCGCCGTTTCCAGGGTCCGCAGGGCTCAGTGGTCACCTGCGGATGC|ACCACTGACTTGAGGATCTCAGTCATGATGGACGTCAGCACACGCTCATCATAGTCCTCTCCGGTGATGGCGAAGATGCGAGGAAGCTGGCTAGAGACGGGCCGGAAGAGGATGCACAGTGTGATGTTGACATTCTGTAAATATTTGCTACCAGTGATGACTGGCACAGTA
137 | 57      6
138 | 58      0.780927305360403
139 | 59      0.615731144611087
140 | 60      0.780927305360403
141 | 61      0.615731144611087
142 | 62      1
143 | 63      1655
144 | 64      0.717074880359308
145 | 65      0.58076168734171
146 | 66      0.844296742119671
147 | 67      -
148 | 68      +
149 | 
150 | 
151 | =cut
152 |     
153 | 
154 | sub parse_fusion_result_file {
155 |     my ($nFUSE_out_file) = @_;
156 |     # Parse an nFuse results table (tab-delimited, first line = column names) into a
157 |     # list of hashrefs: geneA/chrA/coordA, geneB/chrB/coordB, span_reads, junc_reads.
158 | 
159 |     my @fusions;
160 | 
161 |     open (my $fh, "<", $nFUSE_out_file) or die "Error, cannot open file $nFUSE_out_file: $!";
162 |     my $header = <$fh>;  # column names; unused because fields are taken by position
163 |     while (<$fh>) {
164 |         chomp;
165 |         my @x = split(/\t/);
166 | 
167 |         my $geneA = $x[28];  # gene_name1
168 |         my $geneB = $x[29];  # gene_name2
169 | 
170 |         my $chrA = $x[10];   # chromosome1
171 |         my $chrB = $x[11];   # chromosome2
172 | 
173 |         my $brkpt_A = $x[35];  # genomic_break_pos1
174 |         my $brkpt_B = $x[36];  # genomic_break_pos2
175 | 
176 |         my $spanning_count = $x[57];  # span_count
177 |         my $junction_count = $x[63];  # splitreads_count
178 | 
179 |         push (@fusions, {
180 |             geneA => $geneA,
181 |             chrA => $chrA,
182 |             coordA => $brkpt_A,
183 | 
184 |             geneB => $geneB,
185 |             chrB => $chrB,
186 |             coordB => $brkpt_B,
187 | 
188 |             span_reads => $spanning_count,
189 |             junc_reads => $junction_count,
190 |         });
191 |     }
192 | 
193 |     close $fh;
194 |     return(@fusions);
195 | }
196 | 
197 | 
198 | 1; #EOM
199 | 
200 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/PRADA_parser.pm:
--------------------------------------------------------------------------------
  1 | package PRADA_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | 
  8 | =PRADA_format
  9 | 
 10 | 0       Gene_A
 11 | 1       Gene_B
 12 | 2       A_chr
 13 | 3       B_chr
 14 | 4       A_strand
 15 | 5       B_strand
 16 | 6       Discordant_n     # span count
 17 | 7       JSR_n
 18 | 8       perfectJSR_n
 19 | 9       Junc_n
 20 | 10      Position_Consist
 21 | 11      Junction         # extract junction count
 22 | 12      Identity
 23 | 13      Align_Len
 24 | 14      Evalue
 25 | 15      BitScore 
 26 | 
 27 | 0       TRPC4AP
 28 | 1       MRPL45
 29 | 2       20
 30 | 3       17
 31 | 4       -1
 32 | 5       1
 33 | 6       6
 34 | 7       3
 35 | 8       3
 36 | 9       2
 37 | 10      PARTIALLY
 38 | 11      TRPC4AP:20:33665849_MRPL45:17:36478009,2|TRPC4AP:20:33665849_MRPL45:17:36476502,1
 39 | 12      100.00
 40 | 13      12
 41 | 14      0.68
 42 | 15      22.9
 43 | 
 44 | =cut
 45 | 
 46 | 
 47 | 
 48 | sub parse_fusion_result_file {
 49 |     my ($prada_file) = @_;
 50 | 
 51 |     # Read a PRADA fusion table and return one record per junction.
 52 |     #
 53 |     # The first line is read and discarded as a header.  The Junction column
 54 |     # (index 11) may list several '|'-separated junctions for a gene pair; each
 55 |     # becomes its own hashref.  Discordant_n (index 6) is a per-row value, so every
 56 |     # junction of a row receives the same span_reads.
 57 |     #
 58 |     # Returns a list of hashrefs (geneA, chrA, coordA, geneB, chrB, coordB, span_reads,
 59 |     # junc_reads).  Dies if the file cannot be opened or a junction entry is malformed.
 60 | 
 61 |     my @fusions;
 62 | 
 63 |     open (my $fh, $prada_file) or die "Error, cannot open file $prada_file";
 64 |     my $column_names = <$fh>;
 65 | 
 66 |     while (my $line = <$fh>) {
 67 |         chomp $line;
 68 |         my @cols = split(/\t/, $line);
 69 | 
 70 |         my ($span_count, $junction_field) = @cols[6, 11];
 71 | 
 72 |         # a single entry looks like: CPNE1:20:34243124_PI3:20:43804502,2
 73 |         foreach my $f_info (split(/\|/, $junction_field)) {
 74 | 
 75 |             my @parts = ($f_info =~ /^(\S+):([^\:]+):(\d+)_(\S+):([^\:]+):(\d+),(\d+)$/)
 76 |                 or die "Error, cannot parse $f_info";
 77 | 
 78 |             my ($geneA, $chrA, $coordA, $geneB, $chrB, $coordB, $junc_reads) = @parts;
 79 | 
 80 |             my %record = (
 81 |                 geneA => $geneA,
 82 |                 chrA => $chrA,
 83 |                 coordA => $coordA,
 84 | 
 85 |                 geneB => $geneB,
 86 |                 chrB => $chrB,
 87 |                 coordB => $coordB,
 88 | 
 89 |                 span_reads => $span_count,
 90 |                 junc_reads => $junc_reads,
 91 |             );
 92 | 
 93 |             push (@fusions, \%record);
 94 |         }
 95 |     }
 96 | 
 97 | 
 98 |     close $fh;
 99 | 
100 |     return(@fusions);
101 | }
102 | 
103 | 1; #EOM
104 | 
105 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/SOAPfuse_parser.pm:
--------------------------------------------------------------------------------
 1 | package SOAPfuse_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | sub parse_fusion_result_file {
 8 |     my ($soap_file) = @_;
 9 | 
10 | =soapfuse_format
11 | 
12 |         0       up_gene
13 |         1       up_chr
14 |         2       up_strand
15 |         3       up_Genome_pos
16 |         4       up_loc
17 |         5       dw_gene
18 |         6       dw_chr
19 |         7       dw_strand
20 |         8       dw_Genome_pos
21 |         9       dw_loc
22 |         10      Span_reads_num     # S
23 |         11      Junc_reads_num     # J
24 |         12      Fusion_Type
25 |         13      down_fusion_part_frame-shift_or_not
26 | 
27 |         0       SLMO2
28 |         1       chr20
29 |         2       -
30 |         3       57610027
31 |         4       M
32 |         5       ATP5E
33 |         6       chr20
34 |         7       -
35 |         8       57605484
36 |         9       E
37 |         10      3
38 |         11      4
39 |         12      INTRACHR-SS-OGO-0GAP
40 |         13      NA
41 | 
42 | 
43 | =cut
44 | 
45 |     # Parse a SOAPfuse result table (tab-delimited, first line = column names; layout
46 |     # documented in the pod above) into a list of hashrefs: geneA/chrA/coordA,
47 |     # geneB/chrB/coordB, span_reads, junc_reads.  Dies if the file cannot be opened.
48 |     my @fusions;
49 | 
50 |     open (my $fh, "<", $soap_file) or die "Error, cannot open file $soap_file: $!";
51 |     my $header = <$fh>;  # discard the column-name line
52 |     while (<$fh>) {
53 |         chomp;
54 |         my @x = split(/\t/);
55 | 
56 |         my $geneA = $x[0];
57 |         my $geneB = $x[5];
58 | 
59 |         # cut each gene name at the first occurrence of 'SOAPfuse', dropping the rest
60 |         $geneA =~ s/SOAPfuse.*//;
61 |         $geneB =~ s/SOAPfuse.*//;
62 | 
63 |         my $struct = {
64 |             geneA => $geneA,
65 |             chrA => $x[1],
66 |             coordA => $x[3],
67 | 
68 |             geneB => $geneB,
69 |             chrB => $x[6],
70 |             coordB => $x[8],
71 | 
72 |             span_reads => $x[10],
73 |             junc_reads => $x[11],
74 |         };
75 | 
76 |         push (@fusions, $struct);
77 |     }
78 | 
79 |     close $fh;
80 | 
81 |     return(@fusions);
82 | }
83 | 
84 | 
85 | 1; #EOM
86 | 
87 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/STARCHIP_parser.pm:
--------------------------------------------------------------------------------
 1 | package STARCHIP_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | 
 8 | =STARCHIP_format
 9 | 
10 | 0       Partner1
11 | 1       Partner2
12 | 2       SpanningReads
13 | 3       SplitReads
14 | 4       AvgAS
15 | 5       NearGene1
16 | 6       Distance1
17 | 7       NearGene2
18 | 8       Distance2
19 | 9       ConsensusSeq
20 | 
21 | 0       chr9:133729451:-
22 | 1       chr22:23632600:-
23 | 2       21
24 | 3       18
25 | 4       87.4
26 | 5       ABL1
27 | 6       0
28 | 7       BCR
29 | 8       0
30 | 9       gggctctatgggtttctgaatgtcatcgtccactcagccactggatttaagcagagttcaaaagcccttcagcggccagtagcatctgactttgagcctcagggtctgagtgaagccgctcg
31 | 
32 | =cut
33 | 
34 | 
35 | 
36 | sub parse_fusion_result_file {
37 |     my ($file) = @_;
38 | 
39 |     # Convert a STARChip fusion table into the common fusion-record list.
40 |     # The first line is discarded as a header, '#'-prefixed lines are skipped, and
41 |     # Partner1/Partner2 (chr:coord:strand) are split to obtain chromosome and position.
42 |     # Returns a list of hashrefs; dies if the file cannot be opened.
43 | 
44 |     my @fusions;
45 | 
46 |     open (my $fh, $file) or die "Error, cannot open file $file";
47 |     my $first_line = <$fh>;
48 | 
49 |     while (<$fh>) {
50 |         next if /^\#/;
51 |         chomp;
52 | 
53 |         my @fields = split(/\t/, $_, 11);
54 |         my ($partner_A, $partner_B, $span_count, $junc_count) = @fields[0 .. 3];
55 |         my ($geneA, $geneB) = @fields[5, 7];   # NearGene1, NearGene2
56 | 
57 |         # only chromosome and coordinate are needed; the strand token is ignored
58 |         my ($chrA, $coordA) = split(/:/, $partner_A);
59 |         my ($chrB, $coordB) = split(/:/, $partner_B);
60 | 
61 |         push (@fusions, {
62 |             geneA => $geneA,
63 |             chrA => $chrA,
64 |             coordA => $coordA,
65 | 
66 |             geneB => $geneB,
67 |             chrB => $chrB,
68 |             coordB => $coordB,
69 | 
70 |             span_reads => $span_count,
71 |             junc_reads => $junc_count,
72 |         });
73 |     }
74 | 
75 |     close $fh;
76 | 
77 |     return(@fusions);
78 | }
79 | 
80 | 
81 | 1; #EOM
82 | 
83 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/STARFusion_parser.pm:
--------------------------------------------------------------------------------
  1 | package STARFusion_parser;
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | 
  7 | use FindBin;
  8 | use DelimParser;
  9 | 
 10 | =STARFusion_format
 11 | 
 12 | 0       #FusionName
 13 | 1       JunctionReadCount
 14 | 2       SpanningFragCount
 15 | 3       SpliceType
 16 | 4       LeftGene
 17 | 5       LeftBreakpoint
 18 | 6       RightGene
 19 | 7       RightBreakpoint
 20 | 8       LargeAnchorSupport
 21 | 9       LeftBreakDinuc
 22 | 10      LeftBreakEntropy
 23 | 11      RightBreakDinuc
 24 | 12      RightBreakEntropy
 25 | 
 26 | 0       THRA--AC090627.1
 27 | 1       76
 28 | 2       104
 29 | 3       ONLY_REF_SPLICE
 30 | 4       THRA^ENSG00000126351.8
 31 | 5       chr17:38243106:+
 32 | 6       AC090627.1^ENSG00000235300.3
 33 | 7       chr17:46371709:+
 34 | 8       YES_LDAS
 35 | 9       GT
 36 | 10      1.8892
 37 | 11      AG
 38 | 12      1.9656
 39 | 
 40 | 
 41 | =cut
 42 | 
 43 | 
 44 | 
 45 | sub parse_fusion_result_file {
 46 |     my ($starFusion_file) = @_;
 47 | 
 48 |     # Parse STAR-Fusion predictions (gunzipped on the fly when the name ends in .gz)
 49 |     # into a list of hashrefs: geneA/chrA/coordA, geneB/chrB/coordB, span_reads,
 50 |     # junc_reads.  Rows whose left and right gene symbols are identical are dropped.
 51 |     # Dies (plain file) or confesses (gunzip pipe) if the input cannot be opened.
 52 | 
 53 |     my @fusions;
 54 | 
 55 |     my $fh;
 56 |     if ($starFusion_file =~ /\.gz$/) {
 57 |         # List-form pipe open: gunzip receives the path as a plain argument (no shell),
 58 |         # and failure is caught here; a later test of $fh is not a reliable check.
 59 |         open ($fh, "-|", "gunzip", "-c", $starFusion_file)
 60 |             or confess "Error, no filehandle opened on $starFusion_file: $!";
 61 |     }
 62 |     else {
 63 |         open ($fh, "<", $starFusion_file) or die "Error, cannot open file $starFusion_file: $!";
 64 |     }
 65 | 
 66 |     my $tab_reader = DelimParser::Reader->new($fh, "\t");
 67 | 
 68 |     while (my $row = $tab_reader->get_row()) {
 69 | 
 70 |         # Columns are addressed by header name.  $fusion and $splice_type are unused
 71 |         # below; the lookups stay in case get_row_val() checks that the column exists
 72 |         # (NOTE(review): DelimParser is not visible from here -- confirm).
 73 |         my $fusion = $tab_reader->get_row_val($row, "#FusionName");
 74 |         my $junction_reads = $tab_reader->get_row_val($row, "JunctionReadCount");
 75 |         my $spanning_reads = $tab_reader->get_row_val($row, "SpanningFragCount");
 76 |         my $splice_type = $tab_reader->get_row_val($row, "SpliceType");
 77 |         my $fusion_gene_A = $tab_reader->get_row_val($row, "LeftGene");
 78 |         my $chr_coords_A = $tab_reader->get_row_val($row, "LeftBreakpoint");
 79 |         my $fusion_gene_B = $tab_reader->get_row_val($row, "RightGene");
 80 |         my $chr_coords_B = $tab_reader->get_row_val($row, "RightBreakpoint");
 81 | 
 82 |         # gene fields look like SYMBOL^ENSG...; keep only the part before '^'
 83 |         ($fusion_gene_A) = split(/\^/, $fusion_gene_A);
 84 |         ($fusion_gene_B) = split(/\^/, $fusion_gene_B);
 85 | 
 86 |         if ($fusion_gene_A eq $fusion_gene_B) { next; } # no self-fusions
 87 | 
 88 |         my ($chrA, $coordA, $orientA) = split(/:/, $chr_coords_A);
 89 |         my ($chrB, $coordB, $orientB) = split(/:/, $chr_coords_B);
 90 | 
 91 |         push (@fusions, {
 92 |             geneA => $fusion_gene_A,
 93 |             chrA => $chrA || ".",
 94 |             coordA => $coordA || ".",
 95 | 
 96 |             geneB => $fusion_gene_B,
 97 |             chrB => $chrB || ".",
 98 |             coordB => $coordB || ".",
 99 | 
100 |             span_reads => $spanning_reads,
101 |             junc_reads => $junction_reads,
102 |         });
103 |     }
104 | 
105 |     close $fh;
106 | 
107 |     return(@fusions);
108 | }
109 | 
110 | 
111 | 1; #EOM
112 | 
113 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/TopHatFusion_parser.pm:
--------------------------------------------------------------------------------
 1 | package TopHatFusion_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | sub parse_fusion_result_file {
 8 |     my ($tophatfusion_file) = @_;
 9 | 
10 | =tophatfusion_format
11 | 
12 | from: http://tophat.cbcb.umd.edu/data/result.html#detail
13 | 
14 | 1. Sample name in which a fusion is identified 
15 | 2. Gene on the "left" side of the fusion 
16 | 3. Chromosome ID on the left 
17 | 4. Coordinates on the left 
18 | 5. Gene on the "right" side 
19 | 6. Chromosome ID on the right 
20 | 7. Coordinates on the right 
21 | 8. Number of spanning reads 
22 | 9. Number of spanning mate pairs 
23 | 10. Number of spanning mate pairs where one end spans a fusion 
24 | If you follow the 9th column, it shows coordinates "number1:number2" where one end is located at a distance of "number1" bases from the left genomic coordinate of a fusion and "number2" is similarly defined
25 | 
26 | 0       sample_1
27 | 1       PNRC2
28 | 2       chr1
29 | 3       24289902
30 | 4       DGKD
31 | 5       chr2
32 | 6       234263228
33 | 7       147  # J
34 | 8       134  # S
35 | 9       102
36 | 10      953.07
37 | 
38 | =cut
39 | 
40 |     ;
41 | 
42 |     # Every input line (no header is skipped) yields one hashref; columns are taken
43 |     # by position as in the example above (7 = junction count, 8 = spanning count).
44 |     my @fusions;
45 | 
46 |     open (my $fh, $tophatfusion_file) or die "Error, cannot open file $tophatfusion_file";
47 |     while (my $row = <$fh>) {
48 |         chomp $row;
49 |         my @x = split(/\t/, $row);
50 | 
51 |         my ($left_gene, $left_chr, $left_pos) = @x[1 .. 3];
52 |         my ($right_gene, $right_chr, $right_pos) = @x[4 .. 6];
53 |         my ($junction_count, $spanning_count) = @x[7, 8];
54 | 
55 |         push (@fusions, {
56 |             geneA => $left_gene,
57 |             chrA => $left_chr,
58 |             coordA => $left_pos,
59 | 
60 |             geneB => $right_gene,
61 |             chrB => $right_chr,
62 |             coordB => $right_pos,
63 | 
64 |             span_reads => $spanning_count,
65 |             junc_reads => $junction_count,
66 |         });
67 |     }
68 | 
69 |     close $fh;
70 |     return(@fusions);
71 | }
72 | 
73 | 
74 | 1; #EOM
75 | 
76 | 


--------------------------------------------------------------------------------
/benchmarking/FusionProgParsers/TrinityFusion_parser.pm:
--------------------------------------------------------------------------------
 1 | package TrinityFusion_parser;
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | 
 8 | =TrinityFusion_format
 9 | 
10 | 0       #FusionName
11 | 1       JunctionReadCount
12 | 2       SpanningFragCount
13 | 3       trans_acc
14 | 4       trans_brkpt
15 | 5       LeftGene
16 | 6       LeftBreakpoint
17 | 7       RightGene
18 | 8       RightBreakpoint
19 | 9       SpliceType
20 | 10      annots
21 | 
22 | 0       CWC22--AC104532.2
23 | 1       3105
24 | 2       9676
25 | 3       TRINITY_DN196_c0_g1_i1
26 | 4       284-283
27 | 5       CWC22
28 | 6       chr2:180835228
29 | 7       AC104532.2
30 | 8       chr19:5914396
31 | 9       ONLY_REF_SPLICE
32 | 10      ["INTERCHROMOSOMAL[chr2--chr19]"]
33 | 
34 | =cut
35 | 
36 | 
37 | sub parse_fusion_result_file {
38 |     my ($file) = @_;
39 | 
40 |     # Parse TrinityFusion predictions into the common fusion-record list.
41 |     # Skipped rows: '#' comment lines, self-fusions (LeftGene eq RightGene) and anything
42 |     # whose SpliceType is not ONLY_REF_SPLICE.  Empty/false chr or coord values are
43 |     # reported as ".".  Returns a list of hashrefs; dies if the file cannot be opened.
44 | 
45 |     my @fusions;
46 | 
47 |     open (my $fh, $file) or die "Error, cannot open file $file";
48 |     while (<$fh>) {
49 |         next if /^\#/;
50 | 
51 |         chomp;
52 |         my @x = split("\t");
53 | 
54 |         my ($junction_reads, $spanning_reads) = @x[1, 2];
55 |         my ($fusion_gene_A, $chr_coords_A, $fusion_gene_B, $chr_coords_B, $splice_type) = @x[5 .. 9];
56 | 
57 |         next if ($fusion_gene_A eq $fusion_gene_B);  # no self-fusions
58 | 
59 |         # otherwise, too many assembly artifacts
60 |         next unless ($splice_type eq "ONLY_REF_SPLICE");
61 | 
62 |         # breakpoints read chr:coord; anything after a second ':' is ignored
63 |         my ($chrA, $coordA) = split(/:/, $chr_coords_A);
64 |         my ($chrB, $coordB) = split(/:/, $chr_coords_B);
65 | 
66 |         my $struct = {
67 |             geneA => $fusion_gene_A,
68 |             chrA => $chrA || ".",
69 |             coordA => $coordA || ".",
70 | 
71 |             geneB => $fusion_gene_B,
72 |             chrB => $chrB || ".",
73 |             coordB => $coordB || ".",
74 | 
75 |             span_reads => $spanning_reads,
76 |             junc_reads => $junction_reads,
77 |         };
78 | 
79 |         push (@fusions, $struct);
80 | 
81 |     }
82 | 
83 |     close $fh;
84 | 
85 |     return(@fusions);
86 | }
87 | 
88 | 
89 | 1; #EOM
90 | 
91 | 


--------------------------------------------------------------------------------
/benchmarking/aggregate_peak_F1_stats.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | library(ggplot2)
 4 | library(dplyr)
 5 | library(tidyr)
 6 | 
 7 | # Gather the per-setting peak-F1 tables (one per "min progs agree" value) into a
 8 | # single table, write it out, and draw a PPV-vs-TPR scatter faceted by that value.
 9 | args <- commandArgs(TRUE)
10 | if (length(args) != 3) {
11 |     stop("require param: maxF1_file_suffix low_val high_val")  # example: okPara_ignoreUnsure.results.scored.ROC.tpr_ppv_at_maxF1.dat 3 13
12 | }
13 | 
14 | file_suffix <- args[1]
15 | low_val <- as.numeric(args[2])
16 | high_val <- as.numeric(args[3])
17 | 
18 | # one data frame per setting, read from __min_<N>_agree/min_<N>.<file_suffix>
19 | min_progs_range <- seq(low_val, high_val)
20 | dfs_list <- lapply(min_progs_range, function(minprogs) {
21 |     dat_file <- sprintf("__min_%d_agree/min_%d.%s", minprogs, minprogs, file_suffix)
22 |     data <- read.table(dat_file)
23 |     data$min_progs_agree <- minprogs
24 |     data
25 | })
26 | names(dfs_list) <- as.character(min_progs_range)
27 | 
28 | all_data <- do.call(rbind, dfs_list)
29 | 
30 | write.table(all_data, file=sprintf("%s.consolidated.dat", file_suffix), quote=F, sep="\t")
31 | 
32 | pdf(sprintf("%s.consolidated.scatters.pdf", file_suffix), height=5, width=11)
33 | 
34 | p <- ggplot(all_data, aes(x=PPV, y=TPR, color=prog, shape=prog)) +
35 |     geom_point() + scale_shape_manual(values=rep(seq(0,25), 2)) + facet_wrap(~min_progs_agree)
36 | plot(p)
37 | 
38 | dev.off()
39 | 


--------------------------------------------------------------------------------
/benchmarking/all_TP_FP_FN_to_ROC.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | 
  6 | 
  7 | my $usage = "\n\n\tusage: $0 summary.TP_FP_FN\n\n";
  8 | 
  9 | my $tp_fp_fn_file = $ARGV[0] or die $usage;
 10 | 
 11 | # Entry point: read the TP/FP/FN summary and print one ROC table per program.
 12 | main: {
 13 | 
 14 |     my %data = &parse_file($tp_fp_fn_file);
 15 | 
 16 |     print join("\t", "prog", "min_sum_frags", "TP", "FP", "FN", "TPR", "PPV", "F1") . "\n";
 17 | 
 18 |     # programs are visited in sorted order so the report's row order is the same on every run
 19 |     foreach my $prog (sort keys %data) {
 20 |         &make_ROC($prog, $data{$prog});
 21 |     }
 22 | 
 23 |     exit(0);
 24 | }
 25 | 
 26 | 
 27 | ####
 28 | sub make_ROC {
 29 |     my ($prog_name, $progdata_href) = @_;
 30 | 
 31 |     # Print one tab-delimited row per evidence threshold for a single program:
 32 |     #     prog, threshold, TP, FP, FN, TPR, PPV, F1
 33 |     # $progdata_href maps 'TP'/'FP'/'FN' to array refs of per-fusion values; a
 34 |     # prediction counts at a threshold when its value is >= that threshold.
 35 |     # NOTE(review): assumes at least one TP or FN exists (TPR divides by their total).
 36 | 
 37 |     my %by_class = %$progdata_href;
 38 | 
 39 |     my @TP_fusions = @{ $by_class{TP} || [] };
 40 |     my @FP_fusions = @{ $by_class{FP} || [] };
 41 |     my @FN_fusions = @{ $by_class{FN} || [] };
 42 | 
 43 |     my $num_truth_fusions = scalar(@TP_fusions) + scalar(@FN_fusions);
 44 |     my $num_total_FP = scalar(@FP_fusions);
 45 | 
 46 |     my @uniq_vals = sort {$a<=>$b} &get_unique(@TP_fusions, @FP_fusions);
 47 | 
 48 |     # NOTE(review): the largest unique value is never used as a threshold -- confirm intended
 49 |     foreach my $min_val (@uniq_vals[0 .. $#uniq_vals - 1]) {
 50 | 
 51 |         # thresholds ascend, so each pass only has to prune the previous survivors
 52 |         @TP_fusions = grep { $_ >= $min_val } @TP_fusions;
 53 |         @FP_fusions = grep { $_ >= $min_val } @FP_fusions;
 54 | 
 55 |         my $num_TP = scalar(@TP_fusions);
 56 |         my $num_FP = scalar(@FP_fusions);
 57 |         my $num_FN = $num_truth_fusions - $num_TP;
 58 | 
 59 |         my $TPR = sprintf("%.2f", $num_TP / $num_truth_fusions); # True Positive Rate
 60 |         my $FDR = sprintf("%.2f", $num_FP / ($num_FP + $num_TP)); # False Discovery Rate
 61 |         my $PPV = 1 - $FDR; # Positive Predictive Value
 62 | 
 63 |         # F1 combines TPR (sensitivity) with PPV (standing in for specificity);
 64 |         # the eval leaves it as "NA" when TPR + PPV is zero
 65 |         my $F1 = "NA";
 66 |         eval {
 67 |             $F1 = sprintf("%.3f", 2 * $TPR * $PPV / ($TPR + $PPV) );
 68 |         };
 69 | 
 70 |         print join("\t", $prog_name, $min_val, $num_TP, $num_FP, $num_FN, $TPR, $PPV, $F1) . "\n";
 71 |     }
 72 | 
 73 |     return;
 74 | }
 75 | 
 76 | 
 77 | ####
 78 | sub parse_file {
 79 |     my ($fusions_file) = @_;
 80 | 
 81 |     # Load the scored-predictions table into
 82 |     #     $data{progname}->{TP|FP|FN} = [ J + S for each fusion, ... ]
 83 |     # Rows of any other prediction class are ignored.  Dies on an unexpected header,
 84 |     # a row without exactly 10 fields, or a repeated prog/sample/fusion combination.
 85 | 
 86 |     my (%data, %seen);
 87 | 
 88 |     open (my $fh, $fusions_file) or die $!;
 89 | 
 90 |     my $header = <$fh>;
 91 |     if ($header !~ /^pred_result/) {
 92 |         die "Error, not reading expected header format for $fusions_file";
 93 |     }
 94 | 
 95 |     while (<$fh>) {
 96 |         chomp;
 97 |         my @x = split(/\t/);
 98 |         if (scalar(@x) != 10) {
 99 |             die "Error, did not parse 10 fields from row: $_";
100 |         }
101 |         my ($pred_type, $sample_name, $progname, $fusion, $J, $S,
102 |             $mapped_gencode_A, $mapped_gencode_B, $explanation, $selected_fusion) = @x;
103 | 
104 |         next unless ($pred_type =~ /^(TP|FP|FN)$/);
105 | 
106 |         # uniqueness is judged on the selected fusion name whenever one is given
107 |         my $fusion_name = ($selected_fusion ne '.') ? $selected_fusion : $fusion;
108 |         my $fusion_token = join("::", $progname, $sample_name, $fusion_name);
109 | 
110 |         if ($seen{$fusion_token}) {
111 |             die "Error, already processed fusion [$fusion_token], and these should be unique entries in this file $fusions_file";
112 |         }
113 |         $seen{$fusion_token} = 1;
114 | 
115 |         push (@{$data{$progname}->{$pred_type}}, $J + $S);
116 |     }
117 |     close $fh;
118 | 
119 |     return(%data);
120 | 
121 | }
122 | 
123 | ####
124 | sub get_unique {
125 |     my (@vals) = @_;
126 |     # distinct members of @vals, returned in no particular order
127 |     my %distinct;
128 |     $distinct{$_} = 1 foreach @vals;
129 |     return(keys %distinct);
130 | }
131 | 


--------------------------------------------------------------------------------
/benchmarking/all_TP_FP_FN_to_ROC.vary_minF_minS.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | 
  6 | 
  7 | my $usage = "\n\n\tusage: $0 summary.TP_FP_FN\n\n";
  8 | 
  9 | my $tp_fp_fn_file = $ARGV[0] or die $usage;
 10 | 
 11 | # Entry point: print a ROC table per program for every (min_J, min_S) pair in 0..5.
 12 | main: {
 13 | 
 14 |     my %data = &parse_file($tp_fp_fn_file);
 15 | 
 16 |     print join("\t", "prog", "min_J", "min_S", "min_sum_frags", "TP", "FP", "FN", "TPR", "PPV", "F1") . "\n";
 17 | 
 18 |     # sorted so that the report's row order is identical from run to run
 19 |     my @progs = sort keys %data;
 20 | 
 21 |     for (my $min_J = 0; $min_J <= 5; $min_J++) {
 22 |         for (my $min_S = 0; $min_S <= 5; $min_S++) {
 23 |             foreach my $prog (@progs) {
 24 |                 &make_ROC($prog, $data{$prog}, $min_J, $min_S);
 25 |             }
 26 |         }
 27 |     }
 28 | 
 29 |     exit(0);
 30 | }
 31 | 
 32 | use Data::Dumper;
 33 | 
 34 | ####
 35 | sub make_ROC {
 36 |     my ($prog_name, $progdata_href, $min_J, $min_S) = @_;
 37 | 
 38 |     # For one program and one (min_J, min_S) setting, print a tab-delimited row per
 39 |     # min-sum threshold: prog, min_J, min_S, threshold, TP, FP, FN, TPR, PPV, F1.
 40 |     # $progdata_href maps 'TP'/'FP'/'FN' to array refs of { sum, J, S } hashrefs.
 41 |     # NOTE(review): assumes the program has at least one TP or FN (TPR denominator).
 42 | 
 43 |     my %data = %$progdata_href;
 44 | 
 45 |     my @TP_fusions = ($data{TP}) ? @{$data{TP}} : ();
 46 |     my @FP_fusions = ($data{FP}) ? @{$data{FP}} : ();
 47 |     my @FN_fusions = ($data{FN}) ? @{$data{FN}} : ();
 48 | 
 49 |     my $num_truth_fusions = scalar(@TP_fusions) + scalar(@FN_fusions);
 50 | 
 51 |     # candidate thresholds are collected before the J/S filters are applied
 52 |     my @uniq_vals = sort {$a<=>$b} &get_unique(@TP_fusions, @FP_fusions);
 53 |     @uniq_vals = grep { $_ >= $min_J + $min_S } @uniq_vals;
 54 | 
 55 |     @TP_fusions = grep { $_->{J} >= $min_J && $_->{S} >= $min_S} @TP_fusions;
 56 |     @FP_fusions = grep { $_->{J} >= $min_J && $_->{S} >= $min_S} @FP_fusions;
 57 | 
 58 |     # NOTE(review): the largest threshold is never visited ($i stops short of it) -- confirm intended
 59 |     for (my $i=0; $i < $#uniq_vals; $i++) {
 60 |         my $min_val = $uniq_vals[$i];
 61 | 
 62 |         @TP_fusions = grep { $_->{sum} >= $min_val  } @TP_fusions;
 63 |         @FP_fusions = grep { $_->{sum} >= $min_val } @FP_fusions;
 64 | 
 65 |         my $num_TP = scalar(@TP_fusions);
 66 |         my $num_FP = scalar(@FP_fusions);
 67 | 
 68 |         # The J/S filters can leave no prediction at or above a threshold; FDR would then
 69 |         # be 0/0, and no higher threshold can recover any, so stop reporting here.
 70 |         last unless ($num_TP + $num_FP);
 71 | 
 72 |         my $num_FN = $num_truth_fusions - $num_TP;
 73 |         my $TPR = sprintf("%.2f", $num_TP / $num_truth_fusions); # True Positive Rate
 74 |         my $FDR = sprintf("%.2f", $num_FP / ($num_FP + $num_TP)); # False Discovery Rate
 75 |         my $PPV = 1 - $FDR; # Positive Predictive Value
 76 | 
 77 |         my $F1 = "NA";  # stays "NA" when TPR + PPV is zero (division caught by eval)
 78 |         eval {
 79 |             $F1 = sprintf("%.3f", 2 * $TPR * $PPV / ($TPR + $PPV) );
 80 |         };
 81 | 
 82 |         print join("\t", $prog_name, $min_J, $min_S, $min_val, $num_TP, $num_FP, $num_FN, $TPR, $PPV, $F1) . "\n";
 83 |     }
 84 | 
 85 |     return;
 86 | }
 87 | 
 88 | 
 89 | ####
 90 | sub parse_file {
 91 |     my ($fusions_file) = @_;
 92 | 
 93 |     # Load the scored-predictions table into
 94 |     #     $data{progname}->{TP|FP|FN} = [ { sum => J + S, J => J, S => S }, ... ]
 95 |     # Rows of any other prediction class are ignored.  Dies on an unexpected header,
 96 |     # a row without exactly 10 fields, or a repeated prog/sample/fusion combination.
 97 | 
 98 |     my (%data, %seen);
 99 | 
100 |     open (my $fh, $fusions_file) or die $!;
101 | 
102 |     my $header = <$fh>;
103 |     if ($header !~ /^pred_result/) {
104 |         die "Error, not reading expected header format for $fusions_file";
105 |     }
106 | 
107 |     while (<$fh>) {
108 |         chomp;
109 |         my @x = split(/\t/);
110 |         if (scalar(@x) != 10) {
111 |             die "Error, did not parse 10 fields from row: $_";
112 |         }
113 |         my ($pred_type, $sample_name, $progname, $fusion, $J, $S,
114 |             $mapped_gencode_A, $mapped_gencode_B, $explanation, $selected_fusion) = @x;
115 | 
116 |         next unless ($pred_type =~ /^(TP|FP|FN)$/);
117 | 
118 |         # uniqueness is judged on the selected fusion name whenever one is given
119 |         my $fusion_name = ($selected_fusion ne '.') ? $selected_fusion : $fusion;
120 |         my $fusion_token = join("::", $progname, $sample_name, $fusion_name);
121 | 
122 |         if ($seen{$fusion_token}) {
123 |             die "Error, already processed fusion [$fusion_token], and these should be unique entries in this file $fusions_file";
124 |         }
125 |         $seen{$fusion_token} = 1;
126 | 
127 |         push (@{$data{$progname}->{$pred_type}}, { sum => $J + $S,
128 |                                                    J => $J,
129 |                                                    S => $S } );
130 |     }
131 |     close $fh;
132 | 
133 |     return(%data);
134 | }
135 | 
136 | ####
137 | sub get_unique {
138 |     my (@vals) = @_;
139 |     # distinct 'sum' values across the given fusion records, in no particular order
140 |     my %distinct;
141 |     $distinct{ $_->{sum} } = 1 foreach @vals;
142 |     return(keys %distinct);
143 | }
144 | 


--------------------------------------------------------------------------------
/benchmarking/all_TP_FP_FN_to_ROC.vary_minF_minS.plot.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | options(stringsAsFactors = FALSE)
 4 | 
 5 | suppressPackageStartupMessages(library("argparse"))
 6 | 
 7 | parser = ArgumentParser()
 8 | 
 9 | parser$add_argument("--datafile", help="input data file", required=TRUE, nargs=1)
10 | parser$add_argument("--progname", help="fusion prog name", required=TRUE, nargs=1)
11 | args = parser$parse_args()
12 | 
13 | 
14 | library(tidyverse)
15 | 
16 | pdf(paste0(args$progname, ".plot.pdf"))
17 | data = read.table(args$datafile, header=T)
18 | 
19 | data = data %>% filter( min_sum_frags <= 10 & min_J <= 1 & min_S <= 1)
20 | 
21 | data = data %>% unite(col='JS', min_J, min_S, sep=',')
22 | 
23 | data = data %>% filter(prog==args$progname & JS != "1,1")
24 | 
25 | data %>% ggplot(aes(x=min_sum_frags, y=F1, color=JS)) + geom_point() + geom_line()
26 | 
27 | message("see plot file: ", paste0(args$progname, ".plot.pdf"))
28 | 
29 | quit(save = "no", status = 0, runLast = FALSE)
30 | 


--------------------------------------------------------------------------------
/benchmarking/calc_PR.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys
  4 | import math
  5 | import argparse
  6 | 
  7 | # contributed by Bo Li, mod by bhaas
  8 | 
  9 | ntruth = 0
 10 | 
 11 | 
 12 | def main():
 13 | 
 14 |     parser = argparse.ArgumentParser(description="computes Precision-Recall Curve and AUC values", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 15 |     
 16 |     parser.add_argument("--in_ROC", dest="in_ROC_file", type=str, default="", required=True, help="input ROC file")
 17 | 
 18 |     parser.add_argument("--out_PR", dest="out_PR_file", type=str, default="", required=True, help="output PR file")
 19 | 
 20 |     parser.add_argument("--min_read_support", dest="min_read_support", type=int, default=0, help="minimum read support for including data point in AUC computation")
 21 | 
 22 |     args = parser.parse_args()
 23 | 
 24 |     
 25 |     ntotal = 25000**2  # all possible gene pairs, rough approx.
 26 |     prog = ""
 27 |     ltp = lfp = 0
 28 |     auc = 0.0
 29 | 
 30 |     with open(args.in_ROC_file) as fin, open(args.out_PR_file, "w") as fout:
 31 |         # write header
 32 |         fout.write("{}\t{}\t{}\t{}\n".format('prog', 'recall', 'precision', 'actual'))
 33 |         next(fin) # skip header line
 34 |         for line in fin:
 35 |             fields = line.strip().split()
 36 | 
 37 |             min_frags = int(fields[1])
 38 |             if (min_frags < args.min_read_support):
 39 |                 continue
 40 |             
 41 |             tp = int(fields[2])
 42 |             fp = int(fields[3])
 43 |             fn = int(fields[4])
 44 | 
 45 |             global ntruth
 46 |             ntruth = tp + fn
 47 | 
 48 |             if prog != fields[0]:
 49 |                 # prog switch
 50 |                 if prog != "":
 51 |                     # process last line of prev prog and report
 52 |                     auc += output(fout, prog, 0, 0, ltp, lfp)
 53 |                     print("{}\t{:.2f}".format(prog, auc))
 54 |                 # first line of next prog, reinit vals
 55 |                 prog = fields[0]
 56 |                 ltp = ntruth
 57 |                 lfp = ntotal - ntruth
 58 |                 auc = output(fout, prog, ltp, lfp)
 59 | 
 60 |             # add to auc
 61 |             auc += output(fout, prog, tp, fp, ltp, lfp)
 62 |             ltp = tp
 63 |             lfp = fp
 64 | 
 65 |         if prog != "":
 66 |             # last line of file, process last prog results
 67 |             auc += output(fout, prog, 0, 0, ltp, lfp)
 68 |             print("{}\t{:.2f}".format(prog, auc))
 69 | 
 70 | 
 71 |     sys.exit(0)
 72 | 
 73 | 
 74 | 
 75 | 
 76 | def output(fout, prog, ntp, nfp, nltp = -1, nlfp = -1):
 77 |     """ return delta auc """
 78 | 
 79 |     dauc = 0.0
 80 |     if nltp < 0:
 81 |         recall = 1.0
 82 |         precision = ntp * 1.0 / (ntp + nfp)
 83 |         fout.write("{}\t{}\t{}\t0\n".format(prog, recall, precision))
 84 |     elif ntp == 0 and nfp == 0:
 85 |         assert nltp >= 0 and nlfp >= 0 and nltp + nlfp > 0
 86 |         lrecall = nltp * 1.0 / ntruth
 87 |         lprecision = nltp * 1.0 / (nltp + nlfp)
 88 |         recall = 0.0
 89 |         precision = lprecision
 90 |         if lrecall > 0.0:
 91 |             fout.write("{}\t{}\t{}\t0\n".format(prog, recall, precision))
 92 |             #dauc = 0.5 * lrecall * lprecision
 93 |             dauc = lrecall * lprecision
 94 |     else:
 95 |         recall = ntp * 1.0 / ntruth
 96 |         precision = ntp * 1.0 / (ntp + nfp)
 97 | 
 98 |         if nltp > ntp:
 99 |             lrecall = nltp * 1.0 / ntruth
100 |             lprecision = nltp * 1.0 / (nltp + nlfp)
101 |             
102 |             rate = (nlfp - nfp) * 1.0 / (nltp - ntp)
103 |             trecall = lrecall - 0.01
104 |             x = nltp - ntp - 0.01 * ntruth
105 |             tlrecall = lrecall
106 |             tlprecision = lprecision
107 |             while trecall > recall:
108 |                 trecall = (ntp + x) * 1.0 / ntruth
109 |                 tprecision = (ntp + x) * 1.0 / (ntp + x + nfp + rate * x)
110 |                 fout.write("{}\t{}\t{}\t0\n".format(prog, trecall, tprecision))
111 |                 dauc += 0.5 * (tlprecision + tprecision) * (tlrecall - trecall)
112 | 
113 |                 tlrecall = trecall 
114 |                 tlprecision = tprecision
115 |                 trecall -= 0.01
116 |                 x -= 0.01 * ntruth
117 | 
118 |             dauc += 0.5 * (tlprecision + precision) * (tlrecall - recall)    
119 | 
120 |         fout.write("{}\t{}\t{}\t1\n".format(prog, recall, precision))
121 |     
122 |     return dauc
123 | 
124 | 
125 | 
126 | 
127 | if __name__ == "__main__":
128 |     main()
129 | 


--------------------------------------------------------------------------------
/benchmarking/collect_preds.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | 
  6 | use FindBin;
  7 | use lib ("$FindBin::Bin/../PerlLib");
  8 | 
  9 | 
 10 | my $usage = "usage: $0 fusion_result_file_listing.dat\n\n";
 11 | 
 12 | my $fusion_result_file_listing = $ARGV[0] or die $usage;
 13 | 
 14 | my $fusion_prog_parser_lib_dir = "$FindBin::Bin/FusionProgParsers";
 15 | 
 16 | my %prog_type_to_file_parser = ( 
 17 |     'ChimPipe' => 'ChimPipe_parser',
 18 |     'ChimeraScan' => 'ChimeraScan_parser',
 19 |     'deFuse' => 'DEFUSE_parser',
 20 |     'EricScript' => 'EricScript_parser',
 21 | 
 22 |     'Fusion.*Catcher' => 'FusionCatcher_parser',
 23 |     'FC_V0997c' => 'FusionCatcher_parser',
 24 |     
 25 |     'FusionHunter' => 'FusionHunter_parser',
 26 | 
 27 |     'FusionInspector' => 'FusionInspector_parser',
 28 |     'FI-*' => 'FusionInspector_parser',
 29 |     
 30 |     'InFusion' => 'InFusion_parser', 
 31 | 
 32 |     'JAFFA-Assembly' => 'JAFFA_parser',
 33 |     'JAFFA-Direct' => 'JAFFA_parser',
 34 |     'JAFFA-Hybrid' => 'JAFFA_parser',
 35 | 
 36 |     'MapSplice' => 'MapSplice_parser',
 37 | 
 38 |     'nFuse' => 'NFuse_parser',
 39 | 
 40 |     'PRADA' => 'PRADA_parser',
 41 | 
 42 |     'SOAP-fuse' => 'SOAPfuse_parser',
 43 | 
 44 |     'STAR_FUSION' => 'STARFusion_parser',
 45 |     'starfusion' => 'STARFusion_parser', 
 46 |     'STARF' => 'STARFusion_parser', 
 47 |     
 48 |     'TopHat-Fusion' => 'TopHatFusion_parser',
 49 | 
 50 |     'PIZZLY' => 'PIZZLY_parser',
 51 | 
 52 |     'ARRIBA' => 'ARRIBA_parser',
 53 |     'ARRIBA_hc' => 'ARRIBA_hc_parser',
 54 |     
 55 |     'STARCHIP' => 'STARCHIP_parser',
 56 |     'STARChip_csm10' => 'STARCHIP_parser',
 57 |     'STARCHIP_csm10' => 'STARCHIP_parser',
 58 |     'STARCHIP_csm10_pG_Apr302019' => 'STARCHIP_parser',
 59 |     'STARCHIP_csm10_pGm2_May012019' => 'STARCHIP_parser',
 60 |     
 61 |     #'TrinityFusion' => 'TrinityFusion_parser',
 62 |     #'TrinityFusion-D' => 'TrinityFusion_parser',
 63 |     #'TrinityFusion-C' => 'TrinityFusion_parser',
 64 |     #'TrinityFusion-UC' => 'TrinityFusion_parser',
 65 |     'TRINITY.*FUSION' => 'TrinityFusion_parser',
 66 |     
 67 |     'STARSEQR' => 'STARSEQR_parser'
 68 |     
 69 |     );
 70 | 
 71 | 
 72 | 
 73 | foreach my $module (values %prog_type_to_file_parser) {
 74 |     my $module_path = "$fusion_prog_parser_lib_dir/$module.pm";
 75 | 
 76 |     require($module_path);
 77 | 
 78 | }
 79 | 
 80 | 
 81 | main: {
 82 | 
 83 | 
 84 |     # print header
 85 |     print join("\t", "sample", "prog", "fusion", "J", "S") . "\n";
 86 |     
 87 |     open(my $fh, $fusion_result_file_listing) or die "Error, cannot open file $fusion_result_file_listing";
 88 |     while (<$fh>) {
 89 |         chomp;
 90 |         my ($sample_name, $prog_name, $result_file) = split(/\t/);
 91 | 
 92 | 
 93 | 
 94 |         my $parser_module;
 95 | 
 96 |         if (exists $prog_type_to_file_parser{$prog_name}) {
 97 |             $parser_module = $prog_type_to_file_parser{$prog_name};
 98 |         }
 99 |         else {
100 |             ## use regex to find parser
101 |             foreach my $name (keys %prog_type_to_file_parser) {
102 |                 if ($prog_name =~ /$name/i) {
103 |                     $parser_module = $prog_type_to_file_parser{$name};
104 |                     last;
105 |                 }
106 |             }
107 |         }
108 | 
109 |         unless (defined $parser_module) {
110 | 
111 |             die "Error, no parser for prog [$prog_name] ";
112 |         }
113 |         
114 |         my $parser_function = $parser_module . "::" . "parse_fusion_result_file";
115 |         
116 |         no strict 'refs';
117 |         my @fusions = &$parser_function($result_file);
118 | 
119 |         &add_sum_fusions(\@fusions);
120 |         
121 |         @fusions = reverse sort { $a->{sum_frags} <=> $b->{sum_frags} } @fusions;
122 |         
123 |         foreach my $fusion (@fusions) {
124 | 
125 |             my $fusion_name = join("--", $fusion->{geneA}, $fusion->{geneB});
126 | 
127 |             my $junc_count = $fusion->{junc_reads};
128 |             my $span_count = $fusion->{span_reads};
129 | 
130 | 
131 |             print join("\t", $sample_name, $prog_name, $fusion_name, $junc_count, $span_count) . "\n";
132 |         }
133 |                     
134 |     }
135 |     close $fh;
136 | 
137 | 
138 |     exit(0);
139 | }
140 | 
141 | ####
# add_sum_fusions(\@fusions)
#
# For every fusion hash in the referenced array, stores junc_reads + span_reads
# under the key 'sum_frags'. The hashes are modified in place.
sub add_sum_fusions {
    my ($fusions_aref) = @_;

    $_->{sum_frags} = $_->{junc_reads} + $_->{span_reads} for @$fusions_aref;
}
152 | 


--------------------------------------------------------------------------------
/benchmarking/collected_preds_to_fusion_prog_support_listing.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "\n\n\tusage: $0 fusion_preds.collected progs_to_consider.txt\n\n";
 7 | 
 8 | my $preds_file = $ARGV[0] or die $usage;
 9 | my $progs_to_consider_file = $ARGV[1] or die $usage;
10 | 
11 | main: {
12 | 
13 |     my %progs_to_consider;
14 |     {
15 |         open(my $fh, $progs_to_consider_file) or die "Error, cannot open file $progs_to_consider_file";
16 |         while (<$fh>) {
17 |             s/^\s+|\s+$//g;
18 |             my $prog = $_;
19 |             $progs_to_consider{$prog} = 1;
20 |         }
21 |         close $fh;
22 |     }
23 |     
24 |     my %fusion_to_prog;
25 | 
26 |     open (my $fh, $preds_file) or die "Error, cannot open file $preds_file";
27 |     my $header = <$fh>;
28 |     unless ($header =~ /^sample\tprog/) {
29 |         die "Error, missing expected header in $preds_file";
30 |     }
31 |     while (<$fh>) {
32 |         chomp;
33 |         my ($sample_name, $prog, $fusion_name, $junc_support, $frag_support) = split(/\t/);
34 | 
35 |         unless ($progs_to_consider{$prog}) { next; }
36 | 
37 |         $fusion_name = uc $fusion_name;
38 | 
39 |         my ($genesA, $genesB) = split(/--/, $fusion_name);
40 | 
41 |         foreach my $geneA (split(/,/, $genesA)) {
42 |             foreach my $geneB (split(/,/, $genesB)) {
43 |                 
44 |                 my $fusion_name_use = join("--", $geneA, $geneB);
45 |                 $fusion_name = "$sample_name|$fusion_name_use";
46 |                 
47 |                 $fusion_to_prog{$fusion_name}->{$prog} = 1;
48 |             }
49 |         }
50 |     }
51 |     close $fh;
52 |     
53 |     my @fusion_structs;
54 |     foreach my $fusion_name (keys %fusion_to_prog) {
55 |         my $progs_href = $fusion_to_prog{$fusion_name};
56 |         
57 |         my @prognames = sort keys %$progs_href;
58 |         my $num_progs = scalar(@prognames);
59 |         
60 |         push (@fusion_structs, { fusion_name => $fusion_name,
61 |                                  prognames => \@prognames,
62 |                                  count => $num_progs,
63 |               } );
64 | 
65 |     }
66 | 
67 |     @fusion_structs = reverse sort {$a->{count} <=> $b->{count} } @fusion_structs;
68 | 
69 |     foreach my $fusion_struct (@fusion_structs) {
70 |         print join("\t", $fusion_struct->{fusion_name}, 
71 |                    join(",", @{$fusion_struct->{prognames}}),
72 |                    $fusion_struct->{count},
73 |             ) . "\n";
74 |     }
75 | 
76 |     exit(0);
77 | }
78 | 
79 | 


--------------------------------------------------------------------------------
/benchmarking/compare_A_vs_B_scored_preds.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | use FindBin;
 7 | use lib ("$FindBin::Bin/../PerlLib");
 8 | use DelimParser;
 9 | 
my $usage = "usage: $0 pred_results.scored  progA  progB\n\n";

my ($scored_preds_file, $progA, $progB) = @ARGV[0..2];
($scored_preds_file && $progA && $progB) or die $usage;



# Prints, for the two named programs, a TP table followed by an FP table:
# one row per fusion with each program's "(J,S)" support, or "." where that
# program has no such prediction.
main: {

    # which row field names the fusion for each prediction class
    my %fusion_field_for = ( TP => 'selected_fusion',
                             FP => 'fusion' );

    # class -> fusion -> prog -> "(J,S)"
    my %preds = ( TP => {}, FP => {} );

    open(my $fh, $scored_preds_file) or die $!;

    my $reader = DelimParser::Reader->new($fh, "\t");

    while (my $row = $reader->get_row()) {

        my $prog = $row->{prog};
        next unless ($prog eq "$progA" || $prog eq "$progB");

        my $class = $row->{pred_result};
        my $fusion_field = $fusion_field_for{$class} or next;

        $preds{$class}->{ $row->{$fusion_field} }->{$prog} = "($row->{J},$row->{S})";
    }

    foreach my $class ('TP', 'FP') {

        print "#pred_result\tfusion\t$progA\t$progB\n";

        my $by_fusion = $preds{$class};
        foreach my $fusion (keys %$by_fusion) {
            my @cells = map { $by_fusion->{$fusion}->{$_} || "." } ($progA, $progB);

            print join("\t", $class, $fusion, @cells) . "\n";
        }
    }


    exit(0);
}
67 | 
68 | 
69 | 
70 | 


--------------------------------------------------------------------------------
/benchmarking/define_truth_n_unsure_set.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use File::Basename;
 6 | 
 7 | my $usage = "\n\n\tusage: $0 preds.collected.byProg min_truth\n\n";
 8 | 
 9 | my $preds_collected = $ARGV[0] or die $usage;
10 | my $min_truth = $ARGV[1] or die $usage;
11 | 
12 | main: {
13 |     
14 |     open(my $fh, $preds_collected) or die "Error, cannot open file $preds_collected";
15 | 
16 |     my $out_basename = basename($preds_collected);
17 |     
18 |     open(my $ofh_truth, ">$out_basename.min_${min_truth}.truth_set") or die $!;
19 |     open(my $ofh_unsure, ">$out_basename.min_${min_truth}.unsure_set") or die $!;
20 |     
21 |     while (<$fh>) {
22 |         chomp;
23 |         my ($fusion_name, $prog_list, $prog_count) = split(/\t/);
24 |         if ($prog_count >= $min_truth) {
25 |             print $ofh_truth "$fusion_name\n";
26 |         }
27 |         elsif ($prog_count > 1) {
28 |             print $ofh_unsure "$fusion_name\n";
29 |         }
30 |     }
31 | 
32 |     close $fh;
33 |     close $ofh_truth;
34 |     close $ofh_unsure;
35 | 
36 |     exit(0);
37 | }
38 | 


--------------------------------------------------------------------------------
/benchmarking/filter_collected_preds.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "\n\n\tusage: $0 preds.collected.wAnnot\n\n";
 7 | 
 8 | my $preds_file = $ARGV[0] or die $usage;
 9 | 
10 | open (my $fh, $preds_file) or die "Error, cannot open file $preds_file";
11 | while(<$fh>) {
12 |     my $line = $_;
13 |     my @x = split(/\t/);
14 |     my $fusion_name = $x[2];
15 |     my $annot = $x[7];
16 |     
17 |     if ($fusion_name =~ /(^HLA\-)|\-HLA\-/ 
18 |         ||
19 |         ($annot && 
20 |          ($annot =~ /chrM:/i
21 |           ||
22 |           $annot =~ /NEIGHBOR/
23 |           ||
24 |           $annot =~ /BLAST/
25 |           ||
26 |           $annot =~ /GTEx|BodyMap|DGD_PARALOGS|HGNC_GENEFAM|Greger_Normal|Babiceanu_Normal|ConjoinG/
27 |           ||
28 |           $fusion_name =~ /IG[HKL].*--IG[HKL]/   
29 |          )
30 |         )
31 |         ) 
32 |     {
33 |         next;
34 |     }
35 |     print $line;
36 | }
37 | 
38 | exit(0);
39 | 
40 | 


--------------------------------------------------------------------------------
/benchmarking/fusion_preds_sensitivity_vs_expr.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use FindBin;
  6 | 
  7 | use POSIX;
  8 | use Data::Dumper;
  9 | 
 10 | my $usage = "\n\n\tusage: $0 preds.collected.scored fusion_TPM_values.dat\n\n";
 11 | 
 12 | 
 13 | my $preds_file = $ARGV[0] or die $usage;
 14 | my $fusion_TPMs_file = $ARGV[1] or die $usage;
 15 | 
 16 | my $max_bin = 10;
 17 | 
 18 | main: {
 19 | 
 20 |     my %fusion_to_expr_bin = &parse_fusions_into_expr_bins($fusion_TPMs_file);
 21 |     
 22 |     my %count_ref_fusions_in_bin;
 23 |     foreach my $bin (values %fusion_to_expr_bin) {
 24 |         $count_ref_fusions_in_bin{$bin}++;
 25 |     }
 26 | 
 27 |     #print Dumper(\%count_ref_fusions_in_bin);
 28 |     print STDERR "counts of ref fusions per bin:\n";
 29 |     for my $bin (1..$max_bin) {
 30 |         my $count = $count_ref_fusions_in_bin{$bin} || 0;
 31 |         print STDERR join("\t", $bin, $count) . "\n";
 32 |     }
 33 |         
 34 |     my %method_to_fusion_pred = &parse_fusion_predictions($preds_file);
 35 |     
 36 |     
 37 |     ## for each method, determine the counts in each bin.
 38 |     
 39 |     print "#\t" . join("\t", (1..$max_bin)) . "\n";
 40 |     foreach my $method (keys %method_to_fusion_pred) {
 41 |         
 42 |         my $fusion_preds_href = $method_to_fusion_pred{$method};
 43 |         my %bin_counts;
 44 |         
 45 |         foreach my $fusion_pred (keys %$fusion_preds_href) {
 46 |             
 47 |             my $bin = $fusion_to_expr_bin{$fusion_pred};
 48 |             unless (defined $bin) {
 49 |                 print STDERR "Error, fusion \"$fusion_pred\" not assigned to an expression bin.\n";
 50 |                 next;
 51 |             }
 52 |             
 53 |             $bin_counts{$bin}++;
 54 |             
 55 |         }
 56 | 
 57 |         print "$method";
 58 |         for my $bin (1..$max_bin) {
 59 |             my $count = $bin_counts{$bin} || 0;
 60 |             my $num_ref_fusions_in_bin = $count_ref_fusions_in_bin{$bin};
 61 | 
 62 |             my $sensitivity = sprintf("%.2f", $count / $num_ref_fusions_in_bin * 100);
 63 |             print "\t$sensitivity";
 64 |         }
 65 |         print "\n";
 66 |     }
 67 | 
 68 |     exit(0);
 69 |     
 70 |     
 71 | }
 72 | 
 73 | ####
 74 | sub parse_fusion_predictions {
 75 |     my ($preds_file) = @_;
 76 | 
 77 |     my %method_to_preds;
 78 | 
 79 |     open (my $fh, $preds_file) or die $!;
 80 |     while (<$fh>) {
 81 |         my $line = $_;
 82 |         chomp;
 83 |         if (/^\#/) { next; }
 84 |         my @x = split(/\t/);
 85 |         
 86 |         my $pred_class = $x[0];
 87 |         unless ($pred_class eq "TP") { next; }
 88 |         
 89 |         my $fusion_name = $x[3];
 90 |         my $sample = $x[2];
 91 |         my $method = $x[1];
 92 |         
 93 |         if ($line =~ /chr_mapping_to_first_encounter_of_TP_\S+\|(\S+--\S+)/) {
 94 |             $fusion_name = $1;
 95 |         }
 96 |         
 97 |         $fusion_name = uc $fusion_name;
 98 |         my ($geneA, $geneB) = sort split(/--/, $fusion_name);
 99 |         $fusion_name = "$geneA--$geneB";
100 |         
101 |         $fusion_name = "$sample|$fusion_name";
102 |         
103 |         $method_to_preds{$method}->{$fusion_name} = 1;
104 |     }
105 |     close $fh;
106 |     
107 |     
108 |     return(%method_to_preds);
109 | }
110 |         
111 | 
112 | ####
# parse_fusions_into_expr_bins($fusion_tpm_file)
#
# Reads tab-delimited rows of sample, fusion, TPM and assigns each fusion to
# an expression bin: ceil(log2(TPM + 0.01)), clamped to the range 1..$max_bin.
# Fusion names are upper-cased and the two genes put in sorted order.
#
# Returns a hash (flattened): "sample|GENEA--GENEB" -> bin number
sub parse_fusions_into_expr_bins {
    my ($fusion_tpm_file) = @_;

    my %fusion_to_TPM_bin;

    open (my $fh, $fusion_tpm_file) or die $!;
    while (my $row = <$fh>) {
        chomp $row;
        my ($sample, $fusion, $TPM) = (split(/\t/, $row))[0..2];

        my ($geneA, $geneB) = sort split(/--/, uc($fusion));

        my $bin = ceil(log($TPM+0.01)/log(2));

        # clamp into 1..$max_bin
        $bin = ($bin < 1)        ? 1
             : ($bin > $max_bin) ? $max_bin
             :                     $bin;

        $fusion_to_TPM_bin{ join("|", $sample, "$geneA--$geneB") } = $bin;
    }
    close $fh;

    return(%fusion_to_TPM_bin);
}
145 | 


--------------------------------------------------------------------------------
/benchmarking/fusion_preds_to_matrix.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "usage: $0 preds.collected\n\n";
 7 | 
 8 | my $preds_file = $ARGV[0] or die $usage;
 9 | 
10 | 
11 | main: {
12 | 
13 |     my %prognames;
14 |     my %fusion_preds;
15 | 
16 |     open(my $fh, $preds_file) or die $!;
17 |     my $header = <$fh>;
18 |     unless ($header =~ /^sample\tprog/) {
19 |         die "Error, missing expected header format for $preds_file";
20 |     }
21 |     while(<$fh>) {
22 |         chomp;
23 |         my @x = split(/\t/);
24 |         my $sample_name = $x[0];
25 |         my $prog = $x[1];
26 |         my $fusion_name = uc $x[2];
27 |         my $J = $x[3];
28 |         my $S = $x[4];
29 | 
30 |         my $sum_JS = $J + $S;
31 | 
32 |         $fusion_name = "$sample_name|$fusion_name";
33 | 
34 |         $prognames{$prog} = 1;
35 |         
36 |         $fusion_preds{$fusion_name}->{$prog} = $sum_JS;
37 |         
38 |     }
39 |     close $fh;
40 | 
41 | 
42 |     ## output matrix
43 |     my @prognames = sort keys %prognames;
44 |     my @fusions = sort keys %fusion_preds;
45 | 
46 |     print "\t" . join("\t", @prognames) . "\n";
47 | 
48 |     foreach my $fusion (@fusions) {
49 |         my @vals = ($fusion);
50 |         foreach my $progname (@prognames) {
51 |             my $val = $fusion_preds{$fusion}->{$progname} || 0;
52 |             push (@vals, $val);
53 |         }
54 |         
55 |         print join("\t", @vals) . "\n";
56 |     }
57 | 
58 |     exit(0);
59 | }
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/benchmarking/fusion_progs_agree_to_matrix.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "usage: $0 fusions.byProgAgree [min_progs=3]\n\n";
 7 | 
 8 | my $progs_agree_file = $ARGV[0] or die $usage;
 9 | my $min_progs_agree = $ARGV[1] || 3;
10 | 
11 | main: {
12 | 
13 |     my %prognames;
14 |     my %fusion_preds;
15 | 
16 |     open(my $fh, $progs_agree_file) or die $!;
17 |     while(<$fh>) {
18 |         chomp;
19 |         my @x = split(/\t/);
20 |         my $fusion_name = $x[0];
21 |         my $prog_list = $x[1];
22 |         my $count_fusions = $x[2];
23 | 
24 |         if ($count_fusions < $min_progs_agree) {
25 |             next;
26 |         }
27 |         
28 |         my @progs = split(/,/, $prog_list);
29 |         foreach my $prog (@progs) {
30 | 
31 |             $prognames{$prog} = 1;
32 |             
33 |             $fusion_preds{$fusion_name}->{$prog} = 1;
34 |         }
35 |     }
36 |     close $fh;
37 | 
38 | 
39 |     ## output matrix
40 |     my @prognames = sort keys %prognames;
41 |     my @fusions = sort keys %fusion_preds;
42 | 
43 |     print "\t" . join("\t", @prognames) . "\n";
44 | 
45 |     foreach my $fusion (@fusions) {
46 |         my @vals = ($fusion);
47 |         foreach my $progname (@prognames) {
48 |             my $val = $fusion_preds{$fusion}->{$progname} || 0;
49 |             push (@vals, $val);
50 |         }
51 |         
52 |         print join("\t", @vals) . "\n";
53 |     }
54 |     
55 |     exit(0);
56 | }
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 


--------------------------------------------------------------------------------
/benchmarking/fusion_sample_TPs_to_matrix.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "usage: $0 preds.collected.scored\n\n";
 7 | 
 8 | my $preds_file = $ARGV[0] or die $usage;
 9 | 
10 | 
11 | main: {
12 | 
13 |     my %prog_to_sample_to_TP;
14 |     my %samples;
15 |     
16 |     open(my $fh, $preds_file) or die $!;
17 |     while(<$fh>) {
18 |         if (/^\#/) { next; }
19 |         chomp;
20 |         my @x = split(/\t/);
21 |       
22 |         my $score_type = $x[0];
23 |         my $prog = $x[1];
24 |         my $sample_name = $x[2]; 
25 |         my $fusion = $x[3];
26 | 
27 |         if ($score_type eq 'TP') {
28 |             $prog_to_sample_to_TP{$prog}->{$sample_name}->{$fusion}++;
29 |         }
30 | 
31 |         $samples{$sample_name}++;
32 |         
33 | 
34 |     }
35 |     close $fh;
36 | 
37 | 
38 |     ## output matrix
39 |     my @prognames = sort keys %prog_to_sample_to_TP;
40 |     my @samplenames = keys %samples;
41 |     
42 |     print "\t" . join("\t", @prognames) . "\n";
43 | 
44 |     foreach my $sample (@samplenames) {
45 |         my @vals = ($sample);
46 |         foreach my $prog (@prognames) {
47 |             my @TP_fusions;
48 |             if (exists $prog_to_sample_to_TP{$prog}->{$sample}) {
49 |                 @TP_fusions = keys %{$prog_to_sample_to_TP{$prog}->{$sample}};
50 |             }
51 |             my $num_TP = scalar(@TP_fusions);
52 |             push (@vals, $num_TP);
53 |         }
54 |         print join("\t", @vals) . "\n";
55 |     }
56 | 
57 |     exit(0);
58 | }
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/benchmarking/notes:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | ~/GITHUB/CTAT_FUSIONS/FusionAnnotator/FusionAnnotator --annotate preds.collected -C 2 > preds.collected.wAnnot
 4 | 
 5 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/filter_collected_preds.pl preds.collected.wAnnot > preds.collected.wAnnot.filt
 6 | 
 7 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/collected_preds_to_fusion_prog_support_listing.pl preds.collected.wAnnot.filt > preds.collected.wAnnot.filt.byProg
 8 | 
 9 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/define_truth_n_unsure_set.pl preds.collected.wAnnot.filt.byProg 4
10 | 
11 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/fusion_preds_to_TP_FP_FN.wrapper.pl --fusion_preds preds.collected.wAnnot.filt --truth_fusions preds.collected.wAnnot.filt.byProg.min_4.truth_set --unsure_fusions preds.collected.wAnnot.filt.byProg.min_4.unsure_set --allow_reverse_fusion > min4.score
12 | 
13 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/all_TP_FP_FN_to_ROC.pl min4.score > min4.score.roc
14 | 
15 | 
16 | ~/GITHUB/CTAT_FUSIONS/FusionSimulatorToolkit/benchmarking/plotters/plot_ROC.Rscript min4.score.roc 
17 | 
18 | 
19 | 
20 | 
 21 | ## sensitivity vs. expression
22 | 
23 | fusion_preds_sensitivity_vs_expr.pl preds.collected.scored fusion_TPM_values.dat > sensitivity_vs_expr.dat
24 | 
25 |  ~/GITHUB/trinityrnaseq/Analysis/DifferentialExpression/PtR  -m sensitivity_vs_expr.dat --heatmap --sample_clust none --heatmap_colorscheme 'black,yellow'
26 | 
27 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/AUC_barplot.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | if (length(args) != 1) {
 6 |     stop("require param: file.auc ")
 7 | }
 8 | 
 9 | 
10 | auc = args[1]
11 | 
12 | data = read.table(auc, header=F)
13 | 
14 | 
15 | library('data.table')
16 | 
17 | colnames(data) = c('progname', 'auc')
18 | 
19 | dt = data.table(data)
20 | 
21 | 
22 | 
23 | barplot_filename = paste0(auc, ".barplot.pdf")
24 | pdf(barplot_filename)
25 | 
26 | prognames = levels(dt[,progname])
27 | colors = rainbow(length(prognames))
28 | names(colors) = prognames
29 | 
30 | dt = dt[order(-auc),]
31 | 
32 | ordered_prognames = dt[,progname]
33 | barplot(dt[,auc], names.arg=ordered_prognames, las=2, cex.axis=0.5, cex.names=0.5, col=colors[ordered_prognames])
34 | 
35 | 
36 | dev.off()
37 | 
38 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/AUC_boxplot.from_separate_auc_files.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Combines several AUC files (each read with its first column as program
 4 | # names) into one table with a row per file and a column per program, then
 5 | # box-plots the per-program values ordered by decreasing median.
 6 | #
 7 | # usage:   AUC_boxplot.from_separate_auc_files.Rscript fileA.auc fileB.auc ...
 8 | # outputs: auc.summary_table.txt and auc.boxplot.pdf in the current directory
 9 | 
10 | args<-commandArgs(TRUE)
11 | 
12 | if (length(args) < 2) {
13 |     stop("require param: fileA.auc fileB.auc ...   ")
14 | }
15 | 
16 | summary_table = NULL
17 | 
18 | for (auc_file in args) {
19 | 
20 |     # transpose so the file contributes a single row, named after the file
21 |     data = read.table(auc_file, header=F, row.names=1)
22 |     data = t(data)
23 |     rownames(data) = c(auc_file)
24 | 
25 |     if (is.null(summary_table)) {
26 |         summary_table = data
27 |     } else {
28 |         # Columns are aligned to the first file's program order. drop=FALSE keeps
29 |         # the 1-row matrix intact; without it the row collapsed to a bare vector
30 |         # and rbind() discarded its row name (the file name).
31 |         summary_table = rbind(summary_table, data[, colnames(summary_table), drop=FALSE])
32 |     }
33 | }
34 | 
35 | # order programs (columns) by decreasing median; drop=FALSE guards the single-program case
36 | summary_table = summary_table[, rev(order(apply(summary_table, 2, median))), drop=FALSE]
37 | 
38 | print(summary_table)
39 | 
40 | write.table(summary_table, file="auc.summary_table.txt", quote=F, sep="\t")
41 | 
42 | pdf("auc.boxplot.pdf")
43 | boxplot(summary_table, las=2, outline=F)
44 | 
45 | dev.off()
46 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/AUC_boxplot.from_single_summary_AUC_file.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Box plot of the values in column 2 grouped by column 1 of a headerless
 4 | # summary file, with groups ordered by decreasing median.
 5 | #
 6 | # output: <summary file>.AUC_boxplot.pdf
 7 | 
 8 | cli_args = commandArgs(TRUE)
 9 | 
10 | if (length(cli_args) < 1) {
11 |     stop("require param: summary_all.auc ")
12 | }
13 | 
14 | summary_file = cli_args[1]
15 | 
16 | auc_table = read.table(summary_file, header=F)
17 | 
18 | # list of value vectors, one element per distinct entry of column 1
19 | values_by_group = split(auc_table[,2], auc_table[,1])
20 | group_medians = sapply(values_by_group, median)
21 | decreasing_median_order = rev(order(group_medians))
22 | 
23 | pdf(paste0(summary_file, ".AUC_boxplot.pdf"))
24 | boxplot(values_by_group[decreasing_median_order], las=2, outline=F)
25 | 
26 | dev.off()
27 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plotPRcurves.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # contributed by Bo Li, mod by bhaas
 4 | #
 5 | # Overlays one precision-recall curve per program in a single PDF.
 6 | # The input table has a header; column 1 is the program name, column 2 is
 7 | # plotted on the x axis (Recall) and column 3 on the y axis (Precision).
 8 | 
 9 | argv = commandArgs(TRUE)
10 | if (length(argv) != 2) {
11 |    cat("Usage: Rscript plotPRcurves.R input.table output.pdf\n")
12 |    q(status = 1)
13 | }
14 | 
15 | lwd=1
16 | 
17 | # Draws the curve of program number 'id'. The first curve creates the axes;
18 | # later curves are overlaid on the same fixed [0,1] x [0,1] region via par(new=T).
19 | plotPR = function(id, progs, data, colors) {
20 | 	idx = data[,1] == progs[id]
21 | 	if (id == 1) {
22 | 		plot(data[idx,2], data[idx,3], type = 'l', lwd = lwd, col = colors[id], lty = id, xlim = c(0, 1), ylim = c(0, 1), xlab = "Recall", ylab = "Precision")
23 | 	} else {
24 | 		par(new = T)
25 | 		plot(data[idx,2], data[idx,3], type = 'l', lwd = lwd, col = colors[id], lty = id, xlim = c(0, 1), ylim = c(0, 1), xlab = "", ylab = "")
26 | 	}
27 | }
28 | 
29 | data = read.table(argv[1], header=T)
30 | 
31 | # levels() only works when column 1 was read as a factor; under R >= 4.0
32 | # read.table() returns character columns and levels() yields NULL. Derive the
33 | # sorted distinct names directly (same order factor levels would have had).
34 | progs = sort(unique(as.character(data[,1])))
35 | colors = rainbow(length(progs))
36 | 
37 | pdf(argv[2])
38 | # enlarged top margin and xpd=TRUE leave room for the legend above the plot region
39 | par(mar = c(5, 4, 8, 2) + 0.1, xpd = TRUE)
40 | # seq_along() (not 1:length()) so an empty table does not iterate over c(1, 0)
41 | a = lapply(seq_along(progs), plotPR, progs, data, colors)
42 | legend(x = -0.06, y = 1.3, legend = progs, ncol = 3, lwd = lwd, col = colors, lty = seq_along(progs), cex = 0.54)
43 | dev.off()
44 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_AUC_50_vs_101_boxplots.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Side-by-side box plots of AUC per program for two inputs labelled 'len50'
 4 | # and 'len101'; programs are ordered along the x axis by decreasing mean AUC
 5 | # over both inputs.
 6 | 
 7 | cli_args = commandArgs(TRUE)
 8 | 
 9 | if (length(cli_args) != 3) {
10 |     stop("require params: auc.50.dat auc.101.dat plot_name.pdf")
11 | }
12 | 
13 | auc_50_dat_filename = cli_args[1]
14 | auc_101_dat_filename = cli_args[2]
15 | plot_name = cli_args[3]
16 | 
17 | # Reads a headerless file (column 1 = program, column 2 = auc) and tags each row with a read-type label.
18 | load_auc = function(filename, read_type_label) {
19 |     raw = read.table(filename, header=F)
20 |     data.frame(read_type=c(read_type_label), progname=raw[,1], auc=raw[,2])
21 | }
22 | 
23 | all_dat = rbind(load_auc(auc_50_dat_filename, 'len50'),
24 |                 load_auc(auc_101_dat_filename, 'len101'))
25 | 
26 | 
27 | ## plot it
28 | library('ggplot2')
29 | library('data.table')
30 | 
31 | # mean AUC per program, highest first; defines the x-axis order
32 | # (earlier alternatives ordered by the median, over len101 only or over all rows)
33 | mean_auc_by_prog = data.table(all_dat)[,.(auc.mean=mean(auc)), by=.(progname)]
34 | mean_auc_by_prog = mean_auc_by_prog[order(-auc.mean)]
35 | 
36 | all_dat$progname = factor(all_dat$progname, levels=factor(mean_auc_by_prog$progname))
37 | all_dat$read_type = factor(all_dat$read_type, levels=factor(c('len101', 'len50')))
38 | 
39 | 
40 | p = ggplot(all_dat, aes(factor(progname), auc))
41 | p = p + geom_boxplot(aes(fill=read_type), outlier.shape=NA)
42 | p = p + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5))
43 | 
44 | pdf(plot_name)
45 | plot(p)
46 | 
47 | dev.off()
48 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_F1_vs_min_frags.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Line plot of F1 versus min_sum_frags (rows with min_sum_frags <= 20 only),
 4 | # one colored line per value of the 'prog' column.
 5 | #
 6 | # usage:  plot_F1_vs_min_frags.R min_X.results.scored.ROC
 7 | # output: <roc_file>.F1_vs_minFrags.pdf
 8 | 
 9 | options(stringsAsFactors = FALSE)
10 | 
11 | args<-commandArgs(TRUE)
12 | 
13 | if (length(args) == 0) {
14 |     stop("require param: min_X.results.scored.ROC")
15 | }
16 | 
17 | roc_file = args[1]
18 | 
19 | library(ggplot2)
20 | library(dplyr)
21 | 
22 | data = read.table(roc_file, header=T)
23 | 
24 | p = data %>% filter(min_sum_frags <= 20) %>% ggplot(aes(x=min_sum_frags, y=F1, color=prog)) + geom_line()
25 | 
26 | pdf_filename = paste0(roc_file, ".F1_vs_minFrags.pdf")
27 | pdf(pdf_filename, width=8)
28 | 
29 | plot(p)
30 | 
31 | # close the device explicitly (as the sibling plotters do) instead of relying on interpreter shutdown
32 | dev.off()
33 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_TP_FP_vs_minSum_per_prog.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Scatter of TP and FP counts versus min_sum_frags (rows with min_sum_frags < 20),
 4 | # one facet per value of the 'prog' column. A horizontal line marks the largest
 5 | # TP value in the whole file, and the y axis is capped at 1.5 times that value.
 6 | #
 7 | # usage:  plot_TP_FP_vs_minSum_per_prog.R min_X.results.scored.ROC
 8 | # output: <roc_file>.TP_and_FP_counts_vs_minFrags_eaProg.pdf
 9 | 
10 | options(stringsAsFactors = FALSE)
11 | 
12 | args<-commandArgs(TRUE)
13 | 
14 | if (length(args) == 0) {
15 |     stop("require param: min_X.results.scored.ROC")
16 | }
17 | 
18 | roc_file = args[1]
19 | 
20 | library(ggplot2)
21 | library(dplyr)
22 | library(tidyr)
23 | 
24 | data = read.table(roc_file, header=T)
25 | 
26 | # computed before filtering, i.e. over every row of the input
27 | max_TP = max(data$TP)
28 | 
29 | p = data %>% filter(min_sum_frags<20) %>% gather(key='TPFP', value='count', TP, FP) %>% ggplot(aes(x=min_sum_frags, y=count, color=TPFP)) + geom_point() + facet_wrap(~prog) + geom_hline(yintercept = max_TP) + ylim(0,1.5*max_TP)
30 | 
31 | pdf_filename = paste0(roc_file, ".TP_and_FP_counts_vs_minFrags_eaProg.pdf")
32 | pdf(pdf_filename)
33 | 
34 | plot(p)
35 | 
36 | # close the device explicitly (as the sibling plotters do) instead of relying on interpreter shutdown
37 | dev.off()
38 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_all_auc_barplots.Rscript:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | # Reads all.auc.dat from the current directory (columns used: progname, auc,
  4 | # min_thresh, ignoreUnsure, okpara) and writes:
  5 | #   all.auc.dat.pdf                            one AUC barplot per (min_thresh, iu, okp) combination
  6 | #   all.auc.rankings.dat                       rank and AUC of every program in every barplot
  7 | #   all.auc.rankings.iu=?.okp=?.boxplot.pdf    rank distribution per program for one (iu, okp) setting
  8 | #   all.auc.rankings.iu=?.okp=?.dat            median rank per program for that setting
  9 | #   all.auc.rankings_per_prog_adj.boxplot.pdf  rank vs (iu,okp) combination, faceted by program
 10 | #
 11 | # usage: plot_all_auc_barplots.Rscript low_val high_val   (inclusive range of min_thresh values)
 12 | 
 13 | library(ggplot2)
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library('data.table')
 17 | 
 18 | 
 19 | args<-commandArgs(TRUE)
 20 | 
 21 | if (length(args) != 2) {
 22 |     stop("require param: low_val high_val")
 23 | }
 24 | 
 25 | low_val = as.numeric(args[1])
 26 | high_val = as.numeric(args[2])
 27 | 
 28 | pdf("all.auc.dat.pdf", height=20)
 29 | 
 30 | data = read.table("all.auc.dat", header=T)
 31 | 
 32 | # one row of four barplots (iu x okp) per min_thresh value
 33 | num_experiments = high_val - low_val + 1
 34 | num_plots = num_experiments*4
 35 | 
 36 | layout(matrix(1:num_plots, nrow=num_experiments, ncol=4, byrow=T))
 37 | 
 38 | par(mar=c(2,2,2,2))
 39 | 
 40 | dt = data.table(data)
 41 | 
 42 | # levels() only works on factors; under R >= 4.0 read.table() yields character
 43 | # columns and levels() returns NULL, which left every bar uncolored. Use the
 44 | # sorted distinct names instead (same order factor levels would have had).
 45 | prognames = sort(unique(as.character(dt[,progname])))
 46 | 
 47 | colors = rainbow(length(prognames))
 48 | names(colors) = prognames
 49 | 
 50 | # program name -> data.frame with one row per barplot the program appears in
 51 | rankings = list()
 52 | 
 53 | for (mt in seq(low_val, high_val)) {
 54 |     for (iu in c(0,1)) {
 55 |         for (okp in c(0,1)) {
 56 | 
 57 |             plot_title = sprintf("mA=%d, iu=%d, okp=%d", mt, iu, okp)
 58 | 
 59 |             mini_dt = dt[min_thresh==mt & ignoreUnsure==iu & okpara==okp,]
 60 |             mini_dt = mini_dt[order(-auc),]
 61 |             print(mini_dt)
 62 | 
 63 |             if (nrow(mini_dt) == 0) {
 64 |                 # no data for this combination: still consume the layout cell so the grid stays aligned
 65 |                 plot.new()
 66 |                 title(main=plot_title)
 67 |                 next
 68 |             }
 69 | 
 70 |             # plain strings so that colors[] is indexed by name
 71 |             ordered_prognames = as.character(mini_dt[,progname])
 72 |             barplot(mini_dt[,auc], names.arg=ordered_prognames, las=2, cex.axis=0.5, cex.names=0.5, col=colors[ordered_prognames], main=plot_title)
 73 | 
 74 |             # rows are sorted by decreasing AUC, so the row index is the rank (1 = best)
 75 |             for (rank_val in seq_along(ordered_prognames)) {
 76 |                 myprogname = ordered_prognames[rank_val]
 77 |                 prog_auc = mini_dt[rank_val,auc]
 78 |                 if (myprogname %in% names(rankings)) {
 79 |                     rankings[[ myprogname ]] = rbind(rankings[[ myprogname ]], data.frame(rankval=rank_val, mt=mt, iu=iu, okp=okp, auc=prog_auc))
 80 |                 } else {
 81 |                     rankings[[ myprogname ]] = data.frame(rankval=rank_val, mt=mt, iu=iu, okp=okp, auc=prog_auc)
 82 |                 }
 83 |             }
 84 | 
 85 |         }
 86 |     }
 87 | }
 88 | 
 89 | dev.off()
 90 | 
 91 | 
 92 | #############################
 93 | ## Examine relative rankings:
 94 | 
 95 | rankings.table = do.call(rbind, lapply(names(rankings), function(x) { cbind(prog=x, rankings[[x]]) }))
 96 | 
 97 | write.table(rankings.table, file="all.auc.rankings.dat", quote=F, sep="\t")
 98 | 
 99 | 
100 | 
101 | for (iu_val in c(0,1)) {
102 |     for (okp_val in c(0,1)) {
103 | 
104 |         pdf(sprintf("all.auc.rankings.iu=%d.okp=%d.boxplot.pdf", iu_val, okp_val))
105 | 
106 |         plot_title = sprintf("iu=%d, okp=%d", iu_val, okp_val)
107 | 
108 |         filtered_table = rankings.table %>% filter(iu==iu_val & okp==okp_val)
109 |         # programs ordered by median rank, best first
110 |         ranked.progs = filtered_table %>% group_by(prog) %>% summarize(mr=median(rankval)) %>% arrange(mr)
111 | 
112 |         filtered_table$prog = factor(filtered_table$prog, levels=factor(ranked.progs$prog))
113 | 
114 |         p = filtered_table %>% ggplot(aes(as.factor(prog), rankval)) + geom_boxplot() + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + ggtitle(plot_title)
115 | 
116 |         plot(p)
117 | 
118 |         dev.off()
119 | 
120 |         output_rankings_file = sprintf("all.auc.rankings.iu=%d.okp=%d.dat", iu_val, okp_val)
121 |         write.table(ranked.progs, file=output_rankings_file, quote=F, sep="\t")
122 |     }
123 | }
124 | 
125 | 
126 | 
127 | ## show how the iu and okp params change status for each prog
128 | pdf("all.auc.rankings_per_prog_adj.boxplot.pdf")
129 | p = rankings.table %>% mutate(combo_iu_okp=sprintf("%i,%i", iu, okp)) %>% ggplot(aes(combo_iu_okp, rankval)) + geom_boxplot() + facet_wrap(~prog)
130 | plot(p)
131 | dev.off()
132 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_before_vs_after_filt_TP_FP_compare.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # For every program (row name of the 'before' table), draws an arrow from its
 4 | # (FP, TP) point before filtering to its (FP, TP) point after filtering, and
 5 | # writes the merged before/after counts next to the PDF.
 6 | #
 7 | # outputs: <after file>.before_vs_after.pdf and <after file>.before_vs_after.dat
 8 | 
 9 | cli_args = commandArgs(TRUE)
10 | 
11 | if (length(cli_args) != 2) {
12 |     stop("usage: plot_before_vs_after_filt_TP_FP_compare.Rscript before.best.dat after.best.dat")
13 | }
14 | 
15 | before_dat_filename = cli_args[1]
16 | after_dat_filename = cli_args[2]
17 | 
18 | pdf(paste0(after_dat_filename, '.before_vs_after.pdf'), width=10)
19 | 
20 | # both tables have a header and use their first column as row names
21 | before_dat = read.table(before_dat_filename, header=T, row.names=1)
22 | after_dat = read.table(after_dat_filename, header=T, row.names=1)
23 | 
24 | prognames = rownames(before_dat)
25 | 
26 | merged_df = data.frame(prog=prognames,
27 |                        before_TP=before_dat[prognames,'TP'],
28 |                        before_FP=before_dat[prognames,'FP'],
29 |                        after_TP=after_dat[prognames,'TP'],
30 |                        after_FP=after_dat[prognames,'FP'])
31 | rownames(merged_df) = prognames
32 | 
33 | # empty canvas large enough for every before/after point
34 | tp_upper = max(merged_df$before_TP, merged_df$after_TP)
35 | fp_upper = max(merged_df$before_FP, merged_df$after_FP)
36 | plot(0,0, type='n', xlim=c(0,fp_upper), ylim=c(0,tp_upper), xlab='FP', ylab='TP')
37 | 
38 | colors = rainbow(length(prognames))
39 | 
40 | for (idx in seq_along(prognames)) {
41 |     progname = prognames[idx]
42 |     prog_data = merged_df[progname,,drop=F]
43 | 
44 |     print(prog_data)
45 | 
46 |     # (FP, TP) before filtering followed by (FP, TP) after filtering
47 |     coords = c(prog_data$before_FP[1], prog_data$before_TP[1], prog_data$after_FP[1], prog_data$after_TP[1])
48 | 
49 |     arrows(coords[1], coords[2], coords[3], coords[4], col=colors[idx], length=0.15, lwd=2)
50 |     print(coords)
51 |     text(coords[1], coords[2], labels=progname, col=colors[idx], cex=0.6, pos=4)
52 | }
53 | 
54 | write.table(merged_df, file=paste0(after_dat_filename, ".before_vs_after.dat"), quote=F, sep="\t")
55 | 
56 | dev.off()
57 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_median_accuracy_ranking_vs_median_runtime.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Writes two per-program summaries to the current directory:
 4 | #   med_runtime_data.tsv  median of time_h per prog (NA values ignored)
 5 | #   med_rank_data.tsv     median of rankval per prog
 6 | # No plot is produced; the plotting code at the bottom is commented out.
 7 | 
 8 | options(stringsAsFactors = FALSE)
 9 | 
10 | cli_args = commandArgs(TRUE)
11 | 
12 | if (length(cli_args) != 2) {
13 |     stop("require params: cancer_cell_lines/all.auc.rankings.dat all_progs_cancer/runtimes.txt")
14 | }
15 | 
16 | all_auc_rankings_file = cli_args[1]
17 | runtimes_file = cli_args[2]
18 | 
19 | library(tidyverse)
20 | 
21 | # median accuracy rank per program
22 | auc_data = read.table(all_auc_rankings_file)
23 | med_rank_data = summarize(group_by(auc_data, prog), med_rank=median(rankval))
24 | 
25 | # median runtime per program
26 | runtime_data = read.table(runtimes_file, header=T)
27 | med_runtime_data = summarize(group_by(runtime_data, prog), med_runtime=median(time_h, na.rm=T))
28 | 
29 | write.table(med_runtime_data, file='med_runtime_data.tsv', quote=F, sep='\t', row.names=F)
30 | write.table(med_rank_data, file='med_rank_data.tsv', quote=F, sep='\t', row.names=F)
31 | 
32 | 
33 | #p = data %>% group_by(prog) %>% filter(! is.na(F1)) %>% filter(F1 == max(F1)) %>% ggplot(aes(x=PPV, y=TPR, color=prog)) + geom_point()
34 | #
35 | #pdf_filename = paste0(roc_file, ".tpr_ppv_at_maxF1_scatter.pdf")
36 | #pdf(pdf_filename)
37 | #
38 | #plot(p)
39 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_peak_F1_scatter.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # For each value of the 'prog' column, keeps the row(s) with the highest
 4 | # non-NA F1 and plots them as TPR versus PPV; the same rows are also written
 5 | # out as a table.
 6 | #
 7 | # usage:   plot_peak_F1_scatter.R min_X.results.scored.ROC
 8 | # outputs: <roc_file>.tpr_ppv_at_maxF1_scatter.pdf and <roc_file>.tpr_ppv_at_maxF1.dat
 9 | 
10 | options(stringsAsFactors = FALSE)
11 | 
12 | args<-commandArgs(TRUE)
13 | 
14 | if (length(args) == 0) {
15 |     stop("require param: min_X.results.scored.ROC")
16 | }
17 | 
18 | roc_file = args[1]
19 | 
20 | library(ggplot2)
21 | library(dplyr)
22 | 
23 | data = read.table(roc_file, header=T)
24 | 
25 | 
26 | # ties at the maximum F1 keep every tied row
27 | peak_F1_data = data %>% group_by(prog) %>% filter(! is.na(F1)) %>% filter(F1 == max(F1)) %>% arrange(desc(F1))
28 | 
29 | # shapes 0..25 supplied twice so that more than 26 programs still get a shape
30 | p = peak_F1_data %>% ggplot(aes(x=PPV, y=TPR, color=prog, shape=prog)) + geom_point() + scale_shape_manual(values=rep(seq(0,25), 2))
31 | 
32 | pdf_filename = paste0(roc_file, ".tpr_ppv_at_maxF1_scatter.pdf")
33 | pdf(pdf_filename, width=9, height=4)
34 | 
35 | plot(p)
36 | 
37 | # close the device explicitly (as the sibling plotters do) instead of relying on interpreter shutdown
38 | dev.off()
39 | 
40 | peak_F1_dat_file = paste0(roc_file, ".tpr_ppv_at_maxF1.dat")
41 | write.table(peak_F1_data, file=peak_F1_dat_file, quote=F, sep="\t")
42 | 


--------------------------------------------------------------------------------
/benchmarking/plotters/plot_upsetR.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Renders an UpSetR plot of the given matrix file (read with a header line)
 4 | # into <matrix>.UpSetR.pdf.
 5 | 
 6 | options(stringsAsFactors = FALSE)
 7 | 
 8 | cli_args = commandArgs(TRUE)
 9 | 
10 | if (length(cli_args) == 0) {
11 |     stop("require param: matrix")
12 | }
13 | 
14 | prog_agree_matrix = cli_args[1]
15 | 
16 | library(ggplot2)
17 | library(dplyr)
18 | library(tidyr)
19 | library(UpSetR)
20 | 
21 | 
22 | pdf(paste0(prog_agree_matrix, ".UpSetR.pdf"), width=11)
23 | 
24 | agreement = read.table(prog_agree_matrix, header=T)
25 | 
26 | # nsets / nintersects are set high (1000) rather than left at the UpSetR defaults
27 | upset(agreement, number.angles=90, nsets=1000, nintersects=1000)
28 | 
29 | dev.off()
30 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/analyze_Edgren_subset.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | use FindBin;
 7 | use File::Basename;
 8 | use lib ("$FindBin::Bin/../../PerlLib");
 9 | use Pipeliner;
10 | use Process_cmd;
11 | use Cwd;
12 | 
13 | if (basename(cwd()) ne "Edgren_subset") {  # the commands below use paths relative to the working directory
14 |     die "Error, must run this while in the cancer_cell_lines/Edgren_subset/ directory.";
15 | }
16 | 
17 | 
18 | my $benchmark_data_basedir = "$FindBin::Bin/../..";
19 | my $benchmark_toolkit_basedir = "$FindBin::Bin/../../benchmarking";
20 | 
21 | 
22 | main: {
23 |     # Queues the Edgren-subset evaluation and plotting commands on a Pipeliner, in order, then runs them.
24 |     my $pipeliner = &init_pipeliner();
25 |     # each Command pairs a shell command with a *.ok token (presumably its checkpoint name -- see Pipeliner.pm)
26 |     my $cmd = "$benchmark_toolkit_basedir/collected_preds_to_fusion_prog_support_listing.pl preds.collected.gencode_mapped.wAnnot.filt.edgren ../progs_select.txt  > preds.collected.gencode_mapped.wAnnot.filt.edgren.byProgAgree";
27 |     $pipeliner->add_commands(new Command($cmd, "edgren.byProgAgree.ok"));
28 |     
29 |     ## need the unsure set defined. Basically, treat everything non-unique as unsure.
30 |     #$cmd = "$benchmark_toolkit_basedir/define_truth_n_unsure_set.pl preds.collected.gencode_mapped.wAnnot.filt.edgren.byProgAgree 1000";
31 |     #$pipeliner->add_commands(new Command($cmd, "define_min_agree.ok"));
32 |     
33 |     ## evaluate predictions:
34 |     # scored against edgren.truthset.raw; the --unsure_fusions option is commented out of the command
35 |     $cmd = "$benchmark_toolkit_basedir/fusion_preds_to_TP_FP_FN.pl "
36 |          . " --truth_fusions edgren.truthset.raw "
37 |          . " --fusion_preds preds.collected.gencode_mapped.wAnnot.filt.edgren "
38 |          . " --allow_reverse_fusion "
39 |          . " --allow_paralogs $benchmark_data_basedir/resources/paralog_clusters.dat "
40 |         #. " --unsure_fusions preds.collected.gencode_mapped.wAnnot.filt.edgren.byProgAgree.min_1000.unsure_set "
41 |          . " > preds.collected.gencode_mapped.wAnnot.filt.edgren.scored ";
42 | 
43 |     $pipeliner->add_commands(new Command($cmd, "edgren.TP_FP_FN.ok"));
44 |         
45 |     my $roc_file = "preds.collected.gencode_mapped.wAnnot.filt.edgren.scored.ROC";
46 |     # the .scored file is converted into $roc_file, which feeds every plot below
47 |     $cmd = "$benchmark_toolkit_basedir/all_TP_FP_FN_to_ROC.pl preds.collected.gencode_mapped.wAnnot.filt.edgren.scored > $roc_file";
48 |     $pipeliner->add_commands(new Command($cmd, "edgren.roc.ok"));
49 |     
50 |     # plot ROC
51 |     $cmd = "$benchmark_toolkit_basedir/plotters/plot_ROC.Rscript $roc_file";
52 |     $pipeliner->add_commands(new Command($cmd, "edgren.plot_roc.ok"));
53 |     
54 |     # plot F1
55 |     $cmd = "$benchmark_toolkit_basedir/plotters/plot_F1_vs_min_frags.R $roc_file";
56 |     $pipeliner->add_commands(new Command($cmd, "edgren.plot_F1_vs_min_frags.ok"));
57 |     # plot TPR vs PPV at each program's peak F1
58 |     $cmd = "$benchmark_toolkit_basedir/plotters/plot_peak_F1_scatter.R $roc_file";
59 |     $pipeliner->add_commands(new Command($cmd, "edgren.plot_peak_F1_scatter.ok"));
60 | 
61 |     # plot TP vs FP counts according to min frags per prog
62 |     $cmd = "$benchmark_toolkit_basedir/plotters/plot_TP_FP_vs_minSum_per_prog.R $roc_file";
63 |     $pipeliner->add_commands(new Command($cmd, "edgren.plot_TP_FP_vs_minFrags.ok"));
64 |     
65 |     
66 |     ###################################
67 |     # convert to Precision-Recall curve
68 | 
69 |     my $PR_file = "preds.collected.gencode_mapped.wAnnot.filt.edgren.scored.PR";
70 |     # calc_PR.py writes $PR_file; its stdout is sorted by column 2 (descending) and saved as $PR_file.AUC
71 |     $cmd = "$benchmark_toolkit_basedir/calc_PR.py --in_ROC $roc_file --min_read_support 3 --out_PR $PR_file | sort -k2,2gr | tee $PR_file.AUC";
72 |     $pipeliner->add_commands(new Command($cmd, "edgren.pr.ok"));
73 | 
74 |     # plot PR curve
75 |     $cmd = "$benchmark_toolkit_basedir/plotters/plotPRcurves.R $PR_file $PR_file.plot.pdf";
76 |     $pipeliner->add_commands(new Command($cmd, "edgren.plot_pr.ok"));
77 | 
78 |     # plot AUC barplot
79 |     $cmd = "$benchmark_toolkit_basedir/plotters/AUC_barplot.Rscript $PR_file.AUC";
80 |     $pipeliner->add_commands(new Command($cmd, "edgren.plot_pr_auc_barplot.ok"));
81 |     # nothing has executed up to here; run() hands the queued commands to the Pipeliner
82 |     $pipeliner->run();
83 | 
84 | }
85 |     
86 | 
87 | ####
88 | sub init_pipeliner {
89 |     # Returns a Pipeliner (verbose level 2, commands logged to pipe.log) whose checkpoint dir is <cwd>/_checkpoints.
90 |     my $pipe = Pipeliner->new(-verbose => 2, -cmds_log => 'pipe.log');
91 | 
92 |     my $ckpt_path = join("/", cwd(), "_checkpoints");
93 |     if (! -d $ckpt_path) {
94 |         mkdir($ckpt_path) or die "Error, cannot mkdir $ckpt_path";
95 |     }
96 |     $pipe->set_checkpoint_dir($ckpt_path);
97 |     return $pipe;
98 | }
99 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/cleanMe.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Remove generated result files and logs, then the underscore-prefixed work dirs (e.g. _checkpoints).
4 | rm -f ./fusion_result_file_listing.dat ./preds.* ./pipe.log
5 | rm -f ./all* ./auc_files.list
6 | rm -rf ./_*


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/edgren.truthset:
--------------------------------------------------------------------------------
  1 | ########################
  2 | ## BT474 ###############
  3 | 
  4 | # Edgren
  5 | BT474|ACACA--STAC2
  6 | BT474|DIDO1--TTI1
  7 | BT474|CPNE1--PI3
  8 | BT474|GLB1--CMTM7
  9 | BT474|LAMP1--MCF2L
 10 | BT474|RAB22A--MYO9B
 11 | BT474|RPS6KB1--SNF8
 12 | BT474|SKA2--MYO19
 13 | BT474|STARD3--DOK5
 14 | BT474|VAPB--IKZF3
 15 | BT474|ZMYND8--CEP250
 16 | 
 17 | # Kangaspeska
 18 | BT474|AHCTF1--NAAA
 19 | BT474|MED1--ACSF2
 20 | BT474|MED1--STXBP4
 21 | BT474|MED13--BCAS3
 22 | BT474|PIP4K2B--RAD51C
 23 | BT474|STX16--RAE1
 24 | BT474|THRA--AC090627.1
 25 | BT474|TOB1--SYNRG
 26 | BT474|TRPC4AP--MRPL45
 27 | BT474|USP32--MED1
 28 | 
 29 | # Asmann, 2011
 30 | BT474|LIMA1--USP22
 31 | BT474|ACACA--STAC2
 32 | BT474|FAM102A--CIZ1
 33 | BT474|GLB1--CMTM7
 34 | BT474|MED1--STXBP4
 35 | BT474|PIP4K2B--RAD51C
 36 | BT474|RAB22A--MYO9B
 37 | BT474|RPS6KB1--SNF8
 38 | BT474|STARD3--DOK5
 39 | BT474|TRPC4AP--MRPL45
 40 | BT474|ZMYND8--CEP250
 41 | 
 42 | 
 43 | 
 44 | ########################
 45 | ## KPL4 ################
 46 | 
 47 | # Edgren
 48 | KPL4|BSG--NFIX
 49 | KPL4|PPP1R12A--SEPT10
 50 | KPL4|NOTCH1--NUP214
 51 | 
 52 | 
 53 | #######################
 54 | ## MCF7 ###############
 55 | 
 56 | # Edgren
 57 | MCF7|ARFGEF2--SULF2
 58 | MCF7|BCAS4--BCAS3
 59 | MCF7|RPS6KB1--VMP1
 60 | 
 61 | # Kangaspeska
 62 | MCF7|AC099850.1--VMP1
 63 | MCF7|GCN1L1--MSI1
 64 | MCF7|SMARCA4--CARM1
 65 | 
 66 | # Asmann, 2011
 67 | MCF7|ADAMTS19--SLC27A6
 68 | MCF7|ARFGEF2--SULF2
 69 | MCF7|ATXN7L3--FAM171A2
 70 | MCF7|BCAS4--BCAS3
 71 | MCF7|GCN1L1--MSI1
 72 | MCF7|RPS6KB1--DIAPH3
 73 | MCF7|SULF2--PRICKLE2
 74 | MCF7|MYH9--EIF3D
 75 | 
 76 | # Maher, PNAS, 2009
 77 | MCF7|AHCYL1--RAD51C
 78 | MCF7|ARFGEF2--SULF2
 79 | MCF7|ARHGAP19--DRG1
 80 | MCF7|BCAS4--BCAS3
 81 | MCF7|PAPOLA--AK7
 82 | MCF7|MYO9B--FCHO1
 83 | 
 84 | 
 85 | ########################
 86 | ## SKBR3 ###############
 87 | 
 88 | # Edgren
 89 | SKBR3|ANKHD1--PCDH1
 90 | SKBR3|CCDC85C--SETD3
 91 | SKBR3|CSE1L--AL035685.1
 92 | SKBR3|CYTH1--EIF3H
 93 | SKBR3|DHX35--ITCH
 94 | SKBR3|NFS1--PREX1
 95 | SKBR3|PREX1--CPNE1
 96 | SKBR3|RARA--PKIA
 97 | SKBR3|SUMF1--LRRFIP2
 98 | SKBR3|TATDN1--GSDMB
 99 | SKBR3|WDR67--ZNF704
100 | 
101 | 
102 | # Asmann, 2011
103 | SKBR3|KLHDC2--SNTB1
104 | 
105 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/edgren.truthset.raw:
--------------------------------------------------------------------------------
 1 | BT474|ACACA--STAC2
 2 | BT474|AHCTF1--NAAA
 3 | BT474|CPNE1--PI3
 4 | BT474|DIDO1--TTI1
 5 | BT474|FAM102A--CIZ1
 6 | BT474|GLB1--CMTM7
 7 | BT474|LAMP1--MCF2L
 8 | BT474|LIMA1--USP22
 9 | BT474|MED1--ACSF2
10 | BT474|MED1--STXBP4
11 | BT474|MED13--BCAS3
12 | BT474|PIP4K2B--RAD51C
13 | BT474|RAB22A--MYO9B
14 | BT474|RPS6KB1--SNF8
15 | BT474|SKA2--MYO19
16 | BT474|STARD3--DOK5
17 | BT474|STX16--RAE1
18 | BT474|THRA--AC090627.1
19 | BT474|TOB1--SYNRG
20 | BT474|TRPC4AP--MRPL45
21 | BT474|USP32--MED1
22 | BT474|VAPB--IKZF3
23 | BT474|ZMYND8--CEP250
24 | KPL4|BSG--NFIX
25 | KPL4|NOTCH1--NUP214
26 | KPL4|PPP1R12A--SEPT10
27 | MCF7|AC099850.1--VMP1
28 | MCF7|ADAMTS19--SLC27A6
29 | MCF7|AHCYL1--RAD51C
30 | MCF7|ARFGEF2--SULF2
31 | MCF7|ARHGAP19--DRG1
32 | MCF7|ATXN7L3--FAM171A2
33 | MCF7|BCAS4--BCAS3
34 | MCF7|GCN1L1--MSI1
35 | MCF7|MYH9--EIF3D
36 | MCF7|MYO9B--FCHO1
37 | MCF7|PAPOLA--AK7
38 | MCF7|RPS6KB1--DIAPH3
39 | MCF7|RPS6KB1--VMP1
40 | MCF7|SMARCA4--CARM1
41 | MCF7|SULF2--PRICKLE2
42 | SKBR3|ANKHD1--PCDH1
43 | SKBR3|CCDC85C--SETD3
44 | SKBR3|CSE1L--AL035685.1
45 | SKBR3|CYTH1--EIF3H
46 | SKBR3|DHX35--ITCH
47 | SKBR3|KLHDC2--SNTB1
48 | SKBR3|NFS1--PREX1
49 | SKBR3|PREX1--CPNE1
50 | SKBR3|RARA--PKIA
51 | SKBR3|SUMF1--LRRFIP2
52 | SKBR3|TATDN1--GSDMB
53 | SKBR3|WDR67--ZNF704
54 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/eval_edgren_min_agree.consolidated.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | # Prints a tab-delimited 0/1 matrix (fusion x program) for fusions called by at
 7 | # least min_agree of the programs listed in ../progs_select.txt. Calls are pooled
 8 | # per sample irrespective of partner order (A--B and B--A count together).
 9 | 
10 | my $usage = "usage: $0 consolidated_edgren_predictions.dat min_agree\n\n";
11 | 
12 | my $input_file = $ARGV[0] or die $usage;
13 | my $min_agree = $ARGV[1] or die $usage;
14 | 
15 | main: {
16 | 
17 |     # Programs whose calls count toward min_agree, one name per line. Read directly
18 |     # (instead of via `cat`) so that a missing file is a hard error, not an empty list.
19 |     my $progs_file = "../progs_select.txt";
20 |     open(my $progs_fh, "<", $progs_file) or die "Error, cannot open file $progs_file: $!";
21 |     my @progs_to_count = <$progs_fh>;
22 |     close $progs_fh;
23 |     chomp @progs_to_count;
24 | 
25 |     my %progs_to_count = map { + $_ => 1 } @progs_to_count;
26 | 
27 |     open(my $fh, "<", $input_file) or die "Error, cannot open file $input_file: $!";
28 | 
29 |     my %fusion_to_prog;    # "sample|A--B" (both orientations) -> { progname => count }
30 |     my %orig_fusion_call;  # same key -> { fusion name as given in the input => count }
31 |     my %prognames;         # every program seen in the input
32 | 
33 |     my $header = <$fh>;    # skip the header line
34 |     while(<$fh>) {
35 |         chomp;
36 |         my @x = split(/\t/);
37 |         my $sample_name = $x[2];
38 |         my $progname = $x[3];
39 |         my $fusion_name = $x[6];
40 | 
41 |         # the fusion field is split on '|' and its second part on '--' to get the gene pair
42 |         my @y = split(/\|/, $fusion_name);
43 |         my ($left_entry, $right_entry) = split(/--/, $y[1]);
44 | 
45 |         $prognames{$progname}++;
46 | 
47 |         # register the call under both orientations of the gene pair
48 |         foreach my $alt_fusion_name ("$sample_name|$left_entry--$right_entry",
49 |                                      "$sample_name|$right_entry--$left_entry") {
50 |             $orig_fusion_call{$alt_fusion_name}->{$fusion_name}++;
51 |             $fusion_to_prog{$alt_fusion_name}->{$progname}++;
52 |         }
53 |     }
54 |     close $fh;
55 | 
56 |     ## capture those fusions that meet the min prog criteria
57 | 
58 |     my %fusions_meet_min_prog_count;
59 | 
60 |     foreach my $fusion_name (keys %fusion_to_prog) {
61 | 
62 |         # Representative name = most frequently reported original name. Ties are broken
63 |         # alphabetically so the report does not depend on hash ordering between runs.
64 |         my $orig_fusion_names_href = $orig_fusion_call{$fusion_name};
65 |         my @orig_fusion_cand_names = sort {
66 |             $orig_fusion_names_href->{$b} <=> $orig_fusion_names_href->{$a}
67 |                 ||
68 |             $a cmp $b
69 |         } keys %$orig_fusion_names_href;
70 | 
71 |         my $orig_fusion_name = $orig_fusion_cand_names[0];
72 | 
73 |         # only programs listed in progs_select.txt count toward the agreement threshold
74 |         my $prog_count = scalar(grep { $progs_to_count{$_} } keys %{$fusion_to_prog{$fusion_name}});
75 | 
76 |         if ($prog_count >= $min_agree) {
77 |             $fusions_meet_min_prog_count{$orig_fusion_name} = 1;
78 |         }
79 |     }
80 | 
81 |     ## generate report
82 |     my @prognames = sort keys %prognames;
83 | 
84 |     print "\t" . join("\t", @prognames) . "\n";
85 | 
86 |     my @final_fusions = sort keys %fusions_meet_min_prog_count;
87 | 
88 |     foreach my $fusion (@final_fusions) {
89 | 
90 |         my @vals = ($fusion);
91 |         foreach my $progname (@prognames) {
92 |             my $found = (exists $fusion_to_prog{$fusion}->{$progname}) ? 1 : 0;
93 |             push (@vals, $found);
94 |         }
95 | 
96 |         print join("\t", @vals) . "\n";
97 |     }
98 | 
99 |     exit(0);
100 | 
101 | }
102 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/eval_edgren_min_agree.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | 
  6 | # Prints a tab-delimited 0/1 matrix (fusion x program) for fusions called by at
  7 | # least min_agree of the programs listed in ../progs_select.txt. Each call is
  8 | # expanded over the comma-separated alternative names found in input columns
  9 | # 6 and 7, in both partner orientations, before agreement is counted.
 10 | 
 11 | my $usage = "usage: $0 preds.collected.gencode_mapped.wAnnot.filt.edgren min_agree\n\n";
 12 | 
 13 | my $input_file = $ARGV[0] or die $usage;
 14 | my $min_agree = $ARGV[1] or die $usage;
 15 | 
 16 | main: {
 17 | 
 18 |     # Programs whose calls count toward min_agree, one name per line. Read directly
 19 |     # (instead of via `cat`) so that a missing file is a hard error, not an empty list.
 20 |     my $progs_file = "../progs_select.txt";
 21 |     open(my $progs_fh, "<", $progs_file) or die "Error, cannot open file $progs_file: $!";
 22 |     my @progs_to_count = <$progs_fh>;
 23 |     close $progs_fh;
 24 |     chomp @progs_to_count;
 25 | 
 26 |     my %progs_to_count = map { + $_ => 1 } @progs_to_count;
 27 | 
 28 |     open(my $fh, "<", $input_file) or die "Error, cannot open file $input_file: $!";
 29 | 
 30 |     my %fusion_to_prog;    # "sample|A--B" (every alias, both orientations) -> { progname => count }
 31 |     my %orig_fusion_call;  # same key -> { "sample|fusion" as reported => count }
 32 |     my %prognames;         # every program seen in the input
 33 | 
 34 |     my $header = <$fh>;    # skip the header line
 35 |     while(<$fh>) {
 36 |         chomp;
 37 |         my @x = split(/\t/);
 38 |         my $sample_name = $x[0];
 39 |         my $progname = $x[1];
 40 |         my $fusion_name = $x[2];
 41 | 
 42 |         my ($left_fusion_name, $right_fusion_name) = split(/--/, $fusion_name);
 43 | 
 44 |         $fusion_name = "$sample_name|$fusion_name";
 45 | 
 46 |         $prognames{$progname}++;
 47 | 
 48 |         # alias columns may be missing on short rows; treat them as empty lists
 49 |         my @left_entries = split(/,/, defined($x[5]) ? $x[5] : "");
 50 |         my @right_entries = split(/,/, defined($x[6]) ? $x[6] : "");
 51 | 
 52 |         # Make sure the reported names themselves are included. Compare literally:
 53 |         # gene names such as AC090627.1 contain regex metacharacters, so interpolating
 54 |         # them into a pattern could match (or fail to match) the wrong entry.
 55 |         unless (grep { $_ eq $left_fusion_name } @left_entries) {
 56 |             push (@left_entries, $left_fusion_name);
 57 |         }
 58 |         unless (grep { $_ eq $right_fusion_name } @right_entries) {
 59 |             push (@right_entries, $right_fusion_name);
 60 |         }
 61 | 
 62 |         foreach my $left_entry (@left_entries) {
 63 |             foreach my $right_entry (@right_entries) {
 64 | 
 65 |                 # register the call under both orientations of this alias pair
 66 |                 foreach my $alt_fusion_name ("$sample_name|$left_entry--$right_entry",
 67 |                                              "$sample_name|$right_entry--$left_entry") {
 68 |                     $orig_fusion_call{$alt_fusion_name}->{$fusion_name}++;
 69 |                     $fusion_to_prog{$alt_fusion_name}->{$progname}++;
 70 |                 }
 71 |             }
 72 |         }
 73 | 
 74 |     }
 75 |     close $fh;
 76 | 
 77 |     ## capture those fusions that meet the min prog criteria
 78 | 
 79 |     my %fusions_meet_min_prog_count;
 80 | 
 81 |     foreach my $fusion_name (keys %fusion_to_prog) {
 82 | 
 83 |         # Representative name = most frequently reported original name. Ties are broken
 84 |         # alphabetically so the report does not depend on hash ordering between runs.
 85 |         my $orig_fusion_names_href = $orig_fusion_call{$fusion_name};
 86 |         my @orig_fusion_cand_names = sort {
 87 |             $orig_fusion_names_href->{$b} <=> $orig_fusion_names_href->{$a}
 88 |                 ||
 89 |             $a cmp $b
 90 |         } keys %$orig_fusion_names_href;
 91 | 
 92 |         my $orig_fusion_name = $orig_fusion_cand_names[0];
 93 | 
 94 |         # only programs listed in progs_select.txt count toward the agreement threshold
 95 |         my $prog_count = scalar(grep { $progs_to_count{$_} } keys %{$fusion_to_prog{$fusion_name}});
 96 | 
 97 |         if ($prog_count >= $min_agree) {
 98 |             $fusions_meet_min_prog_count{$orig_fusion_name} = 1;
 99 |         }
100 |     }
101 | 
102 |     ## generate report
103 |     my @prognames = sort keys %prognames;
104 | 
105 |     print "\t" . join("\t", @prognames) . "\n";
106 | 
107 |     my @final_fusions = sort keys %fusions_meet_min_prog_count;
108 | 
109 |     foreach my $fusion (@final_fusions) {
110 | 
111 |         my @vals = ($fusion);
112 |         foreach my $progname (@prognames) {
113 |             my $found = (exists $fusion_to_prog{$fusion}->{$progname}) ? 1 : 0;
114 |             push (@vals, $found);
115 |         }
116 | 
117 |         print join("\t", @vals) . "\n";
118 |     }
119 | 
120 |     exit(0);
121 | 
122 | }
123 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/Edgren_subset/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | 
 6 | cat ../preds.collected.gencode_mapped.wAnnot.filt | egrep '^(sample|BT474|MCF7|KPL4|SKBR3)' > preds.collected.gencode_mapped.wAnnot.filt.edgren
 7 | 
 8 | 
 9 | 
10 | ## analyze accuracy
11 | ./analyze_Edgren_subset.pl
12 | 
13 | 
14 | ## examine enrichment for valid fusions among minProgs
15 | ./examine_validated_enrichment.R edgren.truthset.raw preds.collected.gencode_mapped.wAnnot.filt.edgren.scored
16 | 
17 | 
18 | ## examine min3 agree Venn
19 | 
20 | ./eval_edgren_min_agree.consolidated.pl consolidated_edgren_predictions.dat 3 > edgren.min3
21 | 
22 | ../../benchmarking/plotters/plot_upsetR.R edgren.min3
23 | 
24 | 
25 | 
26 | 
27 | ## run through standard analysis for curiosity sake
28 | ../../benchmarking/Venn_analysis_strategy.pl preds.collected.gencode_mapped.wAnnot.filt.edgren ../progs_select.txt 3 10
29 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/SuppTable-cancer_cell_lines.csv:
--------------------------------------------------------------------------------
 1 | Cancer cell line,Cancer type,Data source,accession or analysis_id
 2 | BT474,breast,SRA,SRP003186 
 3 | G20476.DMS_454.2,lung,CCLE,1d3e9dae-b558-4187-b8a0-e79b2b307f3f
 4 | G20495.786-O.2,kidney,CCLE,166efd97-7b71-4089-be92-d8d006f86c3b
 5 | G20498.KYSE-180.2,head-neck,CCLE,a1c5e568-4169-48fe-9d84-7a70d61ceea0
 6 | G20500.IGR-37.2,skin,CCLE,1138456d-ecc5-45f4-a169-9ba373d5d71a
 7 | G25214.MKN7.1,stomach,CCLE,23056430-2489-4922-ac0b-299b7f43e74e
 8 | G25225.NCI-H522.1,lung,CCLE,9da8f1f7-d606-4fe8-ab14-c9fe2a4b7afc
 9 | G26175.A172.2,brain,CCLE,0534dd1a-0287-484d-939f-b2e5f37688c3
10 | G26182.KMS-12-BM.2,lymphoid,CCLE,04778047-db4d-4b8c-b77c-b910fa8c9e12
11 | G26199.LN-229.2,brain,CCLE,2dc62ec8-476f-4d4a-a51f-729673e63f80
12 | G26212.A-673.2,sarcomatoid,CCLE,ed412801-81e9-4777-942a-95079b7044e1
13 | G26216.KP-2.2,pancreas,CCLE,77c15c63-b42f-4194-a346-6e8bf98ac23b
14 | G26228.Hs_683.2,brain,CCLE,6af225a0-fcc0-471b-838f-2bf1c0ded8d3
15 | G26236.NCI-H716.2,colo-rectal,CCLE,95e1c652-a7f2-41f6-9a30-503b8c7c37a5
16 | G26249.KMS-26.2,lymphoid,CCLE,d54514e6-1825-46cd-a394-53886430033d
17 | G26253.KMS-34.2,lymphoid,CCLE,905fffad-e784-47b7-a617-6df5ec7e604c
18 | G26262.NCI-H889.2,lung,CCLE,a3e91ee2-aa54-4acf-8eb2-45d795ec5188
19 | G27214.PC-3.1,other,CCLE,811a96a5-a7f2-4082-9044-e6972c7316a9
20 | G27219.Panc_03.27.1,pancreas,CCLE,c5b5dba9-1eee-43ec-a097-dc72d45b20da
21 | G27233.A-498.1,kidney,CCLE,ce9f1b08-8a07-416a-ab43-90e011092b08
22 | G27259.AN3_CA.1,uterus,CCLE,3f1b9533-8773-4ca8-8c43-98101cbba096
23 | G27280.TC-71.1,sarcomatoid,CCLE,04ae78b3-1f09-4d55-ac56-8600d73ed8a5
24 | G27367.BFTC-909.1,kidney,CCLE,1c5f708f-252b-49d0-b16d-78d5a5344e7d
25 | G27376.COLO_792.1,skin,CCLE,5c9df572-dd10-420f-980a-17ae004dfc08
26 | G27453.SNU-398.2,liver,CCLE,15ea12f5-1702-48d7-8cd0-98659cddb7e2
27 | G27463.SK-MEL-1.2,skin,CCLE,8f1e5d67-c86c-4b0d-9e21-5803f7b79432
 28 | G27476.PK-59.2,pancreas,CCLE,3051f39a-7881-4d7b-a13b-c5395e5b4ef3
29 | G27479.SK-MEL-3.2,skin,CCLE,f6870a19-fe07-4afb-95c4-204c35bfa95a
30 | G27488.SNU-620.2,stomach,CCLE,b146a903-8d81-4d13-a630-22c944087bf4
31 | G27516.SK-MEL-28.2,skin,CCLE,4a77f393-6c2f-47d7-b7e5-411982bf75a6
32 | G27544.SF268.2,brain,CCLE,f399f213-694c-4463-b855-132f433df92d
33 | G28011.KLE.1,uterus,CCLE,f43766e8-108d-4438-89ab-75d0dbe55997
34 | G28034.MDA-MB-361.1,breast,CCLE,a337c425-4314-40c6-a40a-a444781bd1b7
35 | G28045.KYSE-270.1,head-neck,CCLE,067cb65d-b578-4dd6-95db-60f2d2ae040e
36 | G28050.KMM-1.1,lymphoid,CCLE,1bae551f-db0a-4b96-92ca-2211d0d5265b
37 | G28054.KYSE-520.1,head-neck,CCLE,67d2c686-e66b-489f-a6b3-0ed36a56ba00
38 | G28070.LN-18.1,brain,CCLE,d62d67e3-da37-4046-8be5-07d2ac93f47f
39 | G28072.MDA-MB-175-VII.1,breast,CCLE,ea450165-4ec4-431f-a21b-ebdea26d9794
40 | G28077.MG-63.1,sarcomatoid,CCLE,8c7f184a-ddde-4510-90b6-99a9252b9128
41 | G28081.JHH-7.1,liver,CCLE,b6118100-0a8c-4153-ba64-5db6e1e820fa
42 | G28087.MDA-MB-436.1,breast,CCLE,cd1ad136-1e68-4f08-8341-216382301fd8
43 | G28535.OVTOKO.1,ovary,CCLE,24799388-008d-4011-a181-f4b2070b8bb0
44 | G28545.NUGC-2.1,stomach,CCLE,7cba5a73-b8a7-46c9-8175-eeca8bd483ac
45 | G28575.OUMS-23.1,colo-rectal,CCLE,75b57b0c-df5b-429b-a024-3777b6291bf0
46 | G28610.MHH-ES-1.1,sarcomatoid,CCLE,b1c89d5f-841b-41b6-87ac-d5407bbf8293
47 | G30594.UACC-893.1,breast,CCLE,52646ad2-d86d-4fc5-ba5c-a5beeed25f39
48 | G30631.SU-DHL-10.1,other,CCLE,fe8b3333-45eb-41bc-88b8-cfc3b9666591
49 | G41663.OVISE.5,ovary,CCLE,2d1f25d4-7245-4a2f-bd6f-6f5b6c3e5d0d
50 | G41682.KYSE-510.5,head-neck,CCLE,8a027ed1-8767-48a6-ae39-d4f29417c0e8
51 | G41706.RT4.5,urinary bladder,CCLE,55c315f0-6c8c-4a04-86e5-fc73f35b3619
52 | G41709.FaDu.5,head-neck,CCLE,192ac756-5175-4acd-92be-357cdf926d33
53 | G41710.SNU-16.5,stomach,CCLE,becae7d6-fdb7-4f63-8947-40b799a8d544
54 | G41724.HGC-27.5,stomach,CCLE,f1faef0c-e58c-4eb3-9e3b-e5dc3791fa14
55 | H2228,lung,SRA,DRR016705
56 | K562,lymphoid,SRA,SRR521460
57 | KPL4,breast,SRA,SRP003186 
58 | LC2Ad,lung,SRA,DRR016709
59 | MCF7,breast,SRA,SRP003186 
60 | SKBR3,breast,SRA,SRP003186 
61 | VCaP_85,prostate,SRA,SRR1217085
62 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/analyze_cancer_data.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Carp;
  6 | use FindBin;
  7 | use Cwd;
  8 | use File::Basename;
  9 | use lib ("$FindBin::Bin/../PerlLib");
 10 | use Pipeliner;
 11 | use Process_cmd;
 12 | 
 13 | 
 14 | my $restricted_progs_file = $ARGV[0] || "";
 15 | 
 16 | unless ($ENV{FUSION_ANNOTATOR}) {
 17 | 
 18 |     if (-d "$ENV{HOME}/GITHUB/CTAT_FUSIONS/FusionAnnotator") {
 19 |         $ENV{FUSION_ANNOTATOR} = "~/GITHUB/CTAT_FUSIONS/FusionAnnotator";
 20 |     }
 21 |     else {
 22 |         die "Error, must set env var FUSION_ANNOTATOR to point to base dir of\n"
 23 |             . "      git clone https://github.com/FusionAnnotator/FusionAnnotator.git\n"
 24 |             . "      (after having installed it)  ";
 25 |     }
 26 | }
 27 | 
 28 | unless ($ENV{TRINITY_HOME}) {
 29 |     die "Error, must specify env var TRINITY_HOME to trinity base installation directory";
 30 | }
 31 | 
 32 | 
 33 | if (basename(cwd()) ne "cancer_cell_lines") {
 34 |     die "Error, must run this while in the cancer_cell_lines/ directory.";
 35 | }
 36 | 
 37 | 
 38 | my $benchmark_data_basedir = "$FindBin::Bin/..";
 39 | my $benchmark_toolkit_basedir = "$FindBin::Bin/../benchmarking";
 40 | my $fusion_annotator_basedir = $ENV{FUSION_ANNOTATOR};
 41 | my $trinity_home = $ENV{TRINITY_HOME};
 42 | 
 43 | 
 44 | main: {
 45 | 
 46 |     my $pipeliner = &init_pipeliner();
 47 |     
 48 |     ## create file listing
 49 |     my $cmd = "find ./samples -type f | $benchmark_data_basedir/util/make_file_listing_input_table.pl $restricted_progs_file > fusion_result_file_listing.dat";
 50 |     $pipeliner->add_commands(new Command($cmd, "fusion_file_listing.ok"));
 51 | 
 52 |     # collect predictions
 53 |     $cmd = "$benchmark_toolkit_basedir/collect_preds.pl fusion_result_file_listing.dat > preds.collected";
 54 |     $pipeliner->add_commands(new Command($cmd, "collect_preds.ok"));
 55 | 
 56 |     # map fusion predictions to gencode gene symbols based on identifiers or chromosomal coordinates.
 57 |     $cmd = "$benchmark_toolkit_basedir/map_gene_symbols_to_gencode.pl "
 58 |         . " preds.collected "
 59 |         . " $benchmark_data_basedir/resources/genes.coords.gz "
 60 |         . " $benchmark_data_basedir/resources/genes.aliases "
 61 |         . " > preds.collected.gencode_mapped ";
 62 | 
 63 |     $pipeliner->add_commands(new Command($cmd, "gencode_mapped.ok"));
 64 | 
 65 |     # annotate
 66 |     $cmd = "$fusion_annotator_basedir/FusionAnnotator --annotate preds.collected.gencode_mapped  -C 2 > preds.collected.gencode_mapped.wAnnot";
 67 |     $pipeliner->add_commands(new Command($cmd, "annotate_fusions.ok"));
 68 | 
 69 |     # filter HLA and mitochondrial features
 70 |     $cmd = "$benchmark_toolkit_basedir/filter_collected_preds.pl preds.collected.gencode_mapped.wAnnot > preds.collected.gencode_mapped.wAnnot.filt";
 71 |     $pipeliner->add_commands(new Command($cmd, "filter_fusion_annot.ok"));
 72 |     
 73 |     # generate and plot correlation matrix for predicted fusions by prog
 74 |     $cmd = "$benchmark_toolkit_basedir/fusion_preds_to_matrix.pl preds.collected.gencode_mapped.wAnnot.filt > preds.collected.gencode_mapped.wAnnot.filt.matrix";
 75 |     $pipeliner->add_commands(new Command($cmd, "pred_cor_matrix.ok"));
 76 | 
 77 |     $cmd = "$trinity_home/Analysis/DifferentialExpression/PtR  -m preds.collected.gencode_mapped.wAnnot.filt.matrix --binary --sample_cor_matrix --heatmap_colorscheme 'black,yellow' ";
 78 |     $pipeliner->add_commands(new Command($cmd, "pred_cor_matrix_plot.ok"));
 79 |     
 80 | 
 81 | 
 82 |     ## remove edgren set:
 83 |     $cmd = "bash -c 'set -eou pipefail; cat preds.collected.gencode_mapped.wAnnot.filt | egrep -v \"^(BT474|KPL4|MCF7|SKBR3)\" > preds.collected.gencode_mapped.wAnnot.filt.noEdgren'";
 84 |     $pipeliner->add_commands(new Command($cmd, "rmEdgren.ok"));
 85 |     
 86 |     ## run Venn-based accuracy analysis:
 87 | 
 88 |     $cmd = "$benchmark_toolkit_basedir/Venn_analysis_strategy.pl preds.collected.gencode_mapped.wAnnot.filt.noEdgren progs_select.txt 3 10";
 89 |     $pipeliner->add_commands(new Command($cmd, "venn_analysis.ok"));
 90 |     
 91 |     
 92 |     $pipeliner->run();
 93 |     
 94 |     exit(0);
 95 |     
 96 |     
 97 | }
 98 | 
 99 | 
100 | ####
sub init_pipeliner {

    # Build a Pipeliner that logs its commands to pipe.log and keeps its
    # checkpoints in ./_checkpoints (created on first use); returns the object.

    my $pipeline_obj = Pipeliner->new(-verbose => 2, -cmds_log => 'pipe.log');

    my $chkpts_path = cwd() . "/_checkpoints";
    if (! -d $chkpts_path) {
        mkdir($chkpts_path) or die "Error, cannot mkdir $chkpts_path";
    }

    $pipeline_obj->set_checkpoint_dir($chkpts_path);

    return $pipeline_obj;
}
112 | 
113 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/cleanMe.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Remove the files and underscore-prefixed entries listed below from the
# cancer_cell_lines/ directory, then clean the Edgren subset directory as well.

# files: result listing, collected predictions, pipeline log, all*, auc list,
# and any remaining .dat / .pdf files
rm -f \
    ./fusion_result_file_listing.dat \
    ./preds.* \
    ./pipe.log \
    ./all* \
    ./auc_files.list \
    ./*.dat \
    ./*.pdf

# underscore-prefixed entries (e.g. the _checkpoints directory)
rm -rf ./_*

# the script's exit status is that of the Edgren subset cleanup
( cd Edgren_subset && ./cleanMe.sh )
7 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/progs_select.txt:
--------------------------------------------------------------------------------
 1 | ARRIBA
 2 | ChimPipe
 3 | ChimeraScan
 4 | deFuse
 5 | EricScript
 6 | FUSIONCATCHER_v1.10_June192019
 7 | FusionHunter
 8 | InFusion
 9 | JAFFA-Assembly
10 | JAFFA-Direct
11 | MapSplice
12 | nFuse
13 | PIZZLY
14 | PRADA
15 | SOAP-fuse
16 | STARCHIP_csm10_pGm2_May012019
17 | STARSEQR
18 | #STAR_FUSION_v1.5
19 | TopHat-Fusion
20 | TrinityFusion-C
21 | 


--------------------------------------------------------------------------------
/cancer_cell_lines/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | if [ ! -d samples ]; then
 6 |     wget -r --no-parent https://data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_cancer_cell_lines/samples/
 7 |     find data.broadinstitute.org/|grep html | xargs -n1 rm -f
 8 |     mv data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_cancer_cell_lines/samples .
 9 |     rm -rf ./data.broadinstitute.org
10 | fi
11 | 
12 | 
13 | 
14 | ./analyze_cancer_data.pl $*
15 | 
16 | 
17 | ## Edgren subset study
18 | 
19 | if [ $* ]; then
20 |     cd Edgren_subset && ./runMe.sh
21 | fi
22 | 


--------------------------------------------------------------------------------
/cleanMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | 
 6 | dirs=(simulated_data cancer_cell_lines runtime_analysis) 
 7 | 
 8 | for dir in ${dirs[*]}
 9 | do
10 |     cd $dir
11 |     ./cleanMe.sh
12 |     cd ../
13 | done
14 | 
15 | rm -rf ./figs_for_paper
16 | 
17 | 


--------------------------------------------------------------------------------
/progs_restrict.txt:
--------------------------------------------------------------------------------
 1 | ARRIBA
 2 | CHIMERASCAN
 3 | CHIMPIPE
 4 | DEFUSE
 5 | ERICSCRIPT
 6 | FUSIONCATCHER_v1.10_June192019
 7 | FUSIONHUNTER
 8 | INFUSION_CF3
 9 | JAFFA_ASSEMBLY
10 | JAFFA_DIRECT
11 | JAFFA_HYBRID
12 | MAPSPLICE
13 | NFUSE
14 | PIZZLY
15 | PRADA
16 | SOAP_FUSE
17 | STARCHIP_csm10_pGm2_May012019
18 | STAR_FUSION_v1.5_hg19_Apr042019
19 | STARSEQR_STAR-SEQR
20 | TOPHAT_FUSION
21 | TRINITY_FUSION_C_hg19
22 | TRINITY_FUSION_D_hg19
23 | TRINITY_FUSION_UC_hg19
24 | 


--------------------------------------------------------------------------------
/resources/genes.coords.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fusiontranscripts/FusionBenchmarking/8f673cfd5c2c4f153bf58990a798bd6277b3c66e/resources/genes.coords.gz


--------------------------------------------------------------------------------
/resources/notes:
--------------------------------------------------------------------------------
 1 | 
 2 | ## hg19 coordinate liftovers and gene symbol aliases:
 3 | 
 4 | using: /home/unix/bhaas/FUS/util/make_hg19_gene_coords_file.pl  to extract liftover coords.
 5 | 
 6 | ie.
 7 | 
 8 |     liftOver -gff ref_annot.gtf  /home/unix/bhaas/utilities/hg38ToHg19.over.chain.gz mapped unmapped
 9 |     ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/gtf_to_gene_spans.pl mapped > mapped.coords
10 | 
 11 |     then aggregate all coords across all hg19-transposed annotations into the genes.coords.gz file.
12 | 
13 | 
14 | ## paralog clustering
15 | 
16 | see: notes.paralog_clustering.2020.txt
17 | 
18 | 


--------------------------------------------------------------------------------
/resources/notes.paralog_clustering.2020.txt:
--------------------------------------------------------------------------------
 1 | ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/gtf_file_to_feature_seqs.pl  --gtf_file gencode.v22.annotation.gtf --genome_fa GRCh38.primary_assembly.genome.fa --seqType CDSplus > gencode.v22.annotation.cdsplus.fa
 2 | 
 3 | 
 4 | ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/gtf_file_to_feature_seqs.pl  --gtf_file gencode.v19.annotation.gtf --genome_fa GRCh37.p13.genome.primary.fa --seqType CDSplus > gencode.v19.annotation.cdsplus.fa
 5 | 
 6 | 
  7 | cat GRCh37/gencode.v19.annotation.cdsplus.fa  |  perl -lane 's/>/>gv19./; print;' > GRCh37/gencode.v19.annotation.cdsplus.fa.mod
 8 | 
  9 | cat GRCh38/gencode.v22.annotation.cdsplus.fa  | perl -lane 's/>/>gv22./; print;' > GRCh38/gencode.v22.annotation.cdsplus.fa.mod
10 | 
 11 | cat  GRCh37/gencode.v19.annotation.cdsplus.fa.mod GRCh38/gencode.v22.annotation.cdsplus.fa.mod > gencode.combined.cdsplus.fa
12 | 
13 | makeblastdb -in gencode.combined.cdsplus.fa -dbtype nucl
14 | 
15 | mkdir tmpdir; ~/GITHUB/CTAT_FUSIONS/ctat-genome-lib-builder/util/dfam_repeat_masker.pl --dfam_hmm /seq/RNASEQ/TOOLS/DFAM/homo_sapiens_dfam.hmm --target_fa gencode.combined.cdsplus.fa --out_masked gencode.combined.cdsplus.dfam_masked.fa --CPU 10 --tmpdir ./tmpdir 2>&1 | tee run.dfam.log
16 | 
17 | ## blast
18 | 
19 | makeblastdb -in gencode.combined.cdsplus.dfam_masked.fa -dbtype nucl
20 | 
21 | blastn -query  gencode.combined.cdsplus.dfam_masked.fa -db  gencode.combined.cdsplus.dfam_masked.fa -max_target_seqs 10000 -outfmt 6 -evalue 1e-3 -lcase_masking  -num_threads 20 -word_size 11  >  blast_pairs.outfmt6 
22 | 
23 | 
24 | ## prep for paralog clustering
25 | 
26 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/outfmt6_add_percent_match_length.group_segments.pl blast_pairs.outfmt6 gencode.combined.cdsplus.fa gencode.combined.cdsplus.fa > blast_pairs.outfmt6.grouped
27 | 
28 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/blast_outfmt6_replace_trans_id_w_gene_symbol.pl gencode.combined.cdsplus.dfam_masked.fa  blast_pairs.outfmt6.grouped >  blast_pairs.outfmt6.grouped.genesym
29 | 
30 | cat blast_pairs.outfmt6.grouped.genesym | sort -k4,4g -k3,3gr > blast_pairs.outfmt6.grouped.genesym.sorted
31 | 
32 | ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/get_top_blast_pairs.pl blast_pairs.outfmt6.grouped.genesym.sorted > blast_pairs.outfmt6.grouped.genesym.sorted.top
33 | 
34 |  ~/GITHUB/CTAT_FUSIONS/FusionBenchmarking/util/paralog_clustering_util/outfmt6_add_percent_match_length.group_segments.to_Markov_Clustering.pl  --outfmt6_grouped  blast_pairs.outfmt6.grouped.genesym.sorted.top --min_pct_len 1 --min_per_id 90 --inflation_factor 3
35 | 


--------------------------------------------------------------------------------
/resources/paralog_clusters.dat:
--------------------------------------------------------------------------------
1 | paralog_clusters.2020.I3.dat


--------------------------------------------------------------------------------
/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | 
 6 | ## Run analyses first for the simulated data and then for the cancer cell line data.
 7 | 
 8 | dirs=(simulated_data cancer_cell_lines runtime_analysis) 
 9 | 
10 | for dir in ${dirs[*]}
11 | do
12 |     cd $dir
13 |     ./runMe.sh $*
14 |     cd ../
15 | done
16 | 
17 | 
18 | 
19 | # gather main and supp. figures for paper
20 | 
21 | ./util/__get_figs_for_paper.pl .
22 | 
23 | 


--------------------------------------------------------------------------------
/runtime_analysis/STAR_F_multicore/runtimes.txt:
--------------------------------------------------------------------------------
 1 | sample	prog	time_h
 2 | G20476.DMS_454.2	1core	1.98
 3 | G20476.DMS_454.2	2core	1.23
 4 | G20476.DMS_454.2	5core	1.01
 5 | G20476.DMS_454.2	10core	0.82
 6 | G20495.786-O.2	1core	2.01
 7 | G20495.786-O.2	2core	1.03
 8 | G20495.786-O.2	5core	0.53
 9 | G20495.786-O.2	10core	0.36
10 | G20498.KYSE-180.2	1core	1.75
11 | G20498.KYSE-180.2	2core	0.97
12 | G20498.KYSE-180.2	5core	0.54
13 | G20498.KYSE-180.2	10core	0.51
14 | G20500.IGR-37.2	1core	1.78
15 | G20500.IGR-37.2	2core	1.01
16 | G20500.IGR-37.2	5core	0.55
17 | G20500.IGR-37.2	10core	0.52
18 | G25214.MKN7.1	1core	1.83
19 | G25214.MKN7.1	2core	1.06
20 | G25214.MKN7.1	5core	0.63
21 | G25214.MKN7.1	10core	0.63
22 | G25225.NCI-H522.1	1core	1.81
23 | G25225.NCI-H522.1	2core	1.02
24 | G25225.NCI-H522.1	5core	0.60
25 | G25225.NCI-H522.1	10core	0.46
26 | G26175.A172.2	1core	1.90
27 | G26175.A172.2	2core	1.06
28 | G26175.A172.2	5core	0.56
29 | G26175.A172.2	10core	0.52
30 | G26182.KMS-12-BM.2	1core	1.95
31 | G26182.KMS-12-BM.2	2core	1.07
32 | G26182.KMS-12-BM.2	5core	0.67
33 | G26182.KMS-12-BM.2	10core	0.43
34 | G26199.LN-229.2	1core	1.88
35 | G26199.LN-229.2	2core	1.06
36 | G26199.LN-229.2	5core	0.55
37 | G26199.LN-229.2	10core	0.39
38 | G26212.A-673.2	1core	1.93
39 | G26212.A-673.2	2core	1.07
40 | G26212.A-673.2	5core	0.56
41 | G26212.A-673.2	10core	0.57
42 | 


--------------------------------------------------------------------------------
/runtime_analysis/cleanMe.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Remove the runtime boxplot PDFs from both runtime data directories.

set -ev

for runtimes_dir in ./STAR_F_multicore ./all_progs_cancer
do
    rm -f "$runtimes_dir/runtimes.txt.boxplot.pdf"
done
6 | 
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------
/runtime_analysis/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | ../util/boxplot_runtimes.Rscript ./STAR_F_multicore/runtimes.txt
 6 | 
 7 | ../util/boxplot_runtimes.Rscript ./all_progs_cancer/runtimes.txt
 8 | 
 9 | 
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/simulated_data/SuppTable-sim_reads.csv:
--------------------------------------------------------------------------------
 1 | Simulated read length,simulated read data set name,modeled on original data,original data source
 2 | sim_50,sim_adipose,ERR030880,ArrayExpress E-MTAB-513:ERR030880
 3 | sim_50,sim_brain,ERR030882,ArrayExpress E-MTAB-513:ERR030882
 4 | sim_50,sim_colon,ERR030884,ArrayExpress E-MTAB-513:ERR030884
 5 | sim_50,sim_heart,ERR030886,ArrayExpress E-MTAB-513:ERR030886
 6 | sim_50,sim_testis,ERR030873,ArrayExpress E-MTAB-513:ERR030873
 7 | sim_101,sim1_reads,G27488.SNU-620.2,CCLE:b146a903-8d81-4d13-a630-22c944087bf4
 8 | sim_101,sim2_reads,G28535.OVTOKO,CCLE:24799388-008d-4011-a181-f4b2070b8bb0
 9 | sim_101,sim3_reads,G25214.MKN7,CCLE:23056430-2489-4922-ac0b-299b7f43e74e
10 | sim_101,sim4_reads,G30608.SW_780,CCLE:a3e56efd-459e-44a2-be92-1abf59ee7ff3
11 | sim_101,sim5_reads,G27376.COLO_792.1,CCLE:5c9df572-dd10-420f-980a-17ae004dfc08
12 | 


--------------------------------------------------------------------------------
/simulated_data/cleanMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | dirs=(sim_101 sim_50)
 4 | 
 5 | for dir in ${dirs[*]}
 6 | do
 7 |     cd $dir
 8 |     ./cleanMe.sh
 9 |     cd ..
10 | done
11 | 
12 | rm -f ./*.pdf
13 |            
14 |            
15 | 


--------------------------------------------------------------------------------
/simulated_data/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | dirs=(sim_50 sim_101)
 6 | 
 7 | # run analyses separately for the sim_50 and sim_101 data
 8 | for dir in ${dirs[*]}
 9 | do
10 |     cd $dir
11 |     ./runMe.sh $*
12 |     cd ../
13 | done
14 | 
15 | ####################################
16 | # combine results into single figure
17 | 
18 | ## allow rev
19 | ../benchmarking/plotters/plot_AUC_50_vs_101_boxplots.Rscript sim_50/__analyze_allow_reverse/all.AUC.dat sim_101/__analyze_allow_reverse/all.AUC.dat  allow_rev.combined.pdf
20 | 
21 | ## allow rev & paralogs-ok
22 | ../benchmarking/plotters/plot_AUC_50_vs_101_boxplots.Rscript sim_50/__analyze_allow_rev_and_paralogs/all.AUC.dat sim_101/__analyze_allow_rev_and_paralogs/all.AUC.dat  allow_rev_and_paralogs.combined.pdf
23 | 
24 |            
25 |            
26 | 


--------------------------------------------------------------------------------
/simulated_data/sim_101/cleanMe.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Remove the sim_101 result listing, collected predictions and pipeline log,
# plus every underscore-prefixed entry in this directory.

rm -f ./fusion_result_file_listing.dat
rm -f ./preds.*
rm -f ./pipe.log

rm -rf ./_*
5 | 
6 | 


--------------------------------------------------------------------------------
/simulated_data/sim_101/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | if [ ! -d samples ]; then
 6 |     wget -r --no-parent https://data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_101/samples/
 7 |     find data.broadinstitute.org/|grep html | xargs -n1 rm -f
 8 |     mv data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_101/samples .
 9 |     rm -rf ./data.broadinstitute.org
10 | fi
11 | 
12 | 
13 | ../analyze_simulated_data.pl sim_101.truth_set.dat sim_101.fusion_TPM_values.dat $*
14 | 


--------------------------------------------------------------------------------
/simulated_data/sim_50/cleanMe.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Remove the sim_50 result listing, collected predictions, logs and AUC table,
# plus every underscore-prefixed entry in this directory.

for target in ./fusion_result_file_listing.dat ./preds.* ./pipe.log ./log ./all.AUC.dat
do
    rm -f "$target"
done

rm -rf ./_*
5 | 
6 | 


--------------------------------------------------------------------------------
/simulated_data/sim_50/runMe.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -ev
 4 | 
 5 | if [ ! -d samples ]; then
 6 |     wget -r --no-parent https://data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_50/samples/
 7 |     find data.broadinstitute.org/|grep html | xargs -n1 rm -f
 8 |     mv data.broadinstitute.org/Trinity/CTAT_FUSIONTRANS_BENCHMARKING/on_simulated_data/sim_50/samples .
 9 |     rm -rf ./data.broadinstitute.org
10 | fi
11 | 
12 | 
13 | ../analyze_simulated_data.pl sim_50.truth_set.dat sim_50.fusion_TPM_values.dat $*
14 | 


--------------------------------------------------------------------------------
/util/Terra/organize_FI_results_for_benchmarking.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys, os, re
  4 | import subprocess
  5 | import logging
  6 | 
  7 | logging.basicConfig(stream=sys.stderr, level=logging.INFO)
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | def main():
 11 |     
 12 |     usage = "\n\tusage: {} local.FI.files.list ANALYSIS_NAME\n\n".format(sys.argv[0])
 13 |     if len(sys.argv) < 3:
 14 |         sys.stderr.write(usage)
 15 |         sys.exit(1)
 16 | 
 17 |     local_files_list = sys.argv[1]
 18 |     analysis_name = sys.argv[2]
 19 | 
 20 | 
 21 |     translation = get_sample_name_translation()
 22 | 
 23 |     with open(local_files_list) as fh:
 24 |         for filename in fh:
 25 |             filename = filename.rstrip()
 26 |             if filename[-3:] != ".gz":
 27 |                 raise RuntimeError("Error, not identifying filename {} as gzipped")
 28 | 
 29 |             sample_name = os.path.basename(filename)
 30 |             sample_name, count = re.subn(".FusionInspector.tsv.gz", "", sample_name) # when FI run as follow-up to starF
 31 |             if count != 1:
 32 |                 sample_name, count = re.subn(".finspector.FusionInspector.fusions.abridged.tsv.gz", "", sample_name) # when FI run separately
 33 |                 if count != 1:
 34 |                     raise RuntimeError("didn't find .FusionInspector.tsv.gz or .finspector.FusionInspector.fusions.abridged.tsv.gz in sample_name: {}".format(sample_name))
 35 | 
 36 | 
 37 |             if sample_name in translation:
 38 |                 sample_name = translation[sample_name]
 39 |                 
 40 |             outdir = "/".join(["samples", sample_name, analysis_name])
 41 |             if not os.path.exists(outdir):
 42 |                 os.makedirs(outdir)
 43 | 
 44 |             outputfile = os.path.join(outdir, "finspector.fusion_predictions.abridged.tsv")
 45 | 
 46 |             logger.info("-writing {}".format(outputfile))
 47 |             subprocess.check_call(" ".join(["gunzip", "-c", filename, ">", outputfile]), shell=True)
 48 |             
 49 | 
 50 | 
 51 |     sys.exit(0)
 52 | 
 53 | 
 54 | def get_sample_name_translation():
 55 | 
 56 |     translation = dict()
 57 | 
 58 |     pairs_txt = """G20476_DMS_454_2	G20476.DMS_454.2
 59 | G20495_786-O_2	G20495.786-O.2
 60 | G20498_KYSE-180_2	G20498.KYSE-180.2
 61 | G20500_IGR-37_2	G20500.IGR-37.2
 62 | G25214_MKN7_1	G25214.MKN7.1
 63 | G25225_NCI-H522_1	G25225.NCI-H522.1
 64 | G26175_A172_2	G26175.A172.2
 65 | G26182_KMS-12-BM_2	G26182.KMS-12-BM.2
 66 | G26199_LN-229_2	G26199.LN-229.2
 67 | G26212_A-673_2	G26212.A-673.2
 68 | G26216_KP-2_2	G26216.KP-2.2
 69 | G26228_Hs_683_2	G26228.Hs_683.2
 70 | G26236_NCI-H716_2	G26236.NCI-H716.2
 71 | G26249_KMS-26_2	G26249.KMS-26.2
 72 | G26253_KMS-34_2	G26253.KMS-34.2
 73 | G26262_NCI-H889_2	G26262.NCI-H889.2
 74 | G27214_PC-3_1	G27214.PC-3.1
 75 | G27219_Panc_03_27_1	G27219.Panc_03.27.1
 76 | G27233_A-498_1	G27233.A-498.1
 77 | G27259_AN3_CA_1	G27259.AN3_CA.1
 78 | G27280_TC-71_1	G27280.TC-71.1
 79 | G27367_BFTC-909_1	G27367.BFTC-909.1
 80 | G27376_COLO_792_1	G27376.COLO_792.1
 81 | G27453_SNU-398_2	G27453.SNU-398.2
 82 | G27463_SK-MEL-1_2	G27463.SK-MEL-1.2
 83 | G27476_PK-59_2	G27476.PK-59.2
 84 | G27479_SK-MEL-3_2	G27479.SK-MEL-3.2
 85 | G27488_SNU-620_2	G27488.SNU-620.2
 86 | G27516_SK-MEL-28_2	G27516.SK-MEL-28.2
 87 | G27544_SF268_2	G27544.SF268.2
 88 | G28011_KLE_1	G28011.KLE.1
 89 | G28034_MDA-MB-361_1	G28034.MDA-MB-361.1
 90 | G28045_KYSE-270_1	G28045.KYSE-270.1
 91 | G28050_KMM-1_1	G28050.KMM-1.1
 92 | G28054_KYSE-520_1	G28054.KYSE-520.1
 93 | G28070_LN-18_1	G28070.LN-18.1
 94 | G28072_MDA-MB-175-VII_1	G28072.MDA-MB-175-VII.1
 95 | G28077_MG-63_1	G28077.MG-63.1
 96 | G28081_JHH-7_1	G28081.JHH-7.1
 97 | G28087_MDA-MB-436_1	G28087.MDA-MB-436.1
 98 | G28535_OVTOKO_1	G28535.OVTOKO.1
 99 | G28545_NUGC-2_1	G28545.NUGC-2.1
100 | G28575_OUMS-23_1	G28575.OUMS-23.1
101 | G28610_MHH-ES-1_1	G28610.MHH-ES-1.1
102 | G30594_UACC-893_1	G30594.UACC-893.1
103 | G30631_SU-DHL-10_1	G30631.SU-DHL-10.1
104 | G41663_OVISE_5	G41663.OVISE.5
105 | G41682_KYSE-510_5	G41682.KYSE-510.5
106 | G41706_RT4_5	G41706.RT4.5
107 | G41709_FaDu_5	G41709.FaDu.5
108 | G41710_SNU-16_5	G41710.SNU-16.5
109 | G41724_HGC-27_5	G41724.HGC-27.5"""
110 | 
111 |     for line in pairs_txt.split("\n"):
112 |         (before, after) = re.split("\s+", line)
113 |         translation[before] = after
114 | 
115 | 
116 |     return translation
117 | 
118 | 
119 | 
120 | if __name__=='__main__':
121 |     main()
122 | 


--------------------------------------------------------------------------------
/util/Terra/organize_StarF_results_for_benchmarking.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys, os, re
  4 | import subprocess
  5 | import logging
  6 | 
  7 | logging.basicConfig(stream=sys.stderr, level=logging.INFO)
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | def main():
 11 | 
 12 |     usage = "\n\tusage: {} local.starF.files.list ANALYSIS_NAME\n\n".format(sys.argv[0])
 13 |     if len(sys.argv) < 3:
 14 |         sys.stderr.write(usage)
 15 |         sys.exit(1)
 16 | 
 17 |     local_files_list = sys.argv[1]
 18 |     analysis_name = sys.argv[2]
 19 | 
 20 |     translation = get_sample_name_translation()
 21 |     
 22 | 
 23 |     with open(local_files_list) as fh:
 24 |         for filename in fh:
 25 |             filename = filename.rstrip()
 26 |             if filename[-3:] != ".gz":
 27 |                 raise RuntimeError("Error, not identifying filename {} as gzipped")
 28 | 
 29 |             sample_name = os.path.basename(filename)
 30 |             sample_name, count = re.subn(".STAR-Fusion.tsv.gz", "", sample_name)
 31 |             if count != 1:
 32 |                 raise RuntimeError("didn't find .STAR-Fusion.tsv.gz in sample_name: {}".format(sample_name))
 33 | 
 34 |             if sample_name in translation:
 35 |                 sample_name = translation[sample_name]
 36 |             
 37 |             outdir = "/".join(["samples", sample_name, analysis_name])
 38 |             if not os.path.exists(outdir):
 39 |                 os.makedirs(outdir)
 40 | 
 41 |             outputfile = os.path.join(outdir, "star-fusion.fusion_predictions.abridged.tsv")
 42 | 
 43 |             logger.info("-writing {}".format(outputfile))
 44 |             subprocess.check_call(" ".join(["gunzip", "-c", filename, ">", outputfile]), shell=True)
 45 |             
 46 | 
 47 | 
 48 |     sys.exit(0)
 49 | 
 50 | 
 51 | 
 52 | def get_sample_name_translation():
 53 | 
 54 |     translation = dict()
 55 | 
 56 |     pairs_txt = """G20476_DMS_454_2	G20476.DMS_454.2
 57 | G20495_786-O_2	G20495.786-O.2
 58 | G20498_KYSE-180_2	G20498.KYSE-180.2
 59 | G20500_IGR-37_2	G20500.IGR-37.2
 60 | G25214_MKN7_1	G25214.MKN7.1
 61 | G25225_NCI-H522_1	G25225.NCI-H522.1
 62 | G26175_A172_2	G26175.A172.2
 63 | G26182_KMS-12-BM_2	G26182.KMS-12-BM.2
 64 | G26199_LN-229_2	G26199.LN-229.2
 65 | G26212_A-673_2	G26212.A-673.2
 66 | G26216_KP-2_2	G26216.KP-2.2
 67 | G26228_Hs_683_2	G26228.Hs_683.2
 68 | G26236_NCI-H716_2	G26236.NCI-H716.2
 69 | G26249_KMS-26_2	G26249.KMS-26.2
 70 | G26253_KMS-34_2	G26253.KMS-34.2
 71 | G26262_NCI-H889_2	G26262.NCI-H889.2
 72 | G27214_PC-3_1	G27214.PC-3.1
 73 | G27219_Panc_03_27_1	G27219.Panc_03.27.1
 74 | G27233_A-498_1	G27233.A-498.1
 75 | G27259_AN3_CA_1	G27259.AN3_CA.1
 76 | G27280_TC-71_1	G27280.TC-71.1
 77 | G27367_BFTC-909_1	G27367.BFTC-909.1
 78 | G27376_COLO_792_1	G27376.COLO_792.1
 79 | G27453_SNU-398_2	G27453.SNU-398.2
 80 | G27463_SK-MEL-1_2	G27463.SK-MEL-1.2
 81 | G27476_PK-59_2	G27476.PK-59.2
 82 | G27479_SK-MEL-3_2	G27479.SK-MEL-3.2
 83 | G27488_SNU-620_2	G27488.SNU-620.2
 84 | G27516_SK-MEL-28_2	G27516.SK-MEL-28.2
 85 | G27544_SF268_2	G27544.SF268.2
 86 | G28011_KLE_1	G28011.KLE.1
 87 | G28034_MDA-MB-361_1	G28034.MDA-MB-361.1
 88 | G28045_KYSE-270_1	G28045.KYSE-270.1
 89 | G28050_KMM-1_1	G28050.KMM-1.1
 90 | G28054_KYSE-520_1	G28054.KYSE-520.1
 91 | G28070_LN-18_1	G28070.LN-18.1
 92 | G28072_MDA-MB-175-VII_1	G28072.MDA-MB-175-VII.1
 93 | G28077_MG-63_1	G28077.MG-63.1
 94 | G28081_JHH-7_1	G28081.JHH-7.1
 95 | G28087_MDA-MB-436_1	G28087.MDA-MB-436.1
 96 | G28535_OVTOKO_1	G28535.OVTOKO.1
 97 | G28545_NUGC-2_1	G28545.NUGC-2.1
 98 | G28575_OUMS-23_1	G28575.OUMS-23.1
 99 | G28610_MHH-ES-1_1	G28610.MHH-ES-1.1
100 | G30594_UACC-893_1	G30594.UACC-893.1
101 | G30631_SU-DHL-10_1	G30631.SU-DHL-10.1
102 | G41663_OVISE_5	G41663.OVISE.5
103 | G41682_KYSE-510_5	G41682.KYSE-510.5
104 | G41706_RT4_5	G41706.RT4.5
105 | G41709_FaDu_5	G41709.FaDu.5
106 | G41710_SNU-16_5	G41710.SNU-16.5
107 | G41724_HGC-27_5	G41724.HGC-27.5"""
108 | 
109 |     for line in pairs_txt.split("\n"):
110 |         (before, after) = re.split("\s+", line)
111 |         translation[before] = after
112 | 
113 | 
114 |     return translation
115 | 
116 | 
# Run main() only when this file is executed as a script, not when it is imported.
if __name__=='__main__':
    main()
119 | 


--------------------------------------------------------------------------------
/util/__get_figs_for_paper.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | 
  6 | my $usage = "usage: $0 repo_basedir\n";
  7 | 
  8 | my $basedir = $ARGV[0] or die $usage;
  9 | 
 10 | chdir $basedir or die "Error, cannot cd to $basedir";
 11 | 
 12 | # make the dir structure
 13 | unless (-d "figs_for_paper") {
 14 |     &process_cmd("mkdir -p figs_for_paper");
 15 | }
 16 | 
 17 | 
# Table of [source, destination] path pairs. Each source is copied to its
# destination with `cp` by the loop that follows; paths are relative to the
# directory the script has chdir'ed into. The `\=` sequences inside the
# double-quoted names are interpolated by Perl as a plain `=`.
my @targets_and_dests = ( 
        
    ## Figure 2
    
    # Fig 2-a
    ["simulated_data/allow_rev_and_paralogs.combined.pdf", 
     "figs_for_paper/fig_2a.sim50_vs_101.boxplots.pdf"],

    # Fig 2-b_top, expression vs. sensitivity heatmap PE 50
    ["simulated_data/sim_50/__analyze_allow_rev_and_paralogs/all.scored.preds.sensitivity_vs_expr.dat.genes_vs_samples_heatmap.pdf", 
     "figs_for_paper/fig_2b_top.sim_50.sens_vs_expr.heatmap.pdf"],
    
    # Fig 2-b_bottom, expression vs. sensitivity heatmap PE 101
    ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.sensitivity_vs_expr.dat.genes_vs_samples_heatmap.pdf",
     "figs_for_paper/fig_2b_bottom.sim_101.sens_vs_expr.heatmap.pdf"],
    
    
    ## Figure 3
    # Fig 3-a
    ["cancer_cell_lines/Edgren_subset/edgren.min3.UpSetR.pdf",
     "figs_for_paper/fig_3a.four_breast_cancer_cell_lines_UpSetR_plot_2nd_page.pdf"],
    
    # Fig 3-b
    ["cancer_cell_lines/Edgren_subset/preds.collected.gencode_mapped.wAnnot.filt.edgren.scored.enrich_stats.pdf",
     "figs_for_paper/fig_3b.valid_fusion_enrichment_2nd_page.pdf"],
    
    ## Figure 4
    # Fig 4-a
    ["cancer_cell_lines/all.auc.rankings.iu\=1.okp\=1.boxplot.pdf",
     "figs_for_paper/fig_4a_cancer_leaderboard_rankings.pdf"],
    
    # Fig 4-d
    ["cancer_cell_lines/__min_7_agree/min_7.okPara_ignoreUnsure.results.scored.ROC.tpr_ppv_at_maxF1_scatter.pdf",
     "figs_for_paper/fig_4d_peak_accuracy_min7progsagree.pdf"],
    
    
    
    ####################
    ## Supplementary Figures
    
    # supp fig 1
    ["simulated_data/sim_50/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.TP_and_FP_counts_vs_minFrags_eaProg.pdf",
     "figs_for_paper/supp_fig1.pe50_TP_FP_vs_minReads.pdf"],
    
    # supp fig 2
    ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.TP_and_FP_counts_vs_minFrags_eaProg.pdf",
     "figs_for_paper/supp_fig2.pe101_TP_FP_vs_minReads.pdf"],
    
    # supp fig 3a
    ["simulated_data/sim_50/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.tpr_ppv_at_maxF1_scatter.pdf",
     "figs_for_paper/supp_fig3a.pe50_max_F1_scatter.pdf"],
    
    # supp fig 3b
    ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.tpr_ppv_at_maxF1_scatter.pdf",
     "figs_for_paper/supp_fig3b.pe101_max_F1_scatter.pdf"],
    
    # supp fig 4
    ["cancer_cell_lines/okPara_ignoreUnsure.results.scored.ROC.tpr_ppv_at_maxF1.dat.consolidated.scatters.pdf",
     "figs_for_paper/supp_fig4.cancer_maxF1_ea_truthset.pdf"],

    # supp fig 5
    ["simulated_data/sim_101/__analyze_allow_rev_and_paralogs/all.scored.preds.ROC.best.dat.before_vs_after.pdf",
     "figs_for_paper/supp_fig5.before_vs_after_paralog_equiv_pe101.pdf"],
    
    # supp fig 6
    ["cancer_cell_lines/preds.collected.gencode_mapped.wAnnot.filt.matrix.binary.sample_cor_matrix.pdf",
     "figs_for_paper/supp_fig6.cancer_correlated_preds.pdf"],

    # supp fig 7
    ["cancer_cell_lines/all.auc.rankings_per_prog_adj.boxplot.pdf",
     "figs_for_paper/supp_fig7.effect_iu_okp_on_cancer_ranking_dist.pdf"],
    
    # supp fig 8
    ["cancer_cell_lines/all.auc.rankings.iu\=1.okp\=0.boxplot.pdf",
     "figs_for_paper/supp_fig8.cancer_rankings_equiv_para_off.pdf"],
    
    
    ############################
    ## Supplementary data files
    
    # supp table 1
    ["simulated_data/sim_50/preds.collected.gencode_mapped.wAnnot.filt",
     "figs_for_paper/supp_table1.pe50_fusion_filtered_preds.tsv"],
    
    # supp table 2
    ["simulated_data/sim_101/preds.collected.gencode_mapped.wAnnot.filt",
     "figs_for_paper/supp_table2.pe101_fusion_filtered_preds.tsv"],
    
    # supp table 4
    ["cancer_cell_lines/preds.collected.gencode_mapped.wAnnot.filt",
     "figs_for_paper/supp_table4.cancer_fusion_filtered_preds.tsv"],

    );    
107 | 
108 | 
109 |     
# Copy each source artifact to its destination name; process_cmd() dies on
# the first copy that fails.
for my $pair (@targets_and_dests) {
    my $from = $pair->[0];
    my $to   = $pair->[1];
    process_cmd("cp $from $to");
}

exit(0);
123 | 
124 | ####
sub process_cmd {
    # Echo a shell command to STDOUT, run it via system(), and die when
    # system() reports a non-zero status.
    my $cmd = shift;

    print "CMD: $cmd\n";

    my $ret = system($cmd);
    die "Error, CMD: $cmd died with ret $ret" if $ret;
}
134 | 
135 | 


--------------------------------------------------------------------------------
/util/boxplot_runtimes.Rscript:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | args<-commandArgs(TRUE)
 4 | 
 5 | dat_filename = args[1]
 6 | 
 7 | data = read.table(dat_filename, header=T, sep='\t')
 8 | 
 9 | library('ggplot2')
10 | library('data.table')
11 | 
12 | dt = data.table(data)
13 | dt_median_time = dt[,.(median_time_h=median(time_h, na.rm=T)), by=.(prog)][order(median_time_h)]
14 | 
15 | data$prog = factor(data$prog, levels=factor(dt_median_time$prog))
16 | 
17 | write.table(dt_median_time, 'median_runtimes.txt', quote=F, sep="\t")
18 | 
19 | pdf_filename = paste(dat_filename, '.boxplot.pdf', sep='')
20 | pdf(pdf_filename)
21 | 
22 | p = ggplot(data, aes(factor(prog), time_h)) +
23 |     geom_boxplot(outlier.shape=NA) +
24 |     theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + ylim(c(0,120))
25 | 
26 | plot(p)
27 | 
28 | dev.off()
29 | 


--------------------------------------------------------------------------------
/util/capture_PR_AUC_for_plotting.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "\n\n\tusage: $0 auc_list.files\n\n";
 7 | 
 8 | my $auc_files_filename = $ARGV[0] or die $usage;
 9 | 
10 | main: {
11 |     
12 |     my @files = `cat $auc_files_filename`;
13 |     chomp @files;
14 | 
15 |     print join("\t", "progname", "min_thresh", "ignoreUnsure", "okpara", "auc") . "\n";
16 |     
17 |     foreach my $file (@files) {
18 | 
19 |         $file =~ /min_(\d+)/ or die "Erorr, no min val extracted from $file";
20 |         
21 |         my $min_thresh = $1;
22 | 
23 |         my $ignoreUnsure = 0;
24 |         if ($file =~ /ignoreUnsure/) {
25 |             $ignoreUnsure = 1;
26 |         }
27 |         
28 |         my $okpara = 0;
29 |         if ($file =~ /okPara/) {
30 |             $okpara = 1;
31 |         }
32 |         
33 |         my @data = `cat $file`;
34 |         chomp @data;
35 |         
36 |         foreach my $line (@data) {
37 |             my ($progname, $auc) = split(/\t/, $line);
38 |             print join("\t", $progname, $min_thresh, $ignoreUnsure, $okpara, $auc) . "\n";
39 |         }
40 |         
41 |     }
42 |     
43 |     exit(0);
44 |     
45 | }
46 | 


--------------------------------------------------------------------------------
/util/make_file_listing_input_table.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use Carp;
 6 | 
 7 | 
 8 | my %restrict_progs;
 9 | if (@ARGV) {
10 |     my $restrict_progs_file = $ARGV[0];
11 |     open(my $fh, $restrict_progs_file) or die "Error, cannot open file: $restrict_progs_file";
12 |     while(<$fh>) {
13 |         chomp;
14 |         unless (/\w/) { next; }
15 |         my $progname = $_;
16 |         $restrict_progs{$progname} = 1;
17 |     }
18 |     close $fh;
19 | }
20 |     
21 | 
## Convert prog name tokens (the directory name found under samples/<sample>/)
## to the names used in the data table.
## A value is either a single name or an array ref of names; with an array ref,
## the same input file is listed once under each name. Tokens absent from this
## table keep their original name.
my %converter = (CHIMERASCAN => 'ChimeraScan',
                 CHIMPIPE => 'ChimPipe',
                 DEFUSE => 'deFuse',
                 ERICSCRIPT => 'EricScript',
                 FUSIONHUNTER => 'FusionHunter',
                 #FUSION_CATCHER_V0994e => 'FusionCatcher',
                 INFUSION_CF3 => 'InFusion',
                 JAFFA_ASSEMBLY => 'JAFFA-Assembly',
                 JAFFA_DIRECT => 'JAFFA-Direct',
                 JAFFA_HYBRID => 'JAFFA-Hybrid',
                 MAPSPLICE => 'MapSplice',
                 NFUSE => 'nFuse',
                 PRADA => 'PRADA',
                 SOAP_FUSE => 'SOAP-fuse',
                 'STAR_FUSION_GRCh37v19_FL3_v51b3df4' => 'STAR_FUSION_old',
                 TOPHAT_FUSION => 'TopHat-Fusion',
                 ARRIBA => ['ARRIBA', 'ARRIBA_hc'], ## scoring regular and the hc subset separately
                 PIZZLY => 'PIZZLY',
                 STARCHIP => 'STARCHIP',
                 'STAR_FUSION_v1.5_hg19_Apr042019' => 'STAR_FUSION_v1.5',
                 STARCHIP_csm10 => 'STARChip_csm10',
                 TRINITY_FUSION_C_hg19 => 'TrinityFusion-C',
                 TRINITY_FUSION_UC_hg19 => 'TrinityFusion-UC',
                 TRINITY_FUSION_D_hg19 => 'TrinityFusion-D',
                 # both STARSEQR tokens map to the same table name
                 STARSEQR => 'STARSEQR',
                 'STARSEQR_STAR-SEQR' => 'STARSEQR' 
    );
50 | 
51 | 
# Read candidate file paths from STDIN (one per line) and print a
# tab-delimited (sample, program name, path) row for every path of the form
# .../samples/<sample>/<prog>/...
while (my $filename = <STDIN>) {
    chomp $filename;

    if (! -f $filename) {
        print STDERR "warning, $filename is not a file. Skipping...\n";
        next;
    }

    my ($sample_name, $prog) = $filename =~ m|/samples/([^/]+)/([^/]+)/|;
    unless (defined $sample_name) {
        print STDERR "WARNING: not parsing filename as a target: $filename\n";
        next;
    }

    # Honour the optional restricted program list.
    if (%restrict_progs && ! exists $restrict_progs{$prog}) {
        print STDERR "make_file_listing_input_table::  - skipping $filename, not in restricted list.\n";
        next;
    }

    my $proper_progname = $converter{$prog};

    my @report_names;
    if (! $proper_progname) {
        # no conversion known: keep original name
        @report_names = ($prog);
    }
    elsif (ref $proper_progname) {
        # several names: the same file is listed once per name
        @report_names = @$proper_progname;
    }
    else {
        @report_names = ($proper_progname);
    }

    print join("\t", $sample_name, $_, $filename) . "\n" for @report_names;
}


exit(0);
96 | 
97 | 
98 |     
99 | 


--------------------------------------------------------------------------------
/util/make_supp_AUC_table.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use File::Basename;
 6 | 
 7 | my $usage = "usage: $0 search_dir token \n\n";
 8 | 
 9 | my $search_dir = $ARGV[0] or die $usage;
10 | my $token = $ARGV[1] or die $usage;
11 | 
12 | my $cmd = "find $search_dir -regex \".\*fusion_preds.txt.scored.PR.AUC\" ";
13 | 
14 | my @files = `$cmd`;
15 | chomp @files;
16 | 
17 | print join("\t", "read_set", "data_set", "progname", "AUC") . "\n";
18 | 
19 | foreach my $file (@files) {
20 | 
21 |     print STDERR "-processing $file\n";
22 |     open (my $fh, $file) or die "Error, cannot open file: $file";
23 | 
24 |     my $data_set_name = basename(dirname($file));
25 |     
26 |     
27 |     while (<$fh>) {
28 |         print join("\t", $token, $data_set_name, $_);
29 |     }
30 | 
31 |     close $fh;
32 |     
33 | }
34 | 
35 | exit(0);
36 | 
37 | 


--------------------------------------------------------------------------------
/util/make_supp_ROC_table.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use File::Basename;
 6 | 
 7 | 
 8 | my $usage = "usage: $0 search_dir token \n\n";
 9 | 
10 | my $search_dir = $ARGV[0] or die $usage;
11 | my $token = $ARGV[1] or die $usage;
12 | 
13 | my $cmd = "find $search_dir -regex \".\*fusion_preds.txt.scored.ROC\" ";
14 | 
15 | my @files = `$cmd`;
16 | 
17 | my $printed_header_flag = 0;
18 | 
19 | foreach my $file (@files) {
20 | 
21 |     print STDERR "-processing $file\n";
22 |     open (my $fh, $file) or die "Error, cannot open file: $file";
23 |     
24 |     my $data_set_name = basename(dirname($file));
25 | 
26 | 
27 |     my $header = <$fh>;
28 |     unless ($printed_header_flag) {
29 |         print join("\t", "read_set", "data_set", $header);
30 |         $printed_header_flag = 1;
31 |     }
32 |     while (<$fh>) {
33 |         print join("\t", $token, $data_set_name, $_);
34 |     }
35 | 
36 |     close $fh;
37 | 
38 | }
39 | 
40 | exit(0);
41 | 
42 | 


--------------------------------------------------------------------------------
/util/paralog_clustering_util/README.md:
--------------------------------------------------------------------------------
 1 | # Instructions for computing approximate paralog clusters and simpler blast match clusters for annotating suspicious fusion calls.
 2 | 
 3 | ## blastn
 4 | 
 5 |     blastn -query ref_annot.cdna -db ref_annot.cdna -max_target_seqs 1000 -outfmt 6 -evalue 1e-10 -num_threads 20 -word_size 11  >  blast_pairs.outfmt6
 6 | 
 7 | 
 8 | ## group segments
 9 | 
10 |     outfmt6_add_percent_match_length.group_segments.pl  blast_pairs.outfmt6 ref_annot.cdna ref_annot.cdna > blast_pairs.outfmt6.grouped
11 | 
12 | ## replace with gene symbols
13 | 
14 |     blast_outfmt6_replace_trans_id_w_gene_symbol.pl ref_annot.cdna blast_pairs.outfmt6.grouped > blast_pairs.outfmt6.grouped.genesym
15 | 
16 | 
## sort by E-value (ascending), then percent identity (descending)
18 |     
19 |     cat  blast_pairs.outfmt6.grouped.genesym | sort -k4,4g -k3,3gr  > blast_pairs.outfmt6.grouped.genesym.sorted
20 | 
21 | 
## get the top match for each gene pair
23 | 
24 |     get_top_blast_pairs.pl blast_pairs.outfmt6.grouped.genesym.sorted > blast_pairs.outfmt6.grouped.genesym.sorted.top
25 | 
## perform Markov clustering
27 | 
28 |     outfmt6_add_percent_match_length.group_segments.to_Markov_Clustering.pl --outfmt6_grouped blast_pairs.outfmt6.grouped.genesym.sorted.top --min_pct_len 1 --min_per_id 90 --inflation_factor 5
29 | 
30 |     ln -s dump.out.blast_pairs.outfmt6.grouped.genesym.sorted.top.minLEN_1_pct_len.minPID_90.abc.mci.I50 paralog_clusters.txt
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/util/paralog_clustering_util/blast_outfmt6_replace_trans_id_w_gene_symbol.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | my $usage = "\n\n\tusage: $0 search_db.fasta  blast_results.outfmt6\n\n";
 7 | 
 8 | my $search_db = $ARGV[0] or die $usage;
 9 | my $blast_outfmt6 = $ARGV[1] or die $usage;
10 | 
11 | 
12 | main: {
13 | 
14 |     my %trans_to_gene_symbol = &parse_headers($search_db);
15 | 
16 | 
17 |     open (my $fh, $blast_outfmt6) or die "Error, cannot open file $blast_outfmt6";
18 |     while (<$fh>) {
19 |         if (/^\#/) { next; }
20 |         chomp;
21 |         my @x = split(/\t/);
22 |         my $transA = $x[0];
23 |         my $geneA = $trans_to_gene_symbol{$transA} or die "Error, no gene for $transA";
24 |         my $transB = $x[1];
25 |         my $geneB = $trans_to_gene_symbol{$transB} or die "Error, no gene for $transB";
26 |         
27 |         $x[0] = $geneA;
28 |         $x[1] = $geneB;
29 | 
30 |         if ($geneA ne $geneB) {
31 |             print join("\t", @x) . "\n";
32 |         }
33 |     }
34 |     close $fh;
35 | 
36 |     exit(0);
37 | 
38 | 
39 | }
40 | 
41 | 
42 | ####
sub parse_headers {
    # Map transcript ids to gene symbols using the FASTA header lines of
    # $search_db. A header is read as '>trans_id gene_id [gene_sym]'; when the
    # third field is missing, the gene id stands in for the symbol.
    # Returns the mapping as a flat key/value list. Dies if the file cannot be
    # opened.
    my ($search_db) = @_;

    my %trans_to_sym;

    open (my $fh, '<', $search_db) or die "Error, cannot open file $search_db ($!)";
    while (<$fh>) {
        chomp;
        if (/^>/) {
            s/^>//;
            # split(' ') ignores leading whitespace, so '> id ...' still
            # yields the id as the first field
            my ($trans_id, $gene_id, $gene_sym) = split(' ', $_);

            unless (defined $trans_id) { next; } # bare '>' line

            unless (defined $gene_sym) {
                $gene_sym = $gene_id;
            }

            $trans_to_sym{$trans_id} = $gene_sym;
        }
    }

    close $fh;


    return(%trans_to_sym);
}
68 | 


--------------------------------------------------------------------------------
/util/paralog_clustering_util/get_top_blast_pairs.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | 
 7 | my $usage = "\n\n\tusage: $0 blastn.outfmt6.grouped.geneSym.sorted\n\n** NOTE, MUST BE PRE-SORTED like so:\n"
 8 |     . "     cat blastn.outfmt6.grouped.geneSym | sort -k4,4g -k3,3gr > blastn.outfmt6.grouped.geneSym.sorted \n\n\n";
 9 | 
10 | my $input_file = $ARGV[0] or die $usage;
11 | 
12 | 
13 | main: {
14 |     
15 | 
16 |     # m blastn.outfmt6.grouped.geneSym | sort -k4,4g -k3,3gr > blastn.outfmt6.grouped.geneSym.sorted
17 | 
18 | 
19 |     my %data;
20 |     
21 |     open (my $fh, $input_file) or die "Error, cannot open file: $input_file";
22 |     while (<$fh>) {
23 |         my $line = $_;
24 |         chomp;
25 |         my @x = split(/\t/);
26 |         my $geneA = $x[0];
27 |         my $geneB = $x[1];
28 |         
29 |         my $token = join("$;", sort ($geneA, $geneB) );
30 | 
31 |         unless ($data{$token}) {
32 |             $data{$token} = 1;
33 |             print $line;
34 |         }
35 | 
36 |     }
37 | 
38 |     exit(0);
39 | }
40 | 
41 | 
42 | 
43 |         
44 | 


--------------------------------------------------------------------------------
/util/paralog_clustering_util/outfmt6_add_percent_match_length.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | use FindBin;
 6 | use lib ("$FindBin::Bin/../../lib");
 7 | use Fasta_reader;
 8 | use List::Util qw(min max);
 9 | 
# Append five columns to every row of a tab-delimited blast result file:
# query length, percent of query covered by the hit, target length, percent of
# target covered by the hit, and the larger of the two percentages.
# Columns 7-8 and 9-10 (1-based) are read as the hit's start/end coordinates on
# the query and target respectively.
my $usage = "\n\n\tusage: $0 blast.outfmt6 query_fasta target_fasta\n";

my $blast_file = $ARGV[0] or die $usage;
my $query_fasta = $ARGV[1] or die $usage;
my $target_fasta = $ARGV[2] or die $usage;

main: {

    my %query_seq_lens = &get_seq_lengths($query_fasta);

    # avoid parsing the same FASTA file twice for self-vs-self searches
    my %target_seq_lens;
    if ($query_fasta eq $target_fasta) {
        %target_seq_lens = %query_seq_lens;
    }
    else {
        %target_seq_lens = &get_seq_lengths($target_fasta);
    }

    open (my $fh, '<', $blast_file) or die "Error, cannot open file $blast_file ($!)";
    while (<$fh>) {
        chomp;
        # skip comment and blank lines
        if (/^\#/ || ! /\S/) { next; }
        my @x = split(/\t/);
        my $query_acc = $x[0];
        my $target_acc = $x[1];

        my $query_len = $query_seq_lens{$query_acc} or die "Error, cannot find seq length for query: $query_acc";
        my $target_len = $target_seq_lens{$target_acc} or die "Error, cannot find seq length for target: $target_acc";

        # start/end coordinates are inclusive, so the span is |end - start| + 1
        # (abs() because the target coordinates may be reversed)
        my $query_hit_len = abs($x[7]-$x[6]) + 1;
        my $db_hit_len = abs($x[9]-$x[8]) + 1;

        my $pct_query_len = sprintf("%.2f", $query_hit_len / $query_len * 100);
        my $pct_target_len = sprintf("%.2f", $db_hit_len / $target_len * 100);

        push (@x, $query_len, $pct_query_len, $target_len, $pct_target_len, max($pct_query_len, $pct_target_len));

        print join("\t", @x) . "\n";
    }
    close $fh;

    exit(0);
}
51 | 
52 | ####
sub get_seq_lengths {
    # Return a flat (accession => sequence length) list for every entry that
    # Fasta_reader yields from $fasta_file.
    my $fasta_file = shift;

    my %length_of;

    my $reader = Fasta_reader->new($fasta_file);
    while (my $entry = $reader->next()) {
        $length_of{ $entry->get_accession() } = length($entry->get_sequence());
    }

    return(%length_of);
}
69 | 
70 | 


--------------------------------------------------------------------------------
/util/terra_partition_to_sample_dirs.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import sys, os, re
  4 | import subprocess
  5 | 
  6 | def main():
  7 |     usage = "\n\tusage: {} files.list.file progname_token\n\n".format(sys.argv[0])
  8 | 
  9 |     if len(sys.argv) < 3:
 10 |         print(usage, file=sys.stderr)
 11 |         sys.exit(1)
 12 | 
 13 |     files_list_file = sys.argv[1]
 14 |     progname_token = sys.argv[2]
 15 | 
 16 |     if not os.path.exists("samples"):
 17 |         os.makedirs("samples")
 18 | 
 19 |     with open(files_list_file) as fh:
 20 |         for filename in fh:
 21 |             orig_filename = filename.rstrip()
 22 |             filename = os.path.basename(orig_filename)
 23 |             filename_pts = filename.split(".")
 24 | 
 25 |             samplename = filename_pts[0]
 26 |             if samplename in sample_conversions:
 27 |                 samplename = sample_conversions[samplename]
 28 |                 print("-renamed samplename to: {}".format(samplename))
 29 |             
 30 |             dest_dir = "samples/{}/{}".format(samplename, progname_token)
 31 |             if not os.path.exists(dest_dir):
 32 |                 os.makedirs(dest_dir)
 33 |                 
 34 |             cmd = "cp {} {}".format(orig_filename, dest_dir)
 35 |             subprocess.check_call(cmd, shell=True)
 36 |             print(cmd, file=sys.stderr)
 37 | 
 38 |     sys.exit(0)
 39 |                 
 40 | 
 41 | sample_conversions = {
 42 |     'G20476_DMS_454_2' : "G20476.DMS_454.2",
 43 |     "G20495_786-O_2" : "G20495.786-O.2",
 44 |     "G20498_KYSE-180_2" : "G20498.KYSE-180.2",
 45 |     "G20500_IGR-37_2" : "G20500.IGR-37.2",
 46 |     "G25214_MKN7_1" : "G25214.MKN7.1",
 47 |     "G25225_NCI-H522_1" : "G25225.NCI-H522.1",
 48 |     "G26175_A172_2" : "G26175.A172.2",
 49 |     "G26182_KMS-12-BM_2" : "G26182.KMS-12-BM.2",
 50 |     "G26199_LN-229_2" : "G26199.LN-229.2",
 51 |     "G26212_A-673_2" : "G26212.A-673.2",
 52 |     "G26216_KP-2_2" : "G26216.KP-2.2",
 53 |     "G26228_Hs_683_2" : "G26228.Hs_683.2",
 54 |     "G26236_NCI-H716_2" : "G26236.NCI-H716.2",
 55 |     "G26249_KMS-26_2" : "G26249.KMS-26.2",
 56 |     "G26253_KMS-34_2" : "G26253.KMS-34.2",
 57 |     "G26262_NCI-H889_2" : "G26262.NCI-H889.2",
 58 |     "G27214_PC-3_1" : "G27214.PC-3.1",
 59 |     "G27219_Panc_03_27_1" : "G27219.Panc_03.27.1",
 60 |     "G27233_A-498_1" : "G27233.A-498.1",
 61 |     "G27259_AN3_CA_1" : "G27259.AN3_CA.1",
 62 |     "G27280_TC-71_1" : "G27280.TC-71.1",
 63 |     "G27367_BFTC-909_1" : "G27367.BFTC-909.1",
 64 |     "G27376_COLO_792_1" : "G27376.COLO_792.1",
 65 |     "G27453_SNU-398_2" : "G27453.SNU-398.2",
 66 |     "G27463_SK-MEL-1_2" : "G27463.SK-MEL-1.2",
 67 |     "G27476_PK-59_2" : "G27476.PK-59.2",
 68 |     "G27479_SK-MEL-3_2" : "G27479.SK-MEL-3.2",
 69 |     "G27488_SNU-620_2" : "G27488.SNU-620.2",
 70 |     "G27516_SK-MEL-28_2" : "G27516.SK-MEL-28.2",
 71 |     "G27544_SF268_2" : "G27544.SF268.2",
 72 |     "G28011_KLE_1" : "G28011.KLE.1",
 73 |     "G28034_MDA-MB-361_1" : "G28034.MDA-MB-361.1",
 74 |     "G28045_KYSE-270_1" : "G28045.KYSE-270.1",
 75 |     "G28050_KMM-1_1" : "G28050.KMM-1.1",
 76 |     "G28054_KYSE-520_1" : "G28054.KYSE-520.1",
 77 |     "G28070_LN-18_1" : "G28070.LN-18.1",
 78 |     "G28072_MDA-MB-175-VII_1" : "G28072.MDA-MB-175-VII.1",
 79 |     "G28077_MG-63_1" : "G28077.MG-63.1",
 80 |     "G28081_JHH-7_1" : "G28081.JHH-7.1",
 81 |     "G28087_MDA-MB-436_1" : "G28087.MDA-MB-436.1",
 82 |     "G28535_OVTOKO_1" : "G28535.OVTOKO.1",
 83 |     "G28545_NUGC-2_1" : "G28545.NUGC-2.1",
 84 |     "G28575_OUMS-23_1" : "G28575.OUMS-23.1",
 85 |     "G28610_MHH-ES-1_1" : "G28610.MHH-ES-1.1",
 86 |     "G30594_UACC-893_1" : "G30594.UACC-893.1",
 87 |     "G30631_SU-DHL-10_1" : "G30631.SU-DHL-10.1",
 88 |     "G41663_OVISE_5" : "G41663.OVISE.5",
 89 |     "G41682_KYSE-510_5" : "G41682.KYSE-510.5",
 90 |     "G41706_RT4_5" : "G41706.RT4.5",
 91 |     "G41709_FaDu_5" : "G41709.FaDu.5",
 92 |     "G41710_SNU-16_5" : "G41710.SNU-16.5",
 93 |     "G41724_HGC-27_5" : "G41724.HGC-27.5" }
 94 | 
 95 |     
# Run main() only when this file is executed as a script, not when it is imported.
if __name__=='__main__':
    main()
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------