├── dnanexus └── cis-x │ ├── resources │ └── .gitkeep │ ├── README.developer.md │ ├── src │ └── cis-x.sh │ ├── dxapp.json │ └── README.md ├── .dockerignore ├── .gitignore ├── src ├── seed │ ├── bin │ │ ├── hg19_ref_gene_to_bed │ │ ├── scrape_geneimprint │ │ ├── merge_roadmap │ │ └── cis-X-seed │ └── README.md ├── other │ ├── hg19_refGene2bed.pl │ ├── realpath.c │ ├── meme_glam2_fix_new_gcc.patch │ └── mergeData_geneName.pl ├── ref-exp │ ├── bin │ │ ├── cis-X-ref-exp-generate │ │ ├── cis-X-ref-exp-prepare │ │ ├── cis-X-ref-exp │ │ └── cis-X-ref-exp-preprocess │ ├── src │ │ ├── format.precal.pl │ │ ├── cis-X.refexp.step1.pl │ │ ├── cis-X.refexp.step2.pl │ │ ├── refexp.gen.pl │ │ ├── precal.R │ │ ├── cleanup.bi.cases.R │ │ ├── filter.cohort.v2.pl │ │ └── collect.cohort.pl │ └── README.md └── core │ ├── bin │ ├── cis-X-mark │ ├── cis-X-test-outliers │ ├── cis-X-nominate │ ├── cis-X-build-matrix │ ├── cis-X-ase │ ├── cis-X-screen │ └── cis-X-run │ ├── src │ ├── sepCHR.pl │ ├── proc_ase_runs.pl │ ├── mergeVariantOut.pl │ ├── 05.merge.pl │ ├── binom.R │ ├── fdr.R │ ├── 02.add.count.pl │ ├── merge.fa.pl │ ├── 01.get.markder.pl │ ├── 07.gene.model.Oct2017.pl │ ├── exp.check.R │ ├── scan.sv.pl │ ├── ase.candidate.byrun.pl │ ├── check.TAD.cnv.pl │ ├── scan.cnv.pl │ ├── check.TAD.pl │ ├── ase.candidate.pl │ ├── snvindel.prep.pl │ ├── snvindel.process.pl │ └── ase_runs.pl │ └── README.md ├── bin └── cis-X ├── RELEASE.md ├── CHANGELOG.md ├── Dockerfile ├── README.md └── LICENSE /dnanexus/cis-x/resources/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | dnanexus 2 | refs/external 3 | tmp 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dnanexus/cis-x/resources/tmp 2 | refs 3 | tmp 4 | vendor 5 | -------------------------------------------------------------------------------- /src/seed/bin/hg19_ref_gene_to_bed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | input = ARGV[0] or raise "missing input" 5 | 6 | File.open(input) do |f| 7 | # skip header 8 | f.readline 9 | 10 | f.each_line do |line| 11 | r = line.split("\t") 12 | puts "#{r[2]}\t#{r[4]}\t#{r[5]}\t#{r[12]}\t#{r[1]}\t#{r[3]}" 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /dnanexus/cis-x/README.developer.md: -------------------------------------------------------------------------------- 1 | # St. Jude cis-X (dev) 2 | 3 | The main script runs a container with a pre-built cis-X image. The DNAnexus 4 | applet only executes the `run` command. 5 | 6 | ## Build 7 | 8 | ``` 9 | $ docker build --tag cis-x ../.. 10 | $ mkdir -p resources/tmp 11 | $ docker save cis-x | gzip > resources/tmp/cis-x-latest.tar.gz 12 | $ dx build 13 | ``` 14 | -------------------------------------------------------------------------------- /src/other/hg19_refGene2bed.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $infile = "hg19_refGene"; 4 | my $outfile = "hg19_refGene.bed"; 5 | open IN, "< $infile" or die "$infile: $!"; 6 | open OUT, "> $outfile" or die "$outfile: $!"; 7 | while() { 8 | chomp; 9 | next if $. == 1; 10 | my @F = split/\t/; 11 | print OUT "$F[2]\t$F[4]\t$F[5]\t$F[12]\t$F[1]\t$F[3]\n"; 12 | } 13 | close IN; 14 | close OUT; 15 | 16 | -------------------------------------------------------------------------------- /src/seed/bin/scrape_geneimprint: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "open-uri" 5 | 6 | require "nokogiri" 7 | 8 | url = ARGV[0] or raise "missing url" 9 | 10 | document = Nokogiri::HTML(open(url)) 11 | rows = document.css("table tr") 12 | 13 | raise "missing data table" if rows.empty? 14 | 15 | rows.each do |row| 16 | puts row.css("th, td").map(&:text).join("\t") 17 | end 18 | -------------------------------------------------------------------------------- /src/other/realpath.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char **argv) { 5 | if (argc < 2) { 6 | fprintf(stderr, "%s: missing operand\n", argv[0]); 7 | return EXIT_FAILURE; 8 | } 9 | 10 | for (int i = 1; i < argc; i++) { 11 | char *rp = realpath(argv[i], NULL); 12 | printf("%s\n", rp); 13 | free(rp); 14 | } 15 | 16 | return EXIT_SUCCESS; 17 | } 18 | -------------------------------------------------------------------------------- /src/ref-exp/bin/cis-X-ref-exp-generate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_REF_EXP_HOME=$(realpath $(dirname $0)/..) 4 | 5 | CONFIG=$1 6 | RESULTS_DIR=$2 7 | EXP_MATRIX=$3 8 | 9 | if [ $# -lt 3 ]; then 10 | basename $0 11 | echo 12 | echo "USAGE:" 13 | echo " cis-X ref-exp generate " 14 | exit 1 15 | fi 16 | 17 | perl $CIS_X_REF_EXP_HOME/src/cis-X.refexp.step2.pl $CONFIG $RESULTS_DIR $EXP_MATRIX 18 | -------------------------------------------------------------------------------- /src/ref-exp/bin/cis-X-ref-exp-prepare: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_REF_EXP_HOME=$(realpath $(dirname $0)/..) 4 | 5 | CONFIG=$1 6 | RESULTS_DIR=$2 7 | CHR_STRING=$3 8 | 9 | if [ $# -lt 3 ]; then 10 | basename $0 11 | echo 12 | echo "USAGE:" 13 | echo " cis-X ref-exp prepare " 14 | exit 1 15 | fi 16 | 17 | perl $CIS_X_REF_EXP_HOME/src/cis-X.refexp.step1.pl $CONFIG $RESULTS_DIR $CHR_STRING $COVG_WGS 18 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-mark: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_HOME=$(realpath $(dirname $0)/../../..) 4 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 5 | 6 | SAMPLE_ID=$1 7 | HIGH20=$2 8 | CNVLOH=$3 9 | SNV4_OUT=$4 10 | HET_OUT=$5 11 | COVG_WGS=$6 12 | 13 | BADLST=$CIS_X_HOME/refs/SuperBad.good.bad.new 14 | 15 | perl $CIS_X_CORE_HOME/src/01.get.markder.pl \ 16 | $SAMPLE_ID \ 17 | $HIGH20 \ 18 | $CNVLOH \ 19 | $SNV4_OUT \ 20 | $HET_OUT \ 21 | $BADLST \ 22 | $COVG_WGS 23 | -------------------------------------------------------------------------------- /src/other/meme_glam2_fix_new_gcc.patch: -------------------------------------------------------------------------------- 1 | diff --git a/src/glam2_glam2.c b/src/glam2_glam2.c 2 | index 60a9a3f..ef9c3dc 100644 3 | --- a/src/glam2_glam2.c 4 | +++ b/src/glam2_glam2.c 5 | @@ -145,7 +145,7 @@ int aln_cmp(const void *a, const void *b) { 6 | return x < y ? +1 : x > y ? -1 : 0; 7 | } 8 | 9 | -inline void report_external_failure(const char *prog, int status) { 10 | +void report_external_failure(const char *prog, int status) { 11 | if (status == 0) { 12 | // success 13 | } if (status == -1) { 14 | -------------------------------------------------------------------------------- /src/core/src/sepCHR.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $input = $ARGV[0]; 4 | my $chrom = $ARGV[1]; 5 | my $output = $ARGV[2]; 6 | my $chr_string = $ARGV[3]; 7 | 8 | open IN, "< $input" or die "$input: $!"; 9 | open OUT, "> $output" or die "$output: $!"; 10 | while() { 11 | chomp; 12 | my @F = split(/\./,$_); 13 | my $snv4 = $_; 14 | unless ($chr_string eq "TRUE") { 15 | $snv4 =~ s/^chr//; 16 | } 17 | if ($F[0] eq $chrom) { 18 | print OUT "$snv4\n"; 19 | } 20 | } 21 | close IN; 22 | close OUT; 23 | 24 | -------------------------------------------------------------------------------- /src/ref-exp/src/format.precal.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $workdir = $ARGV[0]; 4 | 5 | my $infile = "$workdir/raw.tvalue.bicohort.txt"; 6 | my $outfile = "$workdir/refexp/precal.tvalue.bin_gt1.txt"; 7 | open OUT, "> $outfile" or die "$outfile: $!"; 8 | open IN, "< $infile" or die "$infile: $!"; 9 | while() { 10 | chomp; 11 | next if $. == 1; 12 | my @F = split/\t/; 13 | my @f = split(/,/,$F[1]); 14 | for my $f (@f) { 15 | next if $f eq "NaN"; 16 | print OUT "$f\n"; 17 | } 18 | } 19 | close IN; 20 | close OUT; 21 | -------------------------------------------------------------------------------- /src/ref-exp/src/cis-X.refexp.step1.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $config = $ARGV[0]; 4 | my $workdir = $ARGV[1]; 5 | my $chr_string = $ARGV[2]; 6 | 7 | unless ($config and $workdir and $chr_string) { 8 | die("Usage: cis-X.refexp.step1.pl [config file] [working dir] [chr-string]"); 9 | } 10 | 11 | my $outfile = "$workdir/cis-X.refexp.step1.commands.sh"; 12 | open OUT, "> $outfile" or die "$outfile: $!"; 13 | open IN, "< $config" or die "$config: $!"; 14 | while() { 15 | chomp; 16 | next if $. == 1; 17 | my @F = split/\t/; 18 | print OUT "cis-X ref-exp preprocess $F[0] $workdir $F[1] $F[2] $F[3] $chr_string\n"; 19 | } 20 | close IN; 21 | close OUT; 22 | 23 | -------------------------------------------------------------------------------- /bin/cis-X: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_HOME=$(realpath $(dirname $0)/..) 4 | 5 | COMMAND=$1 6 | 7 | usage() { 8 | basename $0 9 | echo 10 | echo "USAGE:" 11 | echo " cis-X [args...]" 12 | echo "" 13 | echo "SUBCOMMANDS:" 14 | echo " ref-exp Generate reference expression matrices" 15 | echo " run Search for activating regulatory variants in the tumor genome" 16 | echo " seed Download and generate a set of common references" 17 | exit 1 18 | } 19 | 20 | case $COMMAND in 21 | ref-exp) $CIS_X_HOME/src/ref-exp/bin/cis-X-ref-exp "${@:2}" ;; 22 | run) $CIS_X_HOME/src/core/bin/cis-X-run "${@:2}" ;; 23 | seed) $CIS_X_HOME/src/seed/bin/cis-X-seed "${@:2}" ;; 24 | *) usage ;; 25 | esac 26 | -------------------------------------------------------------------------------- /src/ref-exp/src/cis-X.refexp.step2.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | use Cwd qw(abs_path); 4 | 5 | my $config = $ARGV[0]; 6 | my $workdir = $ARGV[1]; 7 | my $expfile = $ARGV[2]; 8 | 9 | unless ($config and $workdir and $expfile) { 10 | die("Usage: cis-X.refexp.step2.pl [config file] [working dir] [exp matrix]"); 11 | } 12 | 13 | my $codepath = abs_path($0); 14 | my $codedir = `dirname $codepath`; 15 | chomp($codedir); 16 | print "$codedir\n"; 17 | 18 | system "perl -w $codedir/collect.cohort.pl $config $workdir $expfile"; 19 | system "perl -w $codedir/filter.cohort.v2.pl $codedir $workdir"; 20 | system "Rscript $codedir/cleanup.bi.cases.R $workdir"; 21 | system "perl -w $codedir/refexp.gen.pl $workdir $expfile"; 22 | system "Rscript $codedir/precal.R $workdir"; 23 | system "perl -w $codedir/format.precal.pl $workdir"; 24 | -------------------------------------------------------------------------------- /src/ref-exp/bin/cis-X-ref-exp: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_REF_EXP_HOME=$(realpath $(dirname $0)/..) 4 | 5 | COMMAND=$1 6 | 7 | usage() { 8 | basename $0 9 | echo 10 | echo "USAGE:" 11 | echo " cis-X ref-exp [args...]" 12 | echo 13 | echo "SUBCOMMANDS:" 14 | echo " generate Generate a biallelic reference expression matrix" 15 | echo " prepare Create a batch script for preprocessing inputs" 16 | echo " preprocess Runs allelic specific expression (ASE) tests on inputs" 17 | exit 1 18 | } 19 | 20 | case $COMMAND in 21 | generate) $CIS_X_REF_EXP_HOME/bin/cis-X-ref-exp-generate "${@:2}" ;; 22 | prepare) $CIS_X_REF_EXP_HOME/bin/cis-X-ref-exp-prepare "${@:2}" ;; 23 | preprocess) $CIS_X_REF_EXP_HOME/bin/cis-X-ref-exp-preprocess "${@:2}" ;; 24 | *) usage ;; 25 | esac 26 | -------------------------------------------------------------------------------- /src/core/src/proc_ase_runs.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $ase_runs = $ARGV[0]; 4 | my $ase_runs_gene = $ARGV[1]; 5 | my $outfile = $ARGV[2]; 6 | 7 | my (%run2g); 8 | 9 | open IN, "< $ase_runs_gene" or die "$ase_runs_gene: $!"; 10 | while() { 11 | chomp; 12 | my @F = split/\t/; 13 | next if $F[10] == 0; 14 | $run2g{$F[3]}{$F[7]} = 1; 15 | } 16 | close IN; 17 | 18 | open IN, "< $ase_runs" or die "$ase_runs: $!"; 19 | open OUT, "> $outfile" or die "$outfile: $!"; 20 | while() { 21 | chomp; 22 | if ($. == 1) { 23 | print OUT "$_\tGenes_overlap_hc\n"; 24 | next; 25 | } 26 | my @F = split/\t/; 27 | my $gene = ""; 28 | if ($run2g{$F[0]}) { 29 | my @g = keys %{$run2g{$F[0]}}; 30 | $gene = join(',',@g); 31 | } 32 | print OUT "$_\t$gene\n"; 33 | } 34 | close IN; 35 | close OUT; 36 | -------------------------------------------------------------------------------- /src/core/src/mergeVariantOut.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $workdir = $ARGV[0]; 4 | my $outfile = $ARGV[1]; 5 | my $chr_string = $ARGV[2]; 6 | 7 | open OUT, "> $outfile" or die "$outfile: $!"; 8 | for my $i (1 .. 22) { 9 | my $infile = "$workdir/matrix_chr" . $i . "_simple.tab"; 10 | if (! -e $infile) { 11 | print "$infile not exist.\n"; 12 | next; 13 | } 14 | open IN, "< $infile" or die "$infile: $!"; 15 | while() { 16 | chomp; 17 | if ($. == 1) { 18 | if ($i == 1) { 19 | print OUT "$_\n"; 20 | } 21 | next; 22 | } 23 | if ($chr_string eq "TRUE") { 24 | print OUT "$_\n"; 25 | }else { 26 | my @F = split/\t/; 27 | my $snv4 = "chr" . $F[1]; 28 | print OUT "$F[0]\t$snv4\t$F[2]\t$F[3]\n"; 29 | } 30 | } 31 | close IN; 32 | } 33 | close OUT; 34 | 35 | -------------------------------------------------------------------------------- /src/core/src/05.merge.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | ### Only the longest transcript was used. 3 | 4 | my $infile = $ARGV[0]; 5 | my $outfile = $ARGV[1]; 6 | 7 | my $head = ""; 8 | my %dat = (); 9 | my %g2len = (); 10 | 11 | open IN, "< $infile" or die "$infile: $!"; 12 | while() { 13 | chomp; 14 | if ($. == 1) { 15 | $head = $_; 16 | next; 17 | } 18 | my @F = split/\t/; 19 | next unless $F[8]; 20 | my $len = $F[5] - $F[4]; 21 | if ($dat{$F[1]}) { 22 | if ($len > $g2len{$F[1]}) { 23 | $dat{$F[1]} = $_; 24 | $g2len{$F[1]} = $len; 25 | }else { 26 | 1; 27 | } 28 | }else { 29 | $dat{$F[1]} = $_; 30 | $g2len{$F[1]} = $len; 31 | } 32 | } 33 | close IN; 34 | 35 | open OUT, "> $outfile" or die "$outfile: $!"; 36 | print OUT "$head\n"; 37 | for my $g (sort keys %dat) { 38 | print OUT "$dat{$g}\n"; 39 | } 40 | close OUT; 41 | 42 | -------------------------------------------------------------------------------- /src/ref-exp/src/refexp.gen.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $workdir = $ARGV[0]; 4 | my $expfile = $ARGV[1]; 5 | 6 | my $outdir = "$workdir/refexp"; 7 | system "mkdir -p $outdir"; 8 | 9 | my $infile = "$workdir/cis-X.refexp.step2.collect.filtered.bi.samples.cleared.txt"; 10 | my $outfile = "$outdir/exp.ref.bi.txt"; 11 | open OUT, "> $outfile" or die "$outfile: $!"; 12 | print OUT "Gene\tnum.cases\tSJID\tfpkm\n"; 13 | open IN, "< $infile" or die "$infile: $!"; 14 | while() { 15 | chomp; 16 | next if $. == 1; 17 | my @F = split/\t/; 18 | if ($F[5]>=10) { 19 | if ($F[15] == 1) { 20 | print OUT "$F[0]\t$F[16]\t$F[17]\t$F[18]\n"; 21 | }else { 22 | print OUT "$F[0]\t$F[5]\t$F[6]\t$F[7]\n"; 23 | } 24 | } 25 | } 26 | close IN; 27 | close OUT; 28 | 29 | $outfile = "$outdir/exp.ref.white.txt"; 30 | open OUT, "> $outfile" or die "$outfile: $!"; 31 | print OUT "Gene\tnum.cases\tSID\tfpkm\n"; 32 | close OUT; 33 | 34 | -------------------------------------------------------------------------------- /src/core/src/binom.R: -------------------------------------------------------------------------------- 1 | 2 | argv <- commandArgs(TRUE) 3 | 4 | infile <- argv[1] 5 | outfile <- argv[2] 6 | 7 | out <- NULL 8 | dat <- read.table(infile,sep="\t",header=T,quote="") 9 | 10 | for (i in 1:nrow(dat)) { 11 | p_corr <- NULL 12 | covg <- dat[i,7] 13 | sigma <- 10.8*(1-exp(-1*covg/105)) 14 | ep <- 0.5 15 | if (dat[i,9] == "cnvloh") { 16 | ep <- dat[i,6] 17 | } 18 | p_binom <- dbinom(seq(0,covg),covg,ep) 19 | p_norm <- dnorm(seq(-1000,1000),mean=0,sd=sigma) 20 | p_conv <- convolve(p_binom,p_norm,type="open") 21 | y <- abs(dat[i,11]/(dat[i,11]+dat[i,10]) - ep) 22 | if (dat[i,11] > covg*ep) { 23 | p_corr <- sum(p_conv[(1001+dat[i,11]):length(p_conv)]) 24 | }else { 25 | p_corr <- sum(p_conv[1:(1001+dat[i,11])]) 26 | } 27 | if (p_corr < 0) { 28 | p_corr <- 0 29 | } 30 | out <- rbind(out,c(p_corr,y)) 31 | } 32 | colnames(out) <- c("pvalue","delta.abs") 33 | out <- cbind(dat,out) 34 | 35 | write.table(out,file=outfile,sep="\t",quote=F,row.names=F) 36 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-test-outliers: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_HOME=$(realpath $(dirname $0)/../../..) 4 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 5 | 6 | SAMPLE_ID=$1 7 | DISEASE=$2 8 | FPKM_MATRIX=$3 9 | OHE_RESULT=$4 10 | 11 | # Ensure reference expression matrices exist for the given disease ID. 12 | BILIST=$CIS_X_HOME/refs/diseases/$DISEASE/exp.ref.bi.txt 13 | WHITELIST=$CIS_X_HOME/refs/diseases/$DISEASE/exp.ref.white.txt 14 | WHOLELIST=$CIS_X_HOME/refs/diseases/$DISEASE/exp.ref.entire.txt 15 | PRECALT=$CIS_X_HOME/refs/diseases/$DISEASE/precal.tvalue.bin_gt1.txt 16 | 17 | if [ ! -f $BILIST ] || [ ! -f $WHITELIST ] || [ ! -f $WHOLELIST ]; then 18 | echo "ERROR: $(date): cis-X-test-outliers: reference expression matrices missing for $DISEASE" 19 | exit 1 20 | fi 21 | 22 | if [ ! -f $PRECALT ]; then 23 | echo "ERROR: $(date): cis-X-test-outliers: reference expression matrices missing for $DISEASE" 24 | exit 1 25 | fi 26 | 27 | Rscript $CIS_X_CORE_HOME/src/exp.check.R \ 28 | $SAMPLE_ID \ 29 | $FPKM_MATRIX \ 30 | $BILIST \ 31 | $WHOLELIST \ 32 | $WHITELIST \ 33 | $OHE_RESULT \ 34 | $PRECALT 35 | -------------------------------------------------------------------------------- /src/core/src/fdr.R: -------------------------------------------------------------------------------- 1 | suppressMessages(library(multtest)) 2 | 3 | argv <- commandArgs(TRUE) 4 | 5 | infile <- argv[1] 6 | outfile <- argv[2] 7 | 8 | dat <- read.table(infile,sep="\t",header=T,quote="",stringsAsFactor=F) 9 | 10 | out <- NULL 11 | pval <- NULL 12 | ai <- NULL 13 | 14 | for (i in 1:nrow(dat)) { 15 | x <- as.numeric(unlist(strsplit(as.character(dat[i,13]),",",perl=T))) 16 | y <- as.numeric(unlist(strsplit(as.character(dat[i,15]),",",perl=T))) 17 | x.geom <- exp(sum(log(x))/length(x)) 18 | y.m <- mean(y) 19 | out <- rbind(out, c(x.geom,y.m)) 20 | } 21 | 22 | colnames(out) <- c("comb.pval","mean.delta") 23 | rownames(out) <- dat[,1] 24 | 25 | if (nrow(dat) == 1) { 26 | out <- cbind(out,out[,1],out[,1],out[,1]) 27 | colnames(out) <- c(colnames(out)[1:2],c("rawp","Bonferroni","ABH")) 28 | out <- cbind(dat,out) 29 | }else { 30 | raw.p <- out[,1] 31 | adj.p <- mt.rawp2adjp(raw.p,c("Bonferroni","ABH"))$adj 32 | rownames(adj.p) <- names(raw.p[order(raw.p)]) 33 | out <- cbind(out,adj.p[rownames(out),]) 34 | out <- cbind(dat,out) 35 | } 36 | 37 | write.table(out,file=outfile,sep="\t",quote=F,row.names=F) 38 | -------------------------------------------------------------------------------- /src/seed/bin/merge_roadmap: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # frozen_string_literal: true 3 | 4 | require "csv" 5 | 6 | EPIGENOME_ID_PATTERN = /E\d{3}/ 7 | 8 | # This expects a csv-formatted version of `jul2013.roadmapData.qc`: 9 | # https://docs.google.com/spreadsheets/d/1yikGx4MsO9Ei36b64yOy9Vb6oPC5IBGlFbYEt-N6gOM/view 10 | def read_groups(path) 11 | File.open(path) do |f| 12 | # skip meta 13 | 6.times { f.readline } 14 | 15 | csv = CSV.new(f) 16 | 17 | csv.each.reduce({}) do |map, row| 18 | map[row[1]] = row[3] 19 | map 20 | end 21 | end 22 | end 23 | 24 | def parse_eid(s) 25 | matches = s.match(EPIGENOME_ID_PATTERN) 26 | raise if !matches 27 | matches[0] 28 | end 29 | 30 | input_dir = ARGV[0] or raise "missing input_dir" 31 | meta_file = ARGV[1] or raise "missing meta_file" 32 | 33 | groups = read_groups(meta_file) 34 | 35 | pathnames = Dir[File.join(input_dir, "*.bed")].sort 36 | 37 | pathnames.each do |pathname| 38 | basename = File.basename(pathname) 39 | eid = parse_eid(basename) 40 | group = groups[eid] 41 | 42 | raise if !group 43 | 44 | File.foreach(pathname) do |line| 45 | puts "#{line.chomp}\t#{eid}\t#{group}" 46 | end 47 | end 48 | -------------------------------------------------------------------------------- /src/ref-exp/src/precal.R: -------------------------------------------------------------------------------- 1 | 2 | argv <- commandArgs(TRUE) 3 | 4 | workdir <- argv[1] 5 | 6 | infile <- paste(workdir,"/refexp/exp.ref.bi.txt",sep="") 7 | outfile <- paste(workdir,"/raw.tvalue.bicohort.txt",sep="") 8 | dat <- read.table(infile,sep="\t",header=T,row.names=1,quote="",stringsAsFactors=F) 9 | rawt <- NULL 10 | 11 | for (i in 1:nrow(dat)) { 12 | rawt.i <- NULL 13 | y.in <- NULL 14 | y.raw <- NULL 15 | y.size <- NULL 16 | y.median <- NULL 17 | y.in <- as.numeric(unlist(strsplit(dat[i,3],",",perl=T))) 18 | y.raw <- log10(as.numeric(unlist(strsplit(dat[i,3],",",perl=T)))+0.1) 19 | y.median <- median(y.in) 20 | y.size <- length(y.in) 21 | 22 | if (y.size >= 20 && y.median >= 1) { 23 | for (j in 1:length(y.raw)) { 24 | y.white <- y.raw[-j] 25 | x.i <- y.raw[j] 26 | t.white <- (x.i-mean(y.white))/((1+(length(y.white)-2)^-1)*(sd(y.white)^2))^0.5 27 | p.white <- pt(t.white,length(y.white)-2,lower.tail=F) 28 | if (j == 1) { 29 | rawt.i <- t.white 30 | }else { 31 | rawt.i <- paste(rawt.i,t.white,sep=",") 32 | } 33 | } 34 | rawt <- rbind(rawt,c(rownames(dat)[i],rawt.i)) 35 | } 36 | } 37 | write.table(rawt,file=outfile,row.names=F,quote=F,sep="\t") 38 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-nominate: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_HOME=$(realpath $(dirname $0)/../../..) 4 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 5 | 6 | SAMPLE_ID=$1 7 | ASE_RESULT_GENE=$2 8 | OHE_RESULT=$3 9 | CANDIDATES_RESULT=$4 10 | THRESH_FPKM=$5 11 | ASE_RESULT_RUN=$6 12 | CANDIDATES_RESULT_RUN=$7 13 | 14 | THRESH_AI_DI=0.3 15 | THRESH_AI_CNV=0.2 16 | THRESH_PVALUE_ASE=0.05 17 | THRESH_PVALUE_LOO=0.05 18 | #THRESH_FPKM=5 19 | THRESH_LOO_Hi_Perc=0.1 20 | NUM_MARKERS=4 21 | 22 | IMPRINTING_GENES=$CIS_X_HOME/refs/external/ImprintGenes.txt 23 | ONCOGENES=$CIS_X_HOME/refs/external/cancer_gene_census.txt 24 | 25 | perl $CIS_X_CORE_HOME/src/ase.candidate.pl \ 26 | $THRESH_PVALUE_ASE \ 27 | $THRESH_AI_DI \ 28 | $THRESH_AI_CNV \ 29 | $THRESH_FPKM \ 30 | $THRESH_PVALUE_LOO \ 31 | $SAMPLE_ID \ 32 | $CANDIDATES_RESULT \ 33 | $ASE_RESULT_GENE \ 34 | $OHE_RESULT \ 35 | $THRESH_LOO_Hi_Perc \ 36 | $IMPRINTING_GENES \ 37 | $ONCOGENES 38 | 39 | perl $CIS_X_CORE_HOME/src/ase.candidate.byrun.pl \ 40 | $SAMPLE_ID \ 41 | $THRESH_FPKM \ 42 | $THRESH_PVALUE_LOO \ 43 | $ASE_RESULT_RUN \ 44 | $OHE_RESULT \ 45 | $CANDIDATES_RESULT_RUN \ 46 | $IMPRINTING_GENES \ 47 | $ONCOGENES \ 48 | $NUM_MARKERS \ 49 | $ASE_RESULT_GENE \ 50 | $THRESH_AI_DI \ 51 | $THRESH_AI_CNV 52 | -------------------------------------------------------------------------------- /src/ref-exp/bin/cis-X-ref-exp-preprocess: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/../../core) 6 | PATH=$CIS_X_CORE_HOME/bin:$PATH 7 | 8 | SAMPLE_ID=$1 9 | ROOTDIR=$2 10 | HIGH20=$3 11 | RNABAM=$4 12 | CNV_LOH=$5 13 | CHR_STRING=$6 14 | 15 | COVG_WGS=10 16 | COVG_RNA=10 17 | CNV_LOH_ACTION=drop 18 | 19 | if [ $# -lt 6 ]; then 20 | basename $0 21 | echo 22 | echo "USAGE:" 23 | echo " cis-X ref-exp preprocess " 24 | exit 1 25 | fi 26 | 27 | WORKDIR=$ROOTDIR/$SAMPLE_ID/working_space 28 | mkdir -p $WORKDIR 29 | cd $WORKDIR 30 | 31 | SNV4_OUT="$WORKDIR/$SAMPLE_ID.snv4.txt" 32 | HET_OUT="$WORKDIR/$SAMPLE_ID.heterozygous.markers.txt" 33 | #cis-X-mark $SAMPLE_ID $HIGH20 $CNV_LOH $SNV4_OUT $HET_OUT 34 | cis-X-mark $SAMPLE_ID $HIGH20 $CNV_LOH $SNV4_OUT $HET_OUT $COVG_WGS 35 | 36 | MATRIX_OUT="$WORKDIR/matrix_combined_matrix_simple.tab" 37 | #cis-X-build-matrix $RNABAM $SNV4_OUT $WORKDIR $MATRIX_OUT 38 | cis-X-build-matrix $RNABAM $SNV4_OUT $WORKDIR $MATRIX_OUT $CHR_STRING 39 | 40 | ASE_RESULT_MARKER="$WORKDIR/$SAMPLE_ID.ase.combine.WGS.RNAseq.goodmarkers.binom.txt" 41 | ASE_RESULT_GENE="$WORKDIR/$SAMPLE_ID.ase.gene.model.fdr.txt" 42 | ASE_RESULT_RUN="$WORKDIR/$SAMPLE_ID.ase.candidates.runs.txt" 43 | #cis-X-ase $SAMPLE_ID $WORKDIR $HET_OUT $MATRIX_OUT $ASE_RESULT_MARKER $ASE_RESULT_GENE 44 | cis-X-ase $SAMPLE_ID $WORKDIR $HET_OUT $MATRIX_OUT $ASE_RESULT_MARKER $ASE_RESULT_GENE $ASE_RESULT_RUN $CNV_LOH_ACTION $COVG_RNA 45 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-build-matrix: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 4 | 5 | RNABAM=$1 6 | SNV4_OUT=$2 7 | WORKDIR=$3 8 | MATRIX_OUT=$4 9 | CHR_STRING=$5 10 | 11 | LOG=$WORKDIR/log.txt 12 | 13 | RNABAMLST="$WORKDIR/bam.lst" 14 | 15 | echo $RNABAM > $RNABAMLST 16 | 17 | for i in $(seq 1 22); do 18 | CHROM="chr$i" 19 | SNV4_CHR=snv4.seqchr.txt 20 | 21 | if [ -f $SNV4_CHR ]; then 22 | rm $SNV4_CHR 23 | fi 24 | 25 | if [ -f "commands.txt" ]; then 26 | rm commands.txt 27 | fi 28 | 29 | if [ -f "commands.sh" ]; then 30 | rm commands.sh 31 | fi 32 | 33 | perl $CIS_X_CORE_HOME/src/sepCHR.pl $SNV4_OUT $CHROM $SNV4_CHR $CHR_STRING 34 | 35 | LINE_TEMP=$(wc -l $SNV4_CHR | sed -e 's/^ *//' | cut -d" " -f1) 36 | 37 | if [ $LINE_TEMP -gt 0 ]; then 38 | echo "INFO: $(date): cis-X-build-matrix: processing $CHROM" 39 | 40 | variants2matrix -now -bam-list $RNABAMLST -variant-file $SNV4_CHR -snv4 -flat -name $CHROM -step1 commands.txt 2>> $LOG 41 | cat commands.txt |sed 's/^\/bin\/env //' > commands.sh 42 | sh commands.sh 2>commands.err 43 | variants2matrix -now -bam-list $RNABAMLST -variant-file $SNV4_CHR -snv4 -flat -name $CHROM -step2 -clean 2>> $LOG 44 | fi 45 | done 46 | 47 | perl $CIS_X_CORE_HOME/src/mergeVariantOut.pl $WORKDIR $MATRIX_OUT $CHR_STRING >> $LOG 48 | 49 | MATRIX_OUT_LINE=$(wc -l $MATRIX_OUT | sed -e 's/^ *//' | cut -d" " -f1) 50 | 51 | if [ $MATRIX_OUT_LINE -lt 2 ]; then 52 | echo "No output from variants2matrix. Exiting." 53 | exit 1 54 | fi 55 | -------------------------------------------------------------------------------- /src/core/src/02.add.count.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $het_wgs = $ARGV[1]; 5 | my $geno_rna = $ARGV[2]; 6 | my $output = $ARGV[3]; 7 | #my $cvg_cut = $ARGV[4]; ### output everything with minimal 3 reads support for at least one allele. Further filter will apply. 2019-04-08. 8 | 9 | my %count = (); 10 | my $infile = $geno_rna; 11 | open IN, "< $infile" or die "$infile: $!"; 12 | while() { 13 | chomp; 14 | next if $. == 1; 15 | my @F = split/\t/; 16 | $count{$sid}{$F[1]}{ref} = $F[2]; 17 | $count{$sid}{$F[1]}{mut} = $F[3]; 18 | $count{$sid}{$F[1]}{cvg} = $F[2] + $F[3]; 19 | } 20 | close IN; 21 | 22 | $infile = $het_wgs; 23 | my $outfile = $output; 24 | open IN, "< $infile" or die "$infile: $!"; 25 | open OUT, "> $outfile" or die "$outfile: $!"; 26 | while() { 27 | chomp; 28 | if ($. == 1) { 29 | print OUT "chrom\tpos\tref\tmut\tcvg_wgs\tmut_freq_wgs\tcvg_rna\tmut_freq_rna\tcnvlohTag\tref\tvar\n"; 30 | next; 31 | } 32 | my @F = split/\t/; 33 | my $snv4 = "$F[0].$F[1].$F[2].$F[3]"; 34 | my $cvg_wgs = $F[4] + $F[6]; 35 | my $freq_wgs = sprintf("%.3f",$F[6]/$cvg_wgs); 36 | my $cvg_rna = $count{$sid}{$snv4}{cvg}; 37 | # if ($cvg_rna >= $cvg_cut) { 38 | # if ($count{$sid}{$snv4}{ref} >= 3 or $count{$sid}{$snv4}{mut} >= 3) { 39 | if ($cvg_rna >= 5) { 40 | $freq_rna = sprintf("%.3f",$count{$sid}{$snv4}{mut}/$cvg_rna); 41 | print OUT "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$cvg_wgs\t$freq_wgs\t$cvg_rna\t$freq_rna\t$F[8]\t$count{$sid}{$snv4}{ref}\t$count{$sid}{$snv4}{mut}\n"; 42 | } 43 | } 44 | close IN; 45 | close OUT; 46 | -------------------------------------------------------------------------------- /src/core/src/merge.fa.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $varlst = $ARGV[1]; 5 | my $fa_in = $ARGV[2]; 6 | my $fa_out = $ARGV[3]; 7 | 8 | my (%name2fa); 9 | 10 | my $infile = $fa_in; 11 | my $name = ""; 12 | open IN, "< $infile" or die "$infile: $!"; 13 | while() { 14 | chomp; 15 | if ($_ =~ /^>/) { 16 | $name = $_; 17 | $name =~ s/^>//; 18 | next; 19 | } 20 | $name2fa{$name} = $_; 21 | } 22 | close IN; 23 | 24 | $infile = $varlst; 25 | my $outfile = $fa_out; 26 | open OUT, "> $outfile" or die "$outfile: $!"; 27 | open IN, "< $infile" or die "$infile: $!"; 28 | while() { 29 | chomp; 30 | next if $. == 1; 31 | my @F = split/\t/; 32 | my $left = $name2fa{$F[4]}; 33 | my $right = $name2fa{$F[5]}; 34 | my $ref = $F[2]; 35 | my $mut = $F[3]; 36 | my $mut_seq = ""; 37 | my $ref_seq = ""; 38 | my $mut_id = "$F[0].mut"; 39 | my $ref_id = "$F[0].ref"; 40 | if ($F[1] eq "snv") { 41 | $mut_seq = $left . $mut . $right; 42 | $ref_seq = $left . $ref . $right; 43 | }elsif ($F[1] eq "ins") { 44 | $mut_seq = $left . $mut . $right; 45 | $ref_seq = $left . $right; 46 | }elsif ($F[1] eq "del") { 47 | $mut_seq = $left . $right; 48 | $ref_seq = $left . $ref . $right; 49 | }elsif ($F[1] eq "complex_indel") { 50 | $mut_seq = $left . $mut . $right; 51 | $ref_seq = $left . $ref . $right; 52 | }else { 53 | print "Wrong var type of $F[1] for $F[0].\n"; 54 | } 55 | print OUT ">"; 56 | print OUT "$mut_id\n"; 57 | print OUT "$mut_seq\n"; 58 | print OUT ">"; 59 | print OUT "$ref_id\n"; 60 | print OUT "$ref_seq\n"; 61 | } 62 | close IN; 63 | close OUT; 64 | 65 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release 2 | 3 | * [ ] Update `CHANGELOG.md` with version and publication date. 4 | * [ ] Update version in `dnanexus/cis-x/dxapp.json`. 5 | * [ ] Stage changes: `git add dnanexus/cis-x/dxapp.json CHANGELOG.md` 6 | * [ ] Create git commit: `git commit -m "Bump version to $VERSION"` 7 | * [ ] Create git tag: `git tag -m "" -a v$VERSION` 8 | * [ ] Push release: `git push --follow-tags` 9 | 10 | ## DNAnexus 11 | 12 | * [ ] Build Docker image: `docker image build --tag cis-x .` 13 | * [ ] Save Docker image: `docker image save cis-x | gzip > dnanexus/cis-x/resources/tmp/cis-x-latest.tar.gz` 14 | * [ ] Check security context: `dx whoami` 15 | * [ ] Build DNAnexus applet: `dx build --destination cis-x:/cis-x-$VERSION dnanexus/cis-x` 16 | * [ ] Verify expected results: 17 | 18 | ``` 19 | dx run cis-x:/cis-x-$VERSION \ 20 | --input sample_id=SJALL018373_D1 \ 21 | --input markers=cis-x:/data/SJALL018373_D1.test.wgs.markers.txt \ 22 | --input cnv_loh=cis-x:/data/SJALL018373_D1.test.wgs.cnvloh.txt \ 23 | --input bam=cis-x:/data/SJALL018373_D1.test.RNAseq.bam \ 24 | --input bai=cis-x:/data/SJALL018373_D1.test.RNAseq.bam.bai \ 25 | --input fpkm_matrix=cis-x:/data/SJALL018373_D1.test.RNASEQ_all_fpkm.txt \ 26 | --input snv_indel=cis-x:/data/SJALL018373_D1.test.mut.txt \ 27 | --input sv=cis-x:/data/SJALL018373_D1.test.sv.txt \ 28 | --input cna=cis-x:/data/SJALL018373_D1.test.cna.txt \ 29 | --input disease=TALL \ 30 | --input cnv_loh_action=drop \ 31 | --input min_coverage_wgs=10 \ 32 | --input min_coverage_rna_seq=10 \ 33 | --destination cis-x:/results/$VERSION 34 | ``` 35 | 36 | * [ ] Publish DNAnexus app: `dx build --app --publish dnanexus/cis-x` 37 | * [ ] Build St. Jude Cloud production workflow. 38 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-ase: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_HOME=$(realpath $(dirname $0)/../../..) 4 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 5 | 6 | SAMPLE_ID=$1 7 | WORKDIR=$2 8 | HET_OUT=$3 9 | MATRIX_OUT=$4 10 | ASE_RESULT_MARKER=$5 11 | ASE_RESULT_GENE=$6 12 | ASE_RESULT_RUN=$7 13 | CNV_LOH_ACTION=$8 14 | COVG_RNA=$9 15 | 16 | #THRESH_AI=0.3 17 | THRESH_AI_DI=0.3 18 | THRESH_AI_CNV=0.2 19 | THRESH_PVALUE_ASE=0.05 20 | 21 | GENE_MODEL=$CIS_X_HOME/refs/external/hg19_refGene 22 | GENE_MODEL_BED=$CIS_X_HOME/refs/external/hg19_refGene.bed 23 | 24 | WGS_RNA_COUNT="$WORKDIR/$SAMPLE_ID.combine.WGS.RNAseq.goodmarkers.txt" 25 | ASE_RUNS="$WORKDIR/$SAMPLE_ID.ase.runs.txt" 26 | ASE_RUNS_BED="$WORKDIR/$SAMPLE_ID.ase.runs.bed" 27 | ASE_RUNS_GENE="$WORKDIR/$SAMPLE_ID.ase.runs.genes.txt" 28 | GENE_MODEL_Temp1="$WORKDIR/$SAMPLE_ID.combine.WGS.RNAseq.goodmarkers.binom.genemodel.summary.txt" 29 | GENE_MODEL_Temp2="$WORKDIR/$SAMPLE_ID.combine.WGS.RNAseq.goodmarkers.binom.genemodel.summary.merged.txt" 30 | 31 | #perl $CIS_X_CORE_HOME/src/02.add.count.pl $SAMPLE_ID $HET_OUT $MATRIX_OUT $WGS_RNA_COUNT $COVG_RNA 32 | perl $CIS_X_CORE_HOME/src/02.add.count.pl $SAMPLE_ID $HET_OUT $MATRIX_OUT $WGS_RNA_COUNT 33 | Rscript $CIS_X_CORE_HOME/src/binom.R $WGS_RNA_COUNT $ASE_RESULT_MARKER 34 | 35 | perl $CIS_X_CORE_HOME/src/07.gene.model.Oct2017.pl \ 36 | $SAMPLE_ID \ 37 | $ASE_RESULT_MARKER \ 38 | $GENE_MODEL \ 39 | $THRESH_AI_DI \ 40 | $THRESH_AI_CNV \ 41 | $THRESH_PVALUE_ASE \ 42 | $CNV_LOH_ACTION \ 43 | $GENE_MODEL_Temp1 \ 44 | $COVG_RNA 45 | 46 | perl $CIS_X_CORE_HOME/src/05.merge.pl $GENE_MODEL_Temp1 $GENE_MODEL_Temp2 47 | Rscript $CIS_X_CORE_HOME/src/fdr.R $GENE_MODEL_Temp2 $ASE_RESULT_GENE 48 | 49 | #perl $CIS_X_CORE_HOME/src/ase_runs.pl $ASE_RESULT_MARKER $THRESH_AI_DI $THRESH_AI_CNV $ASE_RUNS1 $ASE_RUNS_BED1 15 0.8 200000 50 | perl $CIS_X_CORE_HOME/src/ase_runs.pl $ASE_RESULT_MARKER $THRESH_AI_DI $THRESH_AI_CNV $ASE_RUNS $ASE_RUNS_BED 4 0.6 500000000 51 | #bedtools intersect -a $ASE_RUNS_BED1 -b $GENE_MODEL_BED -F 0.8 -wao > $ASE_RUNS_GENE1 52 | bedtools intersect -a $ASE_RUNS_BED -b $GENE_MODEL_BED -wao > $ASE_RUNS_GENE 53 | perl $CIS_X_CORE_HOME/src/proc_ase_runs.pl $ASE_RUNS $ASE_RUNS_GENE $ASE_RESULT_RUN 54 | -------------------------------------------------------------------------------- /src/ref-exp/src/cleanup.bi.cases.R: -------------------------------------------------------------------------------- 1 | 2 | argvs <- commandArgs(TRUE) 3 | 4 | infile <- paste(argvs[1],"cis-X.refexp.step2.collect.filtered.txt",sep="/") 5 | outfile <- paste(argvs[1],"cis-X.refexp.step2.collect.filtered.bi.samples.cleared.txt",sep="/") 6 | dat.raw <- read.table(infile,sep="\t",header=T,row.names=1,quote="",stringsAsFactors=F) 7 | dat <- dat.raw 8 | 9 | out <- NULL 10 | for (i in 1:nrow(dat)) { 11 | s.i <- NULL 12 | s.clear <- NULL 13 | f.i <- NULL 14 | f.clear <- NULL 15 | s.count <- NULL 16 | trim <- 0 17 | tot.i <- dat[i,2]+dat[i,5] 18 | if (dat[i,5]>=10) { 19 | bi.i <- unlist(strsplit(dat[i,6],",",perl=T)) 20 | bi.fpkm <- as.numeric(unlist(strsplit(dat[i,7],",",perl=T))) 21 | names(bi.fpkm) <- bi.i 22 | y.i <- as.numeric(log10(bi.fpkm+0.1)) 23 | if (length(unique(bi.fpkm)) == 1) { 24 | s.i <- "" 25 | s.clear <- "" 26 | f.i <- "" 27 | f.clear <- "" 28 | s.count <- "" 29 | }else if (length(y.i[y.i<0])/length(y.i) == 1) { 30 | s.i <- "" 31 | s.clear <- "" 32 | f.i <- "" 33 | f.clear <- "" 34 | s.count <- "" 35 | }else { 36 | for (j in 1:length(bi.i)){ 37 | x.j <- y.i[j] 38 | y.j <- y.i[-j] 39 | t.j <- (x.j-mean(y.j))/((1+(length(y.j)-2)^-1)*(sd(y.j)^2))^0.5 40 | p.j <- pt(t.j,length(y.j)-2,lower.tail=F) 41 | if (p.j < 0.05) { 42 | s.i <- c(s.i,bi.i[j]) 43 | f.i <- c(f.i,as.numeric(bi.fpkm)[j]) 44 | trim <- 1 45 | }else { 46 | s.clear <- c(s.clear,bi.i[j]) 47 | f.clear <- c(f.clear,as.numeric(bi.fpkm)[j]) 48 | } 49 | } 50 | if (trim == 1) { 51 | s.count <- length(s.clear) 52 | }else { 53 | s.count <- "" 54 | } 55 | } 56 | }else { 57 | s.i <- "" 58 | s.clear <- "" 59 | f.i <- "" 60 | f.clear <- "" 61 | s.count <- "" 62 | } 63 | s.i <- paste(s.i,collapse=",") 64 | f.i <- paste(f.i,collapse=",") 65 | s.clear <- paste(s.clear,collapse=",") 66 | f.clear <- paste(f.clear,collapse=",") 67 | out <- rbind(out,c(tot.i,trim,s.count,s.clear,f.clear,s.i,f.i)) 68 | } 69 | colnames(out) <- c("num.total.samples","trim","num.bi.samples.cleared","bi.samples.cleared","bi.fpkm.cleared","bi.samples.excluded","bi.fpkm.excluded") 70 | out.r <- cbind(rownames(dat),dat,out) 71 | colnames(out.r)[1] <- "Gene" 72 | write.table(out.r,file=outfile,sep="\t",row.names=F,quote=F) 73 | -------------------------------------------------------------------------------- /src/ref-exp/src/filter.cohort.v2.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $codedir = $ARGV[0]; 4 | my $workdir = $ARGV[1]; 5 | 6 | my (%gene,%imprint); 7 | 8 | my $infile = "$codedir/../../../refs/external/hg19_refGene"; 9 | open IN, "< $infile" or die "$infile: $!"; 10 | while() { 11 | chomp; 12 | next if $. == 1; 13 | my @F = split/\t/; 14 | my $g = $F[12]; 15 | $gene{$g}{chrom} = $F[2]; 16 | } 17 | close IN; 18 | 19 | $infile = "$codedir/../../../refs/external/ImprintGenes.txt"; 20 | open IN, "< $infile" or die "$infile: $!"; 21 | while() { 22 | chomp; 23 | next if $. == 1; 24 | my @F = split/\t/; 25 | $imprint{$F[0]} = $F[3]; 26 | } 27 | close IN; 28 | 29 | $infile = "$workdir/cis-X.refexp.step2.collect.txt"; 30 | my $outfile = "$workdir/cis-X.refexp.step2.collect.filtered.txt"; 31 | open IN, "< $infile" or die "$infile: $!"; 32 | open OUT, "> $outfile" or die "$outfile: $!"; 33 | while() { 34 | chomp; 35 | if ($. == 1) { 36 | print OUT "$_\tchrom\timprinting\tase_fpkm_max\tase_fpkm_gt1_count\tbi_fpkm_max\tbi_fpkm_gt1_count\n"; 37 | next; 38 | } 39 | my @F = split/\t/; 40 | next unless $F[1] == 1; 41 | my $g = $F[0]; 42 | my $ase_max = "na"; 43 | my $bi_max = "na"; 44 | my $ase_count = 0; 45 | my $bi_count = 0; 46 | if ($F[2] > 0) { 47 | my @ase = split(/,/,$F[4]); 48 | $ase_max = $ase[0]; 49 | for my $i (0 .. $#ase) { 50 | if ($ase[$i] > $ase_max) { 51 | $ase_max = $ase[$i]; 52 | } 53 | if ($ase[$i] >= 1) { 54 | $ase_count++; 55 | } 56 | } 57 | } 58 | if ($F[5] > 0) { 59 | my @bi = split(/,/,$F[7]); 60 | $bi_max = $bi[0]; 61 | for my $j (0 .. $#bi) { 62 | if ($bi[$j] > $bi_max) { 63 | $bi_max = $bi[$j]; 64 | } 65 | if ($bi[$j] >= 1) { 66 | $bi_count++; 67 | } 68 | } 69 | } 70 | my $imprint = ""; 71 | if ($imprint{$g}) { 72 | $imprint = $imprint{$g}; 73 | } 74 | my $chrom = $gene{$g}{chrom}; 75 | next if $chrom =~ /hap/; 76 | print OUT "$_\t$chrom\t$imprint\t$ase_max\t$ase_count\t$bi_max\t$bi_count\n"; 77 | } 78 | close IN; 79 | close OUT; 80 | 81 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | #Changelog 2 | 3 | ## [1.5.0] - 2020-05-19 4 | 5 | ### Added 6 | 7 | * core: Add option to use user-specified TAD (topologically associating 8 | domain) annotations. See `t` option. By default, this still uses hESC 9 | (Human ES Cell). 10 | 11 | * ref-exp: Add option to handle reference sequence names prefixed with "chr". 12 | Set `chr-string` to either `TRUE` or `FALSE`. 13 | 14 | * ref-exp: Build normalized t-values for a gene across samples. See 15 | `precal.tvalue.bin_gt1.txt`. 16 | 17 | ### Changed 18 | 19 | * core: `cis-X-run` now uses short name arguments instead of unnamed 20 | arguments. For example, instead of `cis-X run $SAMPLE_ID ...`, run `cis-X 21 | run -s $SAMPLE_ID ...`. 22 | 23 | * Synced with 2020-02-08 and 2020-03-13 revisions. 24 | 25 | ## [1.4.0] - 2019-07-10 26 | 27 | ### Added 28 | 29 | * core: Identify regions with consecutive markers that exhibit ASE. 30 | 31 | ### Changed 32 | 33 | * core: Tighten scoring method for when known oncogenes should be reevaluated. 34 | 35 | * core: Updated binomial distribution statistical model. 36 | 37 | * seed: `cancer_gene_census.txt` no longer has a version in its filename. 38 | 39 | ## [1.3.0] - 2019-03-28 40 | 41 | ### Added 42 | 43 | * core: Known oncogenes in the [COSMIC Cancer Gene Census] are used to 44 | reevaluate cis-activated candidates. 45 | 46 | [COSMIC Cancer Gene Census]: https://cancer.sanger.ac.uk/census 47 | 48 | ### Changed 49 | 50 | * core: Increased default transcription factor FPKM value to 10 for 51 | screening. 52 | 53 | * core: The motif for MYB (MYBL1 and MYBL2) are similar and treated as the 54 | same gene. 55 | 56 | * core: SNV/indel candidates are sorted by FPKM value. 57 | 58 | ## [1.2.0] - 2019-01-08 59 | 60 | ### Added 61 | 62 | * core: Added argument to set the FPKM threshold for the nomination of 63 | a cis-activated candidate. 64 | 65 | ## [1.1.0] - 2018-12-17 66 | 67 | ### Added 68 | 69 | * core: Added argument to handle markers in CNV/LOH regions. This can either 70 | be `keep` or `drop`. 71 | 72 | * core: Added arguments to set the threshold for the minimal coverage in WGS 73 | and RNA-seq when selecting heterozygous markers. 74 | 75 | ### Fixed 76 | 77 | * seed: Update download location for `GRCh37-lite.fa.gz`. 78 | 79 | ## 1.0.0 - 2018-07-23 80 | 81 | * Initial release 82 | 83 | [1.4.0]: https://github.com/stjude/cis-x/compare/v1.4.0...v1.5.0 84 | [1.4.0]: https://github.com/stjude/cis-x/compare/v1.3.0...v1.4.0 85 | [1.3.0]: https://github.com/stjude/cis-x/compare/v1.2.0...v1.3.0 86 | [1.2.0]: https://github.com/stjude/cis-x/compare/v1.1.0...v1.2.0 87 | [1.1.0]: https://github.com/stjude/cis-x/compare/v1.0.0...v1.1.0 88 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-screen: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CIS_X_HOME=$(realpath $(dirname $0)/../../..) 4 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 5 | 6 | SAMPLE_ID=$1 7 | CANDIDATES_RESULT=$2 8 | CANDIDATES_RESULT_RUN=$3 9 | SV_IN=$4 10 | CNA_IN=$5 11 | SNVINDEL_IN=$6 12 | FPKM_MATRIX=$7 13 | WORKDIR=$8 14 | SV_CAN=$9 15 | CNA_CAN=${10} 16 | SNVINDEL_CAN=${11} 17 | ANNO_USER=${12} 18 | TAD=${13} 19 | 20 | LOG=$WORKDIR/log.txt 21 | 22 | SV_WIN=1000000 23 | CNA_WIN=1000000 24 | CNA_SIZE=5000000 25 | SNVINDEL_WIN=200000 26 | #TF_FPKM_THRESH=3 27 | TF_FPKM_THRESH=10 28 | PERC_OVERLAP=0.3 29 | 30 | #TAD=$CIS_X_HOME/refs/external/hESC.combined.domain.hg19.bed 31 | REFGENE=$CIS_X_HOME/refs/external/hg19_refGene.bed 32 | REF_2BIT=$CIS_X_HOME/refs/external/GRCh37-lite.2bit 33 | MOTIF=$CIS_X_HOME/refs/external/HOCOMOCOv10_HUMAN_mono_meme_format.meme 34 | ROADMAP_ENH=$CIS_X_HOME/refs/external/roadmapData.enhancer.merged.111.bed 35 | ROADMAP_PRO=$CIS_X_HOME/refs/external/roadmapData.promoter.merged.111.bed 36 | ROADMAP_DYA=$CIS_X_HOME/refs/external/roadmapData.dyadic.merged.111.bed 37 | 38 | SV_TEMP1="$WORKDIR/$SAMPLE_ID.sv.candidates.temp1.txt" 39 | CNA_TEMP1="$WORKDIR/$SAMPLE_ID.cna.candidates.temp1.txt" 40 | SNVINDEL_VAR="$WORKDIR/$SAMPLE_ID.snvindel.varlist.txt" 41 | SNVINDEL_SEQLIST="$WORKDIR/$SAMPLE_ID.snvindel.seqlist.txt" 42 | SNVINDEL_FA="$WORKDIR/$SAMPLE_ID.snvindel.fa" 43 | FIMO_FA_IN="$WORKDIR/$SAMPLE_ID.snvindel.fimo.input.fa" 44 | FIMO_OUT="$WORKDIR/fimo_out/fimo.txt" 45 | FIMO_ACC2GSYM="$CIS_X_HOME/refs/external/HOCOMOCOv10_annotation_HUMAN_mono.tsv" 46 | 47 | perl $CIS_X_CORE_HOME/src/scan.sv.pl $SAMPLE_ID $CANDIDATES_RESULT $CANDIDATES_RESULT_RUN $SV_IN $SV_TEMP1 $SV_WIN $REFGENE 48 | perl $CIS_X_CORE_HOME/src/check.TAD.pl $SAMPLE_ID $TAD $REFGENE $SV_TEMP1 $SV_CAN 49 | perl $CIS_X_CORE_HOME/src/scan.cnv.pl $SAMPLE_ID $CANDIDATES_RESULT $CANDIDATES_RESULT_RUN $CNA_IN $CNA_TEMP1 $CNA_WIN $CNA_SIZE $REFGENE $PERC_OVERLAP 50 | perl $CIS_X_CORE_HOME/src/check.TAD.cnv.pl $SAMPLE_ID $TAD $REFGENE $CNA_TEMP1 $CNA_CAN 51 | 52 | perl $CIS_X_CORE_HOME/src/snvindel.prep.pl $SAMPLE_ID \ 53 | $SNVINDEL_IN \ 54 | $CANDIDATES_RESULT \ 55 | $CANDIDATES_RESULT_RUN \ 56 | $SV_CAN \ 57 | $CNA_CAN \ 58 | $TAD \ 59 | $SNVINDEL_VAR \ 60 | $SNVINDEL_SEQLIST \ 61 | $SNVINDEL_WIN \ 62 | $REFGENE 63 | 64 | twoBitToFa -seqList=$SNVINDEL_SEQLIST $REF_2BIT $SNVINDEL_FA 65 | perl $CIS_X_CORE_HOME/src/merge.fa.pl $SAMPLE_ID $SNVINDEL_VAR $SNVINDEL_FA $FIMO_FA_IN 66 | fimo --verbosity 1 --thresh 1e-3 $MOTIF $FIMO_FA_IN 2>> $LOG 67 | 68 | perl $CIS_X_CORE_HOME/src/snvindel.process.pl \ 69 | $SAMPLE_ID \ 70 | $FIMO_OUT \ 71 | $FIMO_ACC2GSYM \ 72 | $SNVINDEL_VAR \ 73 | $FPKM_MATRIX \ 74 | $TF_FPKM_THRESH \ 75 | $SNVINDEL_CAN \ 76 | $ROADMAP_ENH \ 77 | $ROADMAP_PRO \ 78 | $ROADMAP_DYA \ 79 | $ANNO_USER 80 | -------------------------------------------------------------------------------- /src/ref-exp/src/collect.cohort.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my (%sid,%dat,%gene,%g2fpkm,%checksid); 4 | 5 | my $config = $ARGV[0]; 6 | my $workdir = $ARGV[1]; 7 | my $expfile = $ARGV[2]; 8 | 9 | my $infile = $config; 10 | open IN, "< $infile" or die "$infile: $!"; 11 | while() { 12 | chomp; 13 | next if $. == 1; 14 | my @F = split/\t/; 15 | $sid{$F[0]} = 1; 16 | } 17 | close IN; 18 | 19 | my %col2sid; 20 | $infile = $expfile; 21 | open IN, "< $infile" or die "$infile: $!"; 22 | while() { 23 | chomp; 24 | my @F = split/\t/; 25 | if ($. == 1) { 26 | for my $i (7 .. $#F) { 27 | $col2sid{$i} = $F[$i]; 28 | $checksid{$F[$i]} = 1; 29 | } 30 | next; 31 | } 32 | $gene{$F[1]} = 1; 33 | for my $i (7 .. $#F) { 34 | $g2fpkm{$F[1]}{$col2sid{$i}} = $F[$i]; 35 | } 36 | } 37 | close IN; 38 | 39 | my $NoneExistSID = 0; 40 | for my $s (sort keys %sid) { 41 | unless ($checksid{$s}) { 42 | print "$s not exist in the exp matrix $expfile.\n"; 43 | $NoneExistSID = 1; 44 | } 45 | } 46 | 47 | if ($NoneExistSID == 1) { 48 | die("Error: SID printed above not exist in the expression matrix."); 49 | } 50 | 51 | for my $sid (sort keys %sid) { 52 | my $infile = "$workdir/$sid/working_space/$sid.ase.gene.model.fdr.txt"; 53 | if (! -e $infile) { 54 | print "$infile not exist.\n"; 55 | next; 56 | } 57 | open IN, "< $infile" or die "$infile: $!"; 58 | while() { 59 | chomp; 60 | next if $. == 1; 61 | my @F = split/\t/; 62 | next unless $g2fpkm{$F[1]}; 63 | if ($F[22]<0.05 and $F[19]>=0.3) { 64 | $dat{$F[1]}{ase}{sid} .= "$sid,"; 65 | $dat{$F[1]}{ase}{fpkm} .= "$g2fpkm{$F[1]}{$sid},"; 66 | }elsif ($F[22] >= 0.05 and $F[9] == 0) { ### criteria updated on Dec 27, 2017. 67 | $dat{$F[1]}{bi}{sid} .= "$sid,"; 68 | $dat{$F[1]}{bi}{fpkm} .= "$g2fpkm{$F[1]}{$sid},"; 69 | }else { 70 | 1; 71 | } 72 | } 73 | close IN; 74 | } 75 | 76 | my $outfile = "$workdir/cis-X.refexp.step2.collect.txt"; 77 | open OUT, "> $outfile" or die "$outfile: $!"; 78 | print OUT "gene\tpresent.TARGET\tnum.ase.samples\tase.samples\tfpkm.ase.samples\tnum.bi.samples\tbi.samples\tfpkm.bi.samples\n"; 79 | for my $g (sort keys %dat) { 80 | my $present = 0; 81 | my $ase_count = 0; 82 | my $bi_count = 0; 83 | my $ase_sid = ""; 84 | my $bi_sid = ""; 85 | my $fpkm_ase = ""; 86 | my $fpkm_bi = ""; 87 | $present = 1 if $gene{$g}; 88 | if ($dat{$g}{ase}) { 89 | $ase_sid = $dat{$g}{ase}{sid}; 90 | $fpkm_ase = $dat{$g}{ase}{fpkm}; 91 | $ase_sid =~ s/\,$//; 92 | $fpkm_ase =~ s/\,$//; 93 | my @s_a = split(/,/,$ase_sid); 94 | $ase_count = scalar(@s_a); 95 | } 96 | if ($dat{$g}{bi}) { 97 | $bi_sid = $dat{$g}{bi}{sid}; 98 | $fpkm_bi = $dat{$g}{bi}{fpkm}; 99 | $bi_sid =~ s/\,$//; 100 | $fpkm_bi =~ s/\,$//; 101 | my @s_b = split(/,/,$bi_sid); 102 | $bi_count = scalar(@s_b); 103 | } 104 | print OUT "$g\t$present\t$ase_count\t$ase_sid\t$fpkm_ase\t$bi_count\t$bi_sid\t$fpkm_bi\n"; 105 | } 106 | close OUT; 107 | 108 | -------------------------------------------------------------------------------- /src/seed/README.md: -------------------------------------------------------------------------------- 1 | # cis-X seed 2 | 3 | `cis-X seed` downloads and generates a set of common reference files required 4 | by cis-X. 5 | 6 | ## Prerequisites 7 | 8 | * [Ruby] ^2.2.2 9 | * [nokogiri] ~1.8.3 10 | * [faToTwoBit]\* 11 | * [liftOver]\* 12 | 13 | \* UCSC Genome Browser binaries are not versioned. The latest versions 14 | _should_ work. 15 | 16 | [Ruby]: http://ruby-lang.org/ 17 | [nokogiri]: http://www.nokogiri.org/ 18 | [faToTwoBit]: https://genome.ucsc.edu/goldenpath/help/twoBit.html 19 | [liftOver]: https://genome.ucsc.edu/cgi-bin/hgLiftOver 20 | 21 | ## Usage 22 | 23 | ``` 24 | $ cis-X seed [tmp-dir] 25 | ``` 26 | 27 | ## References 28 | 29 | The following files (sans CGC) are created by `cis-X seed`. They are all 30 | required to run cis-X. 31 | 32 | * `cancer_gene_census.txt`: Tiers 1 and 2 [Cancer Gene Census] (CGC) from 33 | COSMIC in TSV. While cis-X is tested with version 87, newer version should 34 | work. This file requires an account and is not automatically downloaded. 35 | 36 | * `GRCh37-lite.2bit`: Converted from [`GRCh37-lite.fa`] to 2bit using 37 | [faToTwoBit]. 38 | 39 | * `hESC.combined.domain.hg19.bed`: Extracted from Hi-C's "[Human ES Cell (H1) topological domains]" 40 | and preprocessed from hg18 to hg19 using [liftOver]. 41 | 42 | * `hg19_refGene`: Downloaded from the [UCSC Table Browser] (assembly: Feb. 43 | 2009 (GRCH37/hg19), track: NCBI RefSeq, table: UCSC RefSeq (refGene)). 44 | 45 | * `hg19_refGene.bed`: Converted from `hg19_refGene` using 46 | `bin/hg19_ref_gene_to_bed`. 47 | 48 | * `HOCOMOCOv10_HUMAN_mono_meme_format.meme`: Downloaded from [HOCOMOCO v10] 49 | (Matrices in other formats > MEME). 50 | 51 | * `HOCOMOCOv10_annotation_HUMAN_mono.tsv`: Downloaded from [HOCOMOCO v10] 52 | (Complete model annotation) 53 | 54 | * `ImprintGenes.txt`: Copied from Geneimprint [Human Imprinted Genes] as a 55 | tab-delimited file. 56 | 57 | * `roadmapData.dyadic.merged.111.bed`: Downloaded from the [NIH Roadmap Epigenomics Project] 58 | (Delineation of DNaseI-accessible regulatory regions > Dyadic). All files 59 | are merged with two extra columns: cell line name and tissue of origin. 60 | 61 | * `roadmapData.enhancer.merged.111.bed`: Downloaded from the [NIH Roadmap Epigenomics Project] 62 | (Delineation of DNaseI-accessible regulatory regions > Enhancer). All files 63 | are merged with two extra columns: cell line name and tissue of origin. 64 | 65 | * `roadmapData.promoter.merged.111.bed`: Downloaded from the [NIH Roadmap Epigenomics Project] 66 | (Delineation of DNaseI-accessible regulatory regions > Promoter). All files 67 | are merged with two extra columns: cell line name and tissue of origin. 68 | 69 | [Cancer Gene Census]: https://cancer.sanger.ac.uk/census 70 | [`GRCh37-lite.fa`]: https://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/special_requests/ 71 | [Human ES Cell (H1) topological domains]: http://chromosome.sdsc.edu/mouse/hi-c/download.html 72 | [UCSC Table Browser]: http://genome.ucsc.edu/cgi-bin/hgTables 73 | [HOCOMOCO v10]: http://hocomoco11.autosome.ru/downloads_v10 74 | [Human Imprinted Genes]: http://www.geneimprint.com/site/genes-by-species.Homo+sapiens 75 | [NIH Roadmap Epigenomics Project]: https://egg2.wustl.edu/roadmap/web_portal/index.html 76 | 77 | ## Example 78 | 79 | `cis-X seed` will commonly be used to seed the `$CIS_X_HOME/refs/external` 80 | directory. 81 | 82 | ``` 83 | $ cis-X seed $CIS_X_HOME/refs/external 84 | ``` 85 | -------------------------------------------------------------------------------- /dnanexus/cis-x/src/cis-x.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | main() { 4 | set -x 5 | 6 | REFERENCE_DATA_PROJECT_ID=project-F5444K89PZxXjBqVJ3Pp79B4 7 | DATA_DIR=$HOME/data 8 | REFS_DIR=$HOME/refs 9 | RESULTS_DIR=$HOME/results 10 | CIS_X_EXTRA_ARGS="" 11 | 12 | mkdir $DATA_DIR $REFS_DIR $RESULTS_DIR 13 | 14 | gzip --decompress --stdout $RESOURCES/tmp/cis-x-latest.tar.gz | docker load 15 | 16 | dx download --output $DATA_DIR/wgs.markers.txt "$markers" 17 | dx download --output $DATA_DIR/wgs.cnvloh.txt "$cnv_loh" 18 | dx download --output $DATA_DIR/RNAseq.bam "$bam" 19 | dx download --output $DATA_DIR/RNAseq.bam.bai "$bai" 20 | dx download --output $DATA_DIR/RNAseq_all_fpkm.txt "$fpkm_matrix" 21 | dx download --output $DATA_DIR/mut.txt "$snv_indel" 22 | dx download --output $DATA_DIR/sv.txt "$sv" 23 | dx download --output $DATA_DIR/cna.txt "$cna" 24 | 25 | if [[ ! -z "$user_annotation" ]]; then 26 | dx download --output $DATA_DIR/user_annotation.bed "$user_annotation" 27 | CIS_X_EXTRA_ARGS="$CIS_X_EXTRA_ARGS -u $DATA_DIR/user_annotation.bed" 28 | fi 29 | 30 | if [[ ! -z "$tad_info" ]]; then 31 | dx download --output $DATA_DIR/tad_info.bed "$tad_info" 32 | CIS_X_EXTRA_ARGS="$CIS_X_EXTRA_ARGS -t $DATA_DIR/tad_info.bed" 33 | fi 34 | 35 | dx download --recursive --output $REFS_DIR "$REFERENCE_DATA_PROJECT_ID:/pipeline/cis-X/*" 36 | 37 | docker run \ 38 | --mount type=bind,source=$DATA_DIR,target=/data,readonly \ 39 | --mount type=bind,source=$REFS_DIR,target=/opt/cis-x/refs/external,readonly \ 40 | --mount type=bind,source=$RESULTS_DIR,target=/results \ 41 | cis-x \ 42 | run \ 43 | -s $sample_id \ 44 | -o /results \ 45 | -l /data/wgs.markers.txt \ 46 | -g /data/wgs.cnvloh.txt \ 47 | -b /data/RNAseq.bam \ 48 | -e /data/RNAseq_all_fpkm.txt \ 49 | -m /data/mut.txt \ 50 | -v /data/sv.txt \ 51 | -c /data/cna.txt \ 52 | -d $disease \ 53 | -a $cnv_loh_action \ 54 | -w $min_coverage_wgs \ 55 | -r $min_coverage_rna_seq \ 56 | -f $fpkm_threshold_candidate \ 57 | -h $chr_string $CIS_X_EXTRA_ARGS 58 | 59 | cis_activated_candidates=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.cisActivated.candidates.txt) 60 | sv_candidates=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.sv.candidates.txt) 61 | cna_candidates=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.cna.candidates.txt) 62 | snv_indel_candidates=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.snvindel.candidates.txt) 63 | ohe_results=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.OHE.results.txt) 64 | ase_gene_results=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.ase.gene.model.fdr.txt) 65 | ase_marker_results=$(dx upload --brief $RESULTS_DIR/$sample_id/$sample_id.ase.combine.WGS.RNAseq.goodmarkers.binom.txt) 66 | 67 | dx-jobutil-add-output --class file cis_activated_candidates "$cis_activated_candidates" 68 | dx-jobutil-add-output --class file sv_candidates "$sv_candidates" 69 | dx-jobutil-add-output --class file cna_candidates "$cna_candidates" 70 | dx-jobutil-add-output --class file snv_indel_candidates "$snv_indel_candidates" 71 | dx-jobutil-add-output --class file ohe_results "$ohe_results" 72 | dx-jobutil-add-output --class file ase_gene_results "$ase_gene_results" 73 | dx-jobutil-add-output --class file ase_marker_results "$ase_marker_results" 74 | } 75 | -------------------------------------------------------------------------------- /src/other/mergeData_geneName.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | use strict; 3 | 4 | my $input = $ARGV[0]; 5 | my $GTF = $ARGV[1]; 6 | 7 | my $flst = "file.lst"; 8 | my $fbase = "RNAseq_GENCODEv19"; 9 | 10 | system "echo $input > $flst"; 11 | 12 | open(IN, "<$flst"); 13 | my @flst = ; 14 | close (IN); 15 | chomp @flst; 16 | 17 | my %sum = (); 18 | my %count = (); 19 | my @samples = (); 20 | foreach my $f (@flst) { 21 | my @a = split(/\//, $f); 22 | my $nm = $a[$#a]; 23 | $nm =~ s/counts//g; 24 | $nm =~ s/^\.//g; 25 | $nm =~ s/\.txt//g; 26 | push @samples, $nm; 27 | open(IN, "<$f"); 28 | while (my $line = ) { 29 | chomp $line; 30 | my @a = split(/\t/, $line); 31 | if ($a[0] eq '__no_feature') {$count{$nm}{'no_feature'} = $a[1]; last;} 32 | $count{$nm}{$a[0]} = $a[1]; 33 | $sum{$nm} += $a[1]; 34 | } 35 | close (IN); 36 | } 37 | 38 | open(OUT1, ">${fbase}_mRNA_count.txt"); 39 | open(OUT2, ">${fbase}_lincRNA_count.txt"); 40 | open(OUT3, ">${fbase}_antisense_count.txt"); 41 | open(OUT, ">${fbase}_all_count.txt"); 42 | print OUT1 "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 43 | print OUT2 "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 44 | print OUT3 "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 45 | print OUT "GeneID\tGeneName\tType\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 46 | open(FPM1, ">${fbase}_mRNA_fpkm.txt"); 47 | open(FPM2, ">${fbase}_lincRNA_fpkm.txt"); 48 | open(FPM3, ">${fbase}_antisense_fpkm.txt"); 49 | open(FPM, ">${fbase}_all_fpkm.txt"); 50 | print FPM1 "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 51 | print FPM2 "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 52 | print FPM3 "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 53 | print FPM "GeneID\tGeneName\tType\tStatus\tChr\tStart\tEnd\t", join("\t", @samples), "\n"; 54 | open(IN, "< $GTF"); ### for v19. 55 | while (my $line = ) { 56 | chomp $line; 57 | my @a = split(/\t/, $line); 58 | my $ip = 0; 59 | foreach my $s (@samples) { 60 | if ($count{$s}{$a[0]} > 0) {$ip = 1; last;} 61 | } 62 | if ($ip == 0) {next;} 63 | my @tmp1 = (); 64 | my @tmp2 = (); 65 | print OUT "$a[2]\t$a[0]\t$a[3]\t$a[4]\t$a[5]\t$a[6]\t$a[7]"; 66 | print FPM "$a[2]\t$a[0]\t$a[3]\t$a[4]\t$a[5]\t$a[6]\t$a[7]"; 67 | foreach my $s (@samples) { 68 | print OUT "\t$count{$s}{$a[0]}"; 69 | push @tmp1, $count{$s}{$a[0]}; 70 | my $fpkm = sprintf("%.4f", 1000000000*$count{$s}{$a[0]}/($a[$#a]*$sum{$s})); 71 | print FPM "\t$fpkm"; 72 | push @tmp2, $fpkm; 73 | } 74 | print OUT "\n"; 75 | print FPM "\n"; 76 | if ($a[3] =~ /protein_coding/) { 77 | print OUT1 "$a[2]\t$a[0]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\t"; 78 | print OUT1 join("\t", @tmp1), "\n"; 79 | print FPM1 "$a[2]\t$a[0]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\t"; 80 | print FPM1 join("\t", @tmp2), "\n"; 81 | } 82 | if ($a[3] =~ /lincRNA/) { 83 | print OUT2 "$a[2]\t$a[0]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\t"; 84 | print OUT2 join("\t", @tmp1), "\n"; 85 | print FPM2 "$a[2]\t$a[0]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\t"; 86 | print FPM2 join("\t", @tmp2), "\n"; 87 | } 88 | if ($a[3] =~ /antisense/) { 89 | print OUT3 "$a[2]\t$a[0]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\t"; 90 | print OUT3 join("\t", @tmp1), "\n"; 91 | print FPM3 "$a[2]\t$a[0]\t$a[4]\t$a[5]\t$a[6]\t$a[7]\t"; 92 | print FPM3 join("\t", @tmp2), "\n"; 93 | } 94 | } 95 | close (IN); 96 | close (OUT); 97 | close (OUT1); 98 | close (OUT2); 99 | close (OUT3); 100 | close (FPM); 101 | close (FPM1); 102 | close (FPM2); 103 | close (FPM3); 104 | 105 | open(OUT, ">nofeature-summary.txt"); 106 | print OUT "Sample\tNo-Feature\tMapped\n"; 107 | foreach my $s (@samples) { 108 | print OUT "$s\t$count{$s}{'no_feature'}\t$sum{$s}\n"; 109 | } 110 | 111 | -------------------------------------------------------------------------------- /src/ref-exp/README.md: -------------------------------------------------------------------------------- 1 | # cis-X ref-exp 2 | 3 | **cis-X ref-exp** generates reference expression matrices used for outlier 4 | high expression (OHE) tests. 5 | 6 | cis-X uses precalculated reference expression matrices for finding outlier high 7 | expression (OHE) signals, which is disease specific. cis-X includes references 8 | for pediatric neuroblastoma (NBL) and T-cell acute lymphoblastic leukaemia 9 | (T-ALL), but user-defined references can be added as well. 10 | 11 | This command helps generate the biallelic expression cases as described 12 | below (`exp.ref.bi.txt`). 13 | 14 | ## Usage 15 | 16 | ``` 17 | cis-X-ref-exp 18 | 19 | USAGE: 20 | cis-X ref-exp [args...] 21 | 22 | SUBCOMMANDS: 23 | generate Generate a biallelic reference expression matrix 24 | prepare Create a batch script for preprocessing inputs 25 | preprocess Runs allelic specific expression (ASE) tests on inputs 26 | ``` 27 | 28 | ## Reference matrices 29 | 30 | cis-X performs independent tests with three reference expression matrices per 31 | disease: 32 | 33 | * `exp.ref.entire.txt`: the unfiltered cohort. 34 | * `exp.ref.bi.txt`: cases with a biallelic expression for a given gene. 35 | * `exp.ref.white.txt`: cases without known noncoding regulatory variants for 36 | a given gene. 37 | 38 | Reference matrices are tab-delimited files, including a header. 39 | 40 | * `exp.ref.entire.txt`: gene_id, gene_name, type, status, chr, start, end, id... 41 | * `exp.ref.bi.txt`: gene_name, num_cases, id, fpkm 42 | * `exp.ref.white.txt`: gene_name, num_cases, id, fpkm 43 | 44 | An extra file `precal.tvalue.bin_gt1.txt` is also used, which contains a 45 | line-delimited vector of normalized t-values across samples. 46 | 47 | If there is no prior knowledge, it is valid to create a reference matrix with 48 | no rows (but include the header). Note that having both empty biallelic 49 | expression and whitelist matrices will result in higher false negative rates 50 | for cis-activated candidates during analysis. 51 | 52 | Each disease under `$CIS_X_HOME/refs/diseases/$DISEASE` must have these four 53 | files. `$DISEASE` is the name given when running `cis-X run`. See the `NBL` and 54 | `TALL` directories for examples of the reference expression matrices and 55 | normalized t-values vector. 56 | 57 | ## Example 58 | 59 | `cis-X ref-exp` will commonly be used in a three step process: prepare, 60 | preprocess, and generate. 61 | 62 | ### `prepare` 63 | 64 | The preparation stage creates a batch script from a list of inputs. 65 | 66 | ``` 67 | $ cis-X ref-exp prepare /path/to/config.txt /results true 68 | ``` 69 | 70 | It requires a tab-delimited configuration file with four columns: 71 | 72 | * sample_id 73 | * markers: path to a list of single nucleotide markers 74 | * rna_bam: path to a RNA-Seq BAM file 75 | * cnv_loh: path to CNV/LOH regions 76 | 77 | The last argument `chr-string` is a boolean (`TRUE` or `FALSE`) for whether 78 | the reference sequence dictionary names start with "chr". 79 | 80 | The resulting batch script is saved to 81 | `$RESULTS_DIR/cis-X.refexp.step1.commands.sh`. 82 | 83 | ### `preprocess` 84 | 85 | It is unlikely that the `preprocess` subcommand will be called manually, as 86 | the resulting batch script from the `prepare` stage creates a list of 87 | commands that calls it with inputs from the configuration file. This batch 88 | can be submitted to a job runner or executed as a normal script. 89 | 90 | ``` 91 | $ bash /results/cis-X.refexp.step1.commands.sh 92 | ``` 93 | 94 | ### `generate` 95 | 96 | The final generation stage creates a biallelic expression matrix from the 97 | preprocessed outputs. 98 | 99 | ``` 100 | $ cis-X ref-exp generate /path/to/config.txt /results /path/to/gene-exp-table.txt 101 | ``` 102 | 103 | To use this with cis-X, copy the output in `$RESULTS_DIR/refexp` to 104 | `$CIS_X_HOME/refs/diseases/$DISEASE`, where `$DISEASE` is any name. Copy 105 | `$GENE_EXP_TABLE` to the same disease directory as the unfiltered cohort, 106 | named `exp.ref.entire.txt` 107 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | # fimo: libxml2-dev libxslt1-dev zlib1g-dev 4 | # multtest: r-base build-essential 5 | # nokogiri: build-essential ruby-dev zlib1g-dev liblzma-dev 6 | # twoBitToFa: libkrb5-3 7 | # 8 | # Set the timezone before updating to avoid having to interact with tzdata (r-base dep). 9 | RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime \ 10 | && apt-get update \ 11 | && apt-get install -y \ 12 | # common 13 | build-essential \ 14 | libkrb5-3 \ 15 | wget \ 16 | zlib1g-dev \ 17 | # bedtools 18 | python-minimal \ 19 | # variants2matrix 20 | openjdk-8-jre-headless \ 21 | # ase, test-outliers 22 | r-base \ 23 | # screen 24 | libxml2-dev \ 25 | libxslt1-dev \ 26 | zlib1g-dev \ 27 | # seed 28 | liblzma-dev \ 29 | ruby \ 30 | ruby-dev \ 31 | && rm -r /var/lib/apt/lists/* 32 | 33 | # core 34 | 35 | RUN wget -O - https://cpanmin.us | perl - App::cpanminus \ 36 | && cpanm Data::Compare \ 37 | && chown --recursive root:root /root/.cpanm 38 | 39 | COPY src/other/meme_glam2_fix_new_gcc.patch tmp/ 40 | 41 | RUN cd /tmp \ 42 | && wget http://meme-suite.org/meme-software/4.9.0/meme_4.9.0_4.tar.gz \ 43 | && echo "3feed2e28a5d17aa5fc04e226b7473a0d5a443055993365bf2116708be68c7fe *meme_4.9.0_4.tar.gz" | sha256sum --check \ 44 | && tar xf meme_4.9.0_4.tar.gz \ 45 | && cd meme_4.9.0 \ 46 | && patch -p1 < /tmp/meme_glam2_fix_new_gcc.patch \ 47 | && ./configure --prefix=/opt/meme \ 48 | && make -j$(nproc) \ 49 | && make install \ 50 | && rm -r /tmp/meme* 51 | 52 | RUN cd /tmp \ 53 | && wget https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools-2.28.0.tar.gz \ 54 | && echo "15af6d10ed28fb3113cd3edce742fd4275f224bc06ecb98d70d869940220bc32 *bedtools-2.28.0.tar.gz" | sha256sum --check \ 55 | && tar xf bedtools-2.28.0.tar.gz \ 56 | && cd bedtools2 \ 57 | && make -j$(nproc) \ 58 | && cp bin/* /usr/local/bin \ 59 | && rm -r /tmp/bedtools* 60 | 61 | RUN echo 'source("https://raw.githubusercontent.com/Bioconductor/LegacyInstall/827129e25128453f19a61ce0e8f99d903155ad01/biocLite.R"); biocLite("multtest")' \ 62 | | R --vanilla 63 | 64 | RUN cd /usr/local/bin \ 65 | && wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v385/twoBitToFa \ 66 | && chmod +x twoBitToFa 67 | 68 | # seed 69 | 70 | RUN gem install --no-document nokogiri --version 1.12.5 71 | 72 | RUN cd /usr/local/bin \ 73 | && wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v385/faToTwoBit \ 74 | && chmod +x faToTwoBit 75 | 76 | RUN cd /usr/local/bin \ 77 | && wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v385/liftOver \ 78 | && chmod +x liftOver 79 | 80 | # variants2matrix 81 | 82 | ENV V2M_HOME /opt/variants2matrix 83 | ENV PERL5LIB ${V2M_HOME}/lib/perl 84 | ENV CLASSPATH ${V2M_HOME}/lib/java/bambino-1.0.jar:${V2M_HOME}/lib/java/indelxref-1.0.jar:${V2M_HOME}/lib/java/picard.jar:${V2M_HOME}/lib/java/samplenamelib-1.0.jar 85 | 86 | RUN cd /tmp \ 87 | && wget http://ftp.stjude.org/pub/software/cis-x/variants2matrix.tar.gz \ 88 | && echo "6502f1bd5d8ec64d357092c21b5eb3b9cefc135a41b8b0d0d3124c2ba2f80311 *variants2matrix.tar.gz" | sha256sum --check \ 89 | && tar xf variants2matrix.tar.gz --directory /opt --no-same-owner \ 90 | && rm variants2matrix.tar.gz 91 | 92 | RUN cd /tmp \ 93 | && wget https://sjr-redesign.stjude.org/content/dam/research-redesign/labs/zhang-lab/cis-x-refs-20200212.tar.gz \ 94 | && echo "1074dd48157cd00dc407ff06e0bca01c0546d1886e6c1f6fb7d25e1d42b060c0 *cis-x-refs-20200212.tar.gz" | sha256sum --check \ 95 | && mkdir -p /opt/cis-x/refs \ 96 | && tar xf cis-x-refs-20200212.tar.gz --strip-components 1 --directory /opt/cis-x/refs --no-same-owner \ 97 | && rm cis-x-refs-20200212.tar.gz 98 | 99 | # set for ruby 100 | ENV LANG C.UTF-8 101 | 102 | ENV PATH /opt/cis-x/bin:/opt/meme/bin:${V2M_HOME}/bin:${PATH} 103 | 104 | COPY bin /opt/cis-x/bin 105 | COPY src /opt/cis-x/src 106 | 107 | ENTRYPOINT ["/opt/cis-x/bin/cis-X"] 108 | -------------------------------------------------------------------------------- /src/core/src/01.get.markder.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | my $sid = $ARGV[0]; 3 | my $high20 = $ARGV[1]; ### high20 from WGS. 4 | my $cnvloh_in = $ARGV[2]; 5 | my $snv4_out = $ARGV[3]; 6 | my $het_out = $ARGV[4]; 7 | my $bad_lst = $ARGV[5]; 8 | my $covg = $ARGV[6]; 9 | 10 | my $upper = 0.7; 11 | my $lower = 0.3; 12 | #my $covg = 10; 13 | my %badlst = (); 14 | my %chrom = (); 15 | my %col2snv = (); 16 | my %cnvloh = (); 17 | 18 | for my $i (1 .. 22) { 19 | my $c = "chr" . $i; 20 | $chrom{$c} = 1; 21 | } 22 | 23 | my $infile = $bad_lst; 24 | open IN, "< $infile" or die "$infile: $!"; 25 | while() { 26 | chomp; 27 | next if $. == 1; 28 | $badlst{$_} = 1; 29 | } 30 | close IN; 31 | 32 | $infile = $cnvloh_in; 33 | open IN, "< $infile" or die "$infile: $!"; 34 | while() { 35 | chomp; 36 | next if $. == 1; 37 | my @F = split/\t/; 38 | my $chr = $F[0]; 39 | unless ($chr =~ /^chr/) { 40 | $chr = "chr" . $chr; 41 | } 42 | my $id = "$chr.$F[1].$F[2]"; 43 | $cnvloh{chrom}{$chr}{$id} = 1; 44 | $cnvloh{region}{$id}{chr} = $chr; 45 | $cnvloh{region}{$id}{start} = $F[1]; 46 | $cnvloh{region}{$id}{end} = $F[2]; 47 | } 48 | close IN; 49 | 50 | #my $outfile = "$sid.heterozygous.txt"; 51 | my $outfile = $het_out; 52 | open OUT, "> $outfile" or die "$outfile: $!"; 53 | open SNV4OUT, "> $snv4_out" or die "$snv4_out: $!"; 54 | print OUT "chrom\tposition\tref\tmut\tref_T\tref_G\tmut_T\tmut_G\tcnvlohTag\n"; 55 | open H20, "< $high20" or die "$high20: $!"; 56 | while() { 57 | chomp; 58 | my @F = split/\t/; 59 | if ($. == 1) { 60 | for my $i (0 .. $#F) { 61 | if ($F[$i] eq "Chr") { 62 | $col2snv{chr} = $i; 63 | } 64 | if ($F[$i] eq "Pos") { 65 | $col2snv{pos} = $i; 66 | } 67 | if ($F[$i] eq "reference_tumor_count") { 68 | $col2snv{ref_tum_num} = $i; 69 | } 70 | if ($F[$i] eq "alternative_tumor_count") { 71 | $col2snv{mut_tum_num} = $i; 72 | } 73 | if ($F[$i] eq "Chr_Allele") { 74 | $col2snv{ref_g} = $i; 75 | } 76 | if ($F[$i] eq "Alternative_Allele") { 77 | $col2snv{mut_g} = $i; 78 | } 79 | if ($F[$i] eq "reference_normal_count") { 80 | $col2snv{ref_norm_num} = $i; 81 | } 82 | if ($F[$i] eq "alternative_normal_count") { 83 | $col2snv{mut_norm_num} = $i; 84 | } 85 | } 86 | next; 87 | } 88 | # next unless $F[5] eq "SNP"; 89 | next unless ($F[$col2snv{ref_g}] =~ /[ATCG]/ and $F[$col2snv{mut_g}] =~ /[ATCG]/); ### make sure only SNP was included. 90 | my $chrom_i = $F[$col2snv{chr}]; 91 | unless ($chrom_i =~ /^chr/) { 92 | $chrom_i = "chr" . $chrom_i; 93 | } 94 | next unless $chrom{$chrom_i}; 95 | my $cvg = $F[$col2snv{ref_tum_num}] + $F[$col2snv{mut_tum_num}]; 96 | next unless $cvg >= $covg; 97 | next unless ($F[$col2snv{ref_tum_num}] >= 3 and $F[$col2snv{mut_tum_num}] >= 3); ### Require minimal 3 reads for each allele. This is for low covg sites, like 8. 2019-04-08. 98 | my $snv4 = "$chrom_i.$F[$col2snv{pos}].$F[$col2snv{ref_g}].$F[$col2snv{mut_g}]"; 99 | next if $badlst{$snv4}; ## drop the BAD markers. 100 | 101 | ### filter markers in cnv-loh regions 102 | ### updated 2018-12-04. no filter at this stage, give a tag instead indicating if a marker sits inside cnvloh region. 103 | # my $hit = 0; 104 | my $tag = "diploid"; 105 | for my $id (sort keys %{$cnvloh{chrom}{$chrom_i}}) { 106 | if ($F[$col2snv{pos}] >= $cnvloh{region}{$id}{start} and $F[$col2snv{pos}] <= $cnvloh{region}{$id}{end}) { 107 | # $hit = 1; 108 | $tag = "cnvloh"; 109 | } 110 | } 111 | # next if $hit == 1; 112 | ### end of cnv-loh filter 113 | 114 | my $maf = $F[$col2snv{mut_tum_num}] / $cvg; 115 | if ($lower <= $maf and $maf <= $upper) { 116 | print OUT "$chrom_i\t$F[$col2snv{pos}]\t$F[$col2snv{ref_g}]\t$F[$col2snv{mut_g}]\t$F[$col2snv{ref_tum_num}]\t$F[$col2snv{ref_norm_num}]\t$F[$col2snv{mut_tum_num}]\t$F[$col2snv{mut_norm_num}]\t$tag\n"; 117 | print SNV4OUT "$snv4\n"; 118 | } 119 | } 120 | close H20; 121 | close OUT; 122 | close SNV4OUT; 123 | 124 | -------------------------------------------------------------------------------- /src/core/src/07.gene.model.Oct2017.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $input = $ARGV[1]; 5 | my $refgene = $ARGV[2]; 6 | my $ai_thresh_di = $ARGV[3]; 7 | my $ai_thresh_cnv = $ARGV[4]; 8 | my $pvalue = $ARGV[5]; 9 | my $cnv_loh_action = $ARGV[6]; 10 | my $output = $ARGV[7]; 11 | my $covg_rna = $ARGV[8]; 12 | 13 | $covg_rna = 3 if $covg_rna < 3; ### 2019-04-08. 14 | 15 | my (%gene,@gene,%chr2g,%g2ase,$head); 16 | my $infile = $refgene; 17 | open IN, "< $infile" or die "$infile: $!"; 18 | while() { 19 | chomp; 20 | next if $. == 1; 21 | my @F = split/\t/; 22 | push @gene, $F[1]; 23 | $chr2g{$F[2]}{$F[1]} = 1; 24 | $gene{$F[1]}{chrom} = $F[2]; 25 | $gene{$F[1]}{strand} = $F[3]; 26 | $gene{$F[1]}{start} = $F[4]; 27 | $gene{$F[1]}{end} = $F[5]; 28 | $gene{$F[1]}{name} = $F[12]; 29 | $gene{$F[1]}{cdsstartstat} = $F[13]; 30 | $gene{$F[1]}{cdsendstat} = $F[14]; 31 | } 32 | close IN; 33 | 34 | open IN, "< $input" or die "$input: $!"; 35 | while() { 36 | chomp; 37 | if ($. == 1) { 38 | $head = $_; 39 | next; 40 | } 41 | my @F = split/\t/; 42 | my $ai = $F[12]; 43 | my $pval = $F[11]; 44 | my $tag = $F[8]; 45 | my $chrom = $F[0]; 46 | my $pos = $F[1]; 47 | my $maf_rna = $F[7]; 48 | my $ase = "no"; 49 | my $snv4 = "$F[0].$F[1].$F[2].$F[3]"; 50 | next unless $F[6] >= $covg_rna; ### 2019-04-08. 51 | if ($cnv_loh_action eq "drop" and $tag eq "cnvloh") { 52 | next; 53 | } 54 | if ($tag eq "diploid") { 55 | if ($ai >= $ai_thresh_di and $pval < $pvalue) { 56 | $ase = "yes"; 57 | } 58 | }elsif ($tag eq "cnvloh") { 59 | if ($ai >= $ai_thresh_cnv and $pval < $pvalue) { 60 | $ase = "yes"; 61 | } 62 | }else { 63 | print "Error: wrong tag $tag for $snv4.\n"; 64 | } 65 | for my $g (keys %{$chr2g{$chrom}}) { 66 | if ($pos >= $gene{$g}{start} and $pos <= $gene{$g}{end}) { 67 | $g2ase{$g}{$snv4}{ai} = $ai; 68 | $g2ase{$g}{$snv4}{ase} = $ase; 69 | $g2ase{$g}{$snv4}{pval} = $pval; 70 | $g2ase{$g}{$snv4}{tag} = $tag; 71 | $g2ase{$g}{$snv4}{mafrna} = $maf_rna; 72 | } 73 | } 74 | } 75 | close IN; 76 | 77 | my $outfile = $output; 78 | open OUT, "> $outfile" or die "$outfile: $!"; 79 | print OUT "gene\tgsym\tchrom\tstrand\tstart\tend\tcdsStartStat\tcdsEndStat\tmarkers\tase_markers\taverage_ai_all\taverage_ai_ase\tpval_all_markers\tpval_ase_markers\tai_all_markers\tai_ase_markers\ttag_all_markers\tmaf_rna_all_markers\n"; 80 | for my $g (@gene) { 81 | my $markers = 0; 82 | my $ase_markers = 0; 83 | my $avg_all = "na"; 84 | my $avg_ase = "na"; 85 | my $p_all = "na"; 86 | my $p_ase = "na"; 87 | my $ai_all = "na"; 88 | my $ai_ase = "na"; 89 | my $tag_all = "na"; 90 | my $maf_rna_all = "na"; 91 | if ($g2ase{$g}) { 92 | my @markers = sort keys %{$g2ase{$g}}; 93 | my $sum1 = 0; 94 | my $sum2 = 0; 95 | for my $m (@markers) { 96 | $markers++; 97 | $sum1 += $g2ase{$g}{$m}{ai}; 98 | if ($p_all eq "na") { 99 | $p_all = $g2ase{$g}{$m}{pval}; 100 | $ai_all = $g2ase{$g}{$m}{ai}; 101 | $tag_all = $g2ase{$g}{$m}{tag}; 102 | $maf_rna_all = $g2ase{$g}{$m}{mafrna}; 103 | }else { 104 | $p_all .= ",$g2ase{$g}{$m}{pval}"; 105 | $ai_all .= ",$g2ase{$g}{$m}{ai}"; 106 | $tag_all .= ",$g2ase{$g}{$m}{tag}"; 107 | $maf_rna_all .= ",$g2ase{$g}{$m}{mafrna}"; 108 | } 109 | if ($g2ase{$g}{$m}{ase} eq "yes") { 110 | $ase_markers++; 111 | $sum2 += $g2ase{$g}{$m}{ai}; 112 | if ($p_ase eq "na") { 113 | $p_ase = $g2ase{$g}{$m}{pval}; 114 | $ai_ase = $g2ase{$g}{$m}{ai}; 115 | }else { 116 | $p_ase .= ",$g2ase{$g}{$m}{pval}"; 117 | $ai_ase .= ",$g2ase{$g}{$m}{ai}"; 118 | } 119 | } 120 | } 121 | $avg_all = sprintf("%.3f",$sum1/$markers); 122 | if ($ase_markers > 0) { 123 | $avg_ase = sprintf("%.3f",$sum2/$ase_markers); 124 | } 125 | } 126 | print OUT "$g\t$gene{$g}{name}\t$gene{$g}{chrom}\t$gene{$g}{strand}\t$gene{$g}{start}\t$gene{$g}{end}\t$gene{$g}{cdsstartstat}\t$gene{$g}{cdsendstat}\t$markers\t$ase_markers\t$avg_all\t$avg_ase\t$p_all\t$p_ase\t$ai_all\t$ai_ase\t$tag_all\t$maf_rna_all\n"; 127 | } 128 | close OUT; 129 | -------------------------------------------------------------------------------- /src/seed/bin/cis-X-seed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | CIS_X_SEED_HOME=$(realpath $(dirname $0)/..) 6 | 7 | OUT_DIR=$1 8 | TMP_DIR=${2:-/tmp} 9 | 10 | if [ $# -lt 1 ]; then 11 | basename $0 12 | echo 13 | echo "USAGE:" 14 | echo " cis-X seed [tmp-dir]" 15 | exit 1 16 | fi 17 | 18 | if [ ! -f $OUT_DIR/GRCh37-lite.2bit ]; then 19 | wget --directory-prefix $TMP_DIR \ 20 | https://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/special_requests/GRCh37-lite.fa.gz 21 | # wget --directory-prefix $TMP_DIR \ 22 | # https://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/special_requests/GRCh37-lite.md5.checksum 23 | # md5sum --check $TMP_DIR/GRCh37-lite.fa.gz.md5 24 | gzip --decompress $TMP_DIR/GRCh37-lite.fa.gz 25 | faToTwoBit $TMP_DIR/GRCh37-lite.fa $OUT_DIR/GRCh37-lite.2bit 26 | fi 27 | 28 | if [ ! -f $OUT_DIR/hg19_refGene ]; then 29 | wget --output-document $OUT_DIR/hg19_refGene \ 30 | --post-data 'clade=mammal&org=Human&db=hg19&hgta_group=genes&hgta_track=refSeqComposite&hgta_table=refGene&hgta_regionType=genome&position=chr21%3A33031597-33041570&hgta_outputType=primaryTable&boolshad.sendToGalaxy=0&boolshad.sendToGreat=0&boolshad.sendToGenomeSpace=0&hgta_outFileName=hg19_refGene&hgta_compressType=none&hgta_doTopSubmit=get+output' \ 31 | http://genome.ucsc.edu/cgi-bin/hgTables 32 | fi 33 | 34 | if [ ! -f $OUT_DIR/hg19_refGene.bed ]; then 35 | $CIS_X_SEED_HOME/bin/hg19_ref_gene_to_bed $OUT_DIR/hg19_refGene > $OUT_DIR/hg19_refGene.bed 36 | fi 37 | 38 | if [ ! -f $OUT_DIR/HOCOMOCOv10_HUMAN_mono_meme_format.meme ]; then 39 | wget --directory-prefix $OUT_DIR \ 40 | http://hocomoco10.autosome.ru/final_bundle/HUMAN/mono/HOCOMOCOv10_HUMAN_mono_meme_format.meme 41 | fi 42 | 43 | if [ ! -f $OUT_DIR/HOCOMOCOv10_annotation_HUMAN_mono.tsv ]; then 44 | wget --directory-prefix $OUT_DIR \ 45 | http://hocomoco10.autosome.ru/final_bundle/HUMAN/mono/HOCOMOCOv10_annotation_HUMAN_mono.tsv 46 | fi 47 | 48 | if [ ! -f $OUT_DIR/hESC.combined.domain.hg19.bed ]; then 49 | wget --directory-prefix $TMP_DIR http://chromosome.sdsc.edu/mouse/hi-c/hESC.domain.tar.gz 50 | tar xf $TMP_DIR/hESC.domain.tar.gz --directory $TMP_DIR 51 | 52 | wget --directory-prefix $TMP_DIR \ 53 | http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz 54 | 55 | liftOver \ 56 | $TMP_DIR/hESC/combined/total.combined.domain \ 57 | $TMP_DIR/hg18ToHg19.over.chain.gz \ 58 | $OUT_DIR/hESC.combined.domain.hg19.bed \ 59 | /dev/null 60 | fi 61 | 62 | if [ ! -f $OUT_DIR/ImprintGenes.txt ]; then 63 | $CIS_X_SEED_HOME/bin/scrape_geneimprint \ 64 | http://www.geneimprint.com/site/genes-by-species.Homo+sapiens \ 65 | > $OUT_DIR/ImprintGenes.txt 66 | fi 67 | 68 | if [ ! -f $OUT_DIR/roadmapData.promoter.merged.111.bed ] \ 69 | || [ ! -f $OUT_DIR/roadmapData.enhancer.mergeducsc.111.bed ] \ 70 | || [ ! -f $OUT_DIR/roadmapData.dyadic.merged.111.bed ] 71 | then 72 | wget --output-document $TMP_DIR/jul2013.roadmapData.qc.csv \ 73 | https://docs.google.com/spreadsheets/d/1yikGx4MsO9Ei36b64yOy9Vb6oPC5IBGlFbYEt-N6gOM/export?format=csv 74 | fi 75 | 76 | if [ ! -f $OUT_DIR/roadmapData.promoter.merged.111.bed ]; then 77 | wget --no-directories --no-parent --recursive --level 1 --accept bed.gz --directory-prefix $TMP_DIR/prom \ 78 | https://egg2.wustl.edu/roadmap/data/byDataType/dnase/BED_files_prom/ 79 | gzip --decompress $TMP_DIR/prom/*.bed.gz 80 | $CIS_X_SEED_HOME/bin/merge_roadmap \ 81 | $TMP_DIR/prom \ 82 | $TMP_DIR/jul2013.roadmapData.qc.csv \ 83 | > $OUT_DIR/roadmapData.promoter.merged.111.bed 84 | fi 85 | 86 | if [ ! -f $OUT_DIR/roadmapData.enhancer.merged.111.bed ]; then 87 | wget --no-directories --no-parent --recursive --level 1 --accept bed.gz --directory-prefix $TMP_DIR/enh \ 88 | https://egg2.wustl.edu/roadmap/data/byDataType/dnase/BED_files_enh/ 89 | gzip --decompress $TMP_DIR/enh/*.bed.gz 90 | $CIS_X_SEED_HOME/bin/merge_roadmap \ 91 | $TMP_DIR/enh \ 92 | $TMP_DIR/jul2013.roadmapData.qc.csv \ 93 | > $OUT_DIR/roadmapData.enhancer.merged.111.bed 94 | fi 95 | 96 | if [ ! -f $OUT_DIR/roadmapData.dyadic.merged.111.bed ]; then 97 | wget --no-directories --no-parent --recursive --level 1 --accept bed.gz --directory-prefix $TMP_DIR/dyadic \ 98 | https://egg2.wustl.edu/roadmap/data/byDataType/dnase/BED_files_dyadic/ 99 | gzip --decompress $TMP_DIR/dyadic/*.bed.gz 100 | $CIS_X_SEED_HOME/bin/merge_roadmap \ 101 | $TMP_DIR/enh \ 102 | $TMP_DIR/jul2013.roadmapData.qc.csv \ 103 | > $OUT_DIR/roadmapData.dyadic.merged.111.bed 104 | fi 105 | -------------------------------------------------------------------------------- /src/core/src/exp.check.R: -------------------------------------------------------------------------------- 1 | 2 | argv <- commandArgs(TRUE) 3 | 4 | sample <- argv[1] 5 | fpkm_in <- argv[2] 6 | ref_bi_in <- argv[3] 7 | ref_cohort_in <- argv[4] 8 | ref_white_in <- argv[5] 9 | outfile <- argv[6] 10 | precalt <- argv[7] 11 | 12 | fpkm.raw <- read.table(fpkm_in,sep="\t",header=T,row.names=2,quote="",stringsAsFactors=F) 13 | if (!sample %in% colnames(fpkm.raw)) { 14 | stop("sample not present in fpkm matrix.") 15 | } 16 | fpkm.sid <- fpkm.raw[,sample] 17 | names(fpkm.sid) <- rownames(fpkm.raw) 18 | 19 | ref.bi <- read.table(ref_bi_in,sep="\t",row.names=1,quote="",header=T,stringsAsFactors=F) 20 | ref.cohort <- read.table(ref_cohort_in,sep="\t",header=T,row.names=2,quote="",stringsAsFactors=F) 21 | ref.white <- read.table(ref_white_in,sep="\t",row.names=1,quote="",header=T,stringsAsFactors=F) 22 | ref.rawtval <- read.table(precalt,sep="\t",stringsAsFactors=F) 23 | ref.t <- ref.rawtval[,1] 24 | ref.t.total <- length(ref.t) 25 | 26 | out <- NULL 27 | for (i in 1:length(fpkm.sid)) { 28 | x.i <- fpkm.sid[i] 29 | x.i.raw <- x.i 30 | g.i <- names(fpkm.sid)[i] 31 | y.bi.raw <- NULL 32 | y.bi.sid <- NULL 33 | y.bi <- NULL 34 | t.bi <- NULL 35 | r.bi <- NULL 36 | p.bi <- NULL 37 | l.bi <- NULL 38 | t.bi.perc <- NULL 39 | y.cohort.raw <- NULL 40 | y.cohort.sid <- NULL 41 | y.cohort <- NULL 42 | t.cohort <- NULL 43 | r.cohort <- NULL 44 | p.cohort <- NULL 45 | l.cohort <- NULL 46 | t.cohort.perc <- NULL 47 | y.white.raw <- NULL 48 | y.white.sid <- NULL 49 | y.white <- NULL 50 | t.white <- NULL 51 | r.white <- NULL 52 | p.white <- NULL 53 | l.white <- NULL 54 | t.white.perc <- NULL 55 | if (x.i > 0) { 56 | x.i <- log10(x.i+0.1) 57 | if (g.i %in% rownames(ref.bi)) { 58 | y.bi.raw <- log10(as.numeric(unlist(strsplit(ref.bi[g.i,3],",",perl=T)))+0.1) 59 | y.bi.sid <- unlist(strsplit(ref.bi[g.i,2],",",perl=T)) 60 | if (sample %in% y.bi.sid) { 61 | y.bi <- y.bi.raw[!y.bi.sid %in% sample] 62 | }else { 63 | y.bi <- y.bi.raw 64 | } 65 | t.bi <- (x.i-mean(y.bi))/((1+(length(y.bi)-2)^-1)*(sd(y.bi)^2))^0.5 66 | p.bi <- pt(t.bi,length(y.bi)-2,lower.tail=F) 67 | r.bi <- length(y.bi[y.bi>x.i])+1 68 | l.bi <- length(y.bi) 69 | t.bi.perc <- (length(ref.t[ref.t>t.bi])+1)/ref.t.total 70 | } 71 | if (g.i %in% rownames(ref.white)) { 72 | y.white.raw <- log10(as.numeric(unlist(strsplit(ref.white[g.i,3],",",perl=T)))+0.1) 73 | y.white.sid <- unlist(strsplit(ref.white[g.i,2],",",perl=T)) 74 | if (sample %in% y.white.sid) { 75 | y.white <- y.white.raw[!y.white.sid %in% sample] 76 | }else { 77 | y.white <- y.white.raw 78 | } 79 | t.white <- (x.i-mean(y.white))/((1+(length(y.white)-2)^-1)*(sd(y.white)^2))^0.5 80 | p.white <- pt(t.white,length(y.white)-2,lower.tail=F) 81 | r.white <- length(y.white[y.white>x.i])+1 82 | l.white <- length(y.white) 83 | t.white.perc <- (length(ref.t[ref.t>t.white])+1)/ref.t.total 84 | } 85 | if (g.i %in% rownames(ref.cohort)) { 86 | y.cohort.raw <- log10(as.numeric(ref.cohort[g.i,7:ncol(ref.cohort)])+0.1) 87 | y.cohort.sid <- colnames(ref.cohort)[7:ncol(ref.cohort)] 88 | if (sample %in% y.cohort.sid) { 89 | y.cohort <- y.cohort.raw[!y.cohort.sid %in% sample] 90 | }else { 91 | y.cohort <- y.cohort.raw 92 | } 93 | t.cohort <- (x.i-mean(y.cohort))/((1+(length(y.cohort)-2)^-1)*(sd(y.cohort)^2))^0.5 94 | p.cohort <- pt(t.cohort,length(y.cohort)-2,lower.tail=F) 95 | r.cohort <- length(y.cohort[y.cohort>x.i])+1 96 | l.cohort <- length(y.cohort) 97 | t.cohort.perc <- (length(ref.t[ref.t>t.cohort])+1)/ref.t.total 98 | } 99 | if (is.null(p.bi)) {p.bi <- "na"} 100 | if (is.null(r.bi)) {r.bi <- "na"} 101 | if (is.null(l.bi)) {l.bi <- "na"} 102 | if (is.null(t.bi)) {t.bi <- "na"} 103 | if (is.null(t.bi.perc)) {t.bi.perc <- "na"} 104 | if (is.null(p.white)) {p.white <- "na"} 105 | if (is.null(r.white)) {r.white <- "na"} 106 | if (is.null(l.white)) {l.white <- "na"} 107 | if (is.null(t.white)) {t.white <- "na"} 108 | if (is.null(t.white.perc)) {t.white.perc <- "na"} 109 | if (is.null(p.cohort)) {p.cohort <- "na"} 110 | if (is.null(r.cohort)) {r.cohort <- "na"} 111 | if (is.null(l.cohort)) {l.cohort <- "na"} 112 | if (is.null(t.cohort)) {t.cohort <- "na"} 113 | if (is.null(t.cohort.perc)) {t.cohort.perc <- "na"} 114 | out <- rbind(out,c(g.i,x.i.raw,l.bi,p.bi,r.bi,t.bi,t.bi.perc,l.cohort,p.cohort,r.cohort,t.cohort,t.cohort.perc,l.white,p.white,r.white,t.white,t.white.perc)) 115 | } 116 | } 117 | colnames(out) <- c("Gene","fpkm.raw","size.bi","p.bi","rank.bi","tstatistic.bi","qval.bi","size.cohort","p.cohort","rank.cohort","tstatistic.cohort","qval.cohort","size.white","p.white","rank.white","tstatistic.white","qval.white") 118 | write.table(out,file=outfile,sep="\t",row.names=F,quote=F) 119 | -------------------------------------------------------------------------------- /src/core/src/scan.sv.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $ase_result = $ARGV[1]; 5 | my $ase_result_run = $ARGV[2]; 6 | my $sv_result = $ARGV[3]; 7 | my $output = $ARGV[4]; 8 | my $win = $ARGV[5]; 9 | my $refgene = $ARGV[6]; 10 | 11 | my (%genes,%chr2g,%candidates,%g2query); 12 | 13 | my $infile = $ase_result; 14 | open IN, "< $infile" or die "$infile: $!"; 15 | while() { 16 | chomp; 17 | next if $. == 1; 18 | my @F = split/\t/; 19 | $genes{$F[0]}{gsym} = $F[1]; 20 | $genes{$F[0]}{chrom} = $F[2]; 21 | $genes{$F[0]}{strand} = $F[3]; 22 | $genes{$F[0]}{start} = $F[4]; 23 | $genes{$F[0]}{end} = $F[5]; 24 | $chr2g{$F[2]}{$F[0]} = 1; 25 | $candidates{$F[1]} = 1; 26 | } 27 | close IN; 28 | 29 | $infile = $ase_result_run; 30 | open IN, "< $infile" or die "$infile: $!"; 31 | while() { 32 | chomp; 33 | next if $. == 1; 34 | my @F = split/\t/; 35 | if ($F[8]) { 36 | my @G = split(/,/,$F[8]); 37 | for my $g (@G) { 38 | next if $candidates{$g}; 39 | $g2query{$g}{tag} = 1; 40 | } 41 | } 42 | } 43 | close IN; 44 | 45 | $infile = $refgene; 46 | open IN, "< $infile" or die "$infile: $!"; 47 | while() { 48 | chomp; 49 | my @F = split/\t/; 50 | if ($g2query{$F[3]}) { 51 | my $len = $F[2] - $F[1]; 52 | if ($g2query{$F[3]}{tag} == 1 or ($len > $g2query{$F[3]}{len})) { 53 | $g2query{$F[3]}{acc} = $F[4]; 54 | $g2query{$F[3]}{chrom} = $F[0]; 55 | $g2query{$F[3]}{strand} = $F[5]; 56 | $g2query{$F[3]}{start} = $F[1]; 57 | $g2query{$F[3]}{end} = $F[2]; 58 | $g2query{$F[3]}{len} = $len; 59 | $g2query{$F[3]}{tag} = 2; 60 | }else { 61 | 1; 62 | } 63 | } 64 | } 65 | close IN; 66 | 67 | for my $g (sort keys %g2query) { 68 | if ($g2query{$g}{tag} == 1) { 69 | print "Error, $g not annotated.\n"; 70 | }else { 71 | $candidates{$g} = 1; 72 | my $acc = $g2query{$g}{acc}; 73 | $genes{$acc}{gsym} = $g; 74 | $genes{$acc}{chrom} = $g2query{$g}{chrom}; 75 | $genes{$acc}{strand} = $g2query{$g}{strand}; 76 | $genes{$acc}{start} = $g2query{$g}{start}; 77 | $genes{$acc}{end} = $g2query{$g}{end}; 78 | $chr2g{$g2query{$g}{chrom}}{$acc} = 1; 79 | } 80 | } 81 | 82 | $infile = $sv_result; 83 | open IN, "< $infile" or die "$infile: $!"; 84 | open OUT, "> $output" or die "$output: $!"; 85 | while() { 86 | chomp; 87 | if ($. == 1) { 88 | print OUT "gsym_left\tdist_left\tgsym_right\tdist_right\t$_\n"; 89 | next; 90 | } 91 | my @F = split/\t/; 92 | my $record = 0; 93 | my %left = (); 94 | my %right = (); 95 | ### check breakpoint on left. 96 | my $chrom = $F[0]; 97 | unless ($chrom =~ /^chr/) { 98 | $chrom = "chr" . $chrom; 99 | } 100 | my $pos = $F[1]; 101 | if ($chr2g{$chrom}) { 102 | my @g = keys %{$chr2g{$chrom}}; 103 | for my $g (@g) { 104 | if ($pos >= $genes{$g}{start}-$win and $pos <= $genes{$g}{end}+$win) { 105 | my $dist = 0; 106 | $record = 1; 107 | if ($genes{$g}{strand} eq "+") { 108 | $dist = $pos - $genes{$g}{start}; 109 | }else { 110 | $dist = $pos - $genes{$g}{end}; 111 | } 112 | $left{$genes{$g}{gsym}} = $dist; 113 | } 114 | } 115 | } 116 | ### check breakpoint on right. 117 | $chrom = $F[3]; 118 | unless ($chrom =~ /^chr/) { 119 | $chrom = "chr" . $chrom; 120 | } 121 | $pos = $F[4]; 122 | if ($chr2g{$chrom}) { 123 | my @g = keys %{$chr2g{$chrom}}; 124 | for my $g (@g) { 125 | if ($pos >= $genes{$g}{start}-$win and $pos <= $genes{$g}{end}+$win) { 126 | my $dist = 0; 127 | $record = 1; 128 | if ($genes{$g}{strand} eq "+") { 129 | $dist = $pos - $genes{$g}{start}; 130 | }else { 131 | $dist = $pos - $genes{$g}{end}; 132 | } 133 | $right{$genes{$g}{gsym}} = $dist; 134 | } 135 | } 136 | } 137 | next unless $record; 138 | my $left_gsym = ""; 139 | my $left_dist = ""; 140 | my $right_gsym = ""; 141 | my $right_dist = ""; 142 | for my $l (sort keys %left) { 143 | next unless $l; 144 | $left_gsym .= "$l,"; 145 | $left_dist .= "$left{$l},"; 146 | } 147 | for my $r (sort keys %right) { 148 | next unless $r; 149 | $right_gsym .= "$r,"; 150 | $right_dist .= "$right{$r},"; 151 | } 152 | $left_gsym =~ s/\,$//; 153 | $left_dist =~ s/\,$//; 154 | $right_gsym =~ s/\,$//; 155 | $right_dist =~ s/\,$//; 156 | print OUT "$left_gsym\t$left_dist\t$right_gsym\t$right_dist\t$_\n"; 157 | } 158 | close IN; 159 | close OUT; 160 | -------------------------------------------------------------------------------- /src/core/src/ase.candidate.byrun.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $thresh_fpkm = $ARGV[1]; 5 | my $thresh_loo_pvalue = $ARGV[2]; 6 | my $ase_result_run = $ARGV[3]; 7 | my $ohe_result = $ARGV[4]; 8 | my $outfile = $ARGV[5]; 9 | my $imprinting_genes = $ARGV[6]; 10 | my $oncogenes = $ARGV[7]; 11 | my $num_markers = $ARGV[8]; 12 | my $ase_result_gene = $ARGV[9]; 13 | my $thresh_ase_delta_di = $ARGV[10]; 14 | my $thresh_ase_delta_cnv = $ARGV[11]; 15 | 16 | my (%imprint,%g2loo,%glst,%oncog,%g2ase); 17 | 18 | my $infile = $imprinting_genes; 19 | open IN, "< $infile" or die "$infile: $!"; 20 | while() { 21 | chomp; 22 | next if $. == 1; 23 | my @F = split/\t/; 24 | $imprint{$F[0]} = $F[3]; 25 | } 26 | close IN; 27 | 28 | $infile = $oncogenes; 29 | open IN, "< $infile" or die "$infile: $!"; 30 | while() { 31 | chomp; 32 | next if $. == 1; 33 | $_ =~ s/\"//g; 34 | my @F = split/\t/; 35 | next unless $F[14]; 36 | next if $F[14] eq "TSG"; 37 | next if $F[0] eq "IGH" or $F[0] eq "IGK" or $F[0] eq "IGL" or $F[0] eq "HLA-A"; 38 | $oncog{$F[0]} = $F[14]; 39 | } 40 | close IN; 41 | 42 | $infile = $ase_result_gene; 43 | open IN, "< $infile" or die "$infile: $!"; 44 | while() { 45 | chomp; 46 | next if $. == 1; 47 | my @F = split/\t/; 48 | $g2ase{$F[1]}{pval} = $F[20]; ### using raw p here. 49 | $g2ase{$F[1]}{delta} = $F[19]; 50 | $g2ase{$F[1]}{tag} = $F[16]; 51 | } 52 | close IN; 53 | 54 | $infile = $ohe_result; 55 | open IN, "< $infile" or die "$infile: $!"; 56 | while() { 57 | chomp; 58 | next if $. == 1; 59 | my @F = split/\t/; 60 | $g2loo{$F[0]}{fpkm} = $F[1]; 61 | if ($F[12] ne "na") { 62 | $g2loo{$F[0]}{tval} = $F[15]; 63 | $g2loo{$F[0]}{tperc} = $F[16]; 64 | $g2loo{$F[0]}{rank} = $F[14]; 65 | $g2loo{$F[0]}{size} = $F[12]; 66 | $g2loo{$F[0]}{source} = "white_list"; 67 | }elsif ($F[2] ne "na") { 68 | $g2loo{$F[0]}{tval} = $F[5]; 69 | $g2loo{$F[0]}{tperc} = $F[6]; 70 | $g2loo{$F[0]}{rank} = $F[4]; 71 | $g2loo{$F[0]}{size} = $F[2]; 72 | $g2loo{$F[0]}{source} = "bi_cohort"; 73 | }else { 74 | $g2loo{$F[0]}{tval} = $F[10]; 75 | $g2loo{$F[0]}{tperc} = $F[11]; 76 | $g2loo{$F[0]}{rank} = $F[9]; 77 | $g2loo{$F[0]}{size} = $F[7]; 78 | $g2loo{$F[0]}{source} = "entire_cohort"; 79 | } 80 | } 81 | close IN; 82 | 83 | $infile = $ase_result_run; 84 | open IN, "< $infile" or die "$infile: $!"; 85 | open OUT, "> $outfile" or die "$outfile: $!"; 86 | while() { 87 | chomp; 88 | if ($. == 1) { 89 | print OUT "$_\tCandidates\n"; 90 | next; 91 | } 92 | my @F = split/\t/; 93 | next unless $F[5] >= $num_markers; 94 | if ($F[7]) { 95 | my @G = split(/,/,$F[7]); 96 | my $candidates = ""; 97 | for my $g (@G) { 98 | if ($imprint{$g} and ($imprint{$g} eq "Imprinted")) { 99 | next; 100 | } 101 | next unless $g2loo{$g}; 102 | my $keep = 0; 103 | if ($g2loo{$g}{fpkm} >= $thresh_fpkm and $g2loo{$g}{tperc} < $thresh_loo_pvalue) { 104 | $keep = 1; 105 | }elsif ($oncog{$g} and $g2loo{$g}{fpkm} >= 1 and $g2loo{$g}{tperc} < $thresh_loo_pvalue) { 106 | $keep = 1; 107 | }else { 108 | next; 109 | } 110 | if ($keep == 1) { 111 | if ($g2ase{$g}) { 112 | my @tag = split(/,/,$g2ase{$g}{tag}); 113 | my $tagnum = scalar(@tag); 114 | my $tagcnv = 0; 115 | for my $t (@tag) { 116 | if ($t eq "cnvloh") { 117 | $tagcnv++; 118 | } 119 | } 120 | my $class = "diploid"; 121 | if ($tagcnv/$tagnum > 0.3) { 122 | $class = "cnvloh"; 123 | } 124 | if ($class eq "diploid") { 125 | if ($g2ase{$g}{pval} < 0.1 and $g2ase{$g}{delta} >= $thresh_ase_delta_di) { 126 | $candidates .= "$g,"; 127 | } 128 | }elsif ($class eq "cnvloh") { 129 | if ($g2ase{$g}{pval} < 0.1 and $g2ase{$g}{delta} >= $thresh_ase_delta_cnv) { 130 | $candidates .= "$g,"; 131 | } 132 | }else { 133 | print "Wrong class for $class.\n"; 134 | next; 135 | } 136 | }else { 137 | $candidates .= "$g,"; 138 | } 139 | } 140 | } 141 | $candidates =~ s/\,$//; 142 | print OUT "$_\t$candidates\n"; 143 | }else { 144 | print OUT "$_\t\n"; 145 | } 146 | } 147 | close IN; 148 | close OUT; 149 | -------------------------------------------------------------------------------- /src/core/src/check.TAD.cnv.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | ### to be consistant with previous filter, the promoter was used. 4 | ### promoter was defined as 2kb upstream and 200bp downstream of tss, as in doi:10.1038/ng.3101 5 | #### TAD was combined hESC and IMR90 from Bing Ren's paper. 6 | 7 | my $sampleid = $ARGV[0]; 8 | my $tad_ref = $ARGV[1]; 9 | my $refgene = $ARGV[2]; 10 | my $input = $ARGV[3]; 11 | my $output = $ARGV[4]; 12 | 13 | my (%tad,%g2pro); 14 | 15 | my $infile = $tad_ref; 16 | open IN, "< $infile" or die "$infile: $!"; 17 | while() { 18 | chomp; 19 | my @F = split/\t/; 20 | my $id = "$F[0].$F[1].$F[2]"; 21 | $tad{$F[0]}{$id}{start} = $F[1]; 22 | $tad{$F[0]}{$id}{end} = $F[2]; 23 | $tad{$F[0]}{$id}{source} = "hESC"; 24 | } 25 | close IN; 26 | 27 | $infile = $refgene; 28 | open IN, "< $infile" or die "$infile: $!"; 29 | while() { 30 | chomp; 31 | my @F = split/\t/; 32 | next if $F[0] =~ /_random/; 33 | next if $F[0] =~ /_hap/; 34 | next if $F[0] =~ /chrUn/; 35 | if ($F[5] eq "+") { 36 | $g2pro{$F[3]}{$F[4]}{start} = $F[1] - 2000; 37 | $g2pro{$F[3]}{$F[4]}{end} = $F[1] + 200; 38 | $g2pro{$F[3]}{$F[4]}{chrom} = $F[0]; 39 | }elsif ($F[5] eq "-") { 40 | $g2pro{$F[3]}{$F[4]}{end} = $F[2] + 2000; 41 | $g2pro{$F[3]}{$F[4]}{start} = $F[2] - 200; 42 | $g2pro{$F[3]}{$F[4]}{chrom} = $F[0]; 43 | }else { 44 | print "Wrong strand info: $F[5] for $F[4].\n"; 45 | } 46 | } 47 | close IN; 48 | 49 | $infile = $input; 50 | my $outfile = $output; 51 | open OUT, "> $outfile" or die "$outfile: $!"; 52 | open IN, "< $infile" or die "$infile: $!"; 53 | while() { 54 | chomp; 55 | my @F = split/\t/; 56 | if ($. == 1) { 57 | print OUT "candidate.inTAD"; 58 | for my $i (2 .. $#F) { 59 | print OUT "\t$F[$i]"; 60 | } 61 | print OUT "\n"; 62 | next; 63 | } 64 | my %targets = (); 65 | my $target = ""; 66 | if ($F[0]) { 67 | my $chrom = $F[2]; 68 | unless ($chrom =~ /^chr/) { 69 | $chrom = "chr" . $chrom; 70 | } 71 | my $pos_l = $F[3]; 72 | my $pos_r = $F[4]; 73 | my @g = split(/,/,$F[0]); 74 | for my $tad (sort keys %{$tad{$chrom}}) { 75 | if ($pos_l <= $tad{$chrom}{$tad}{end} and $pos_l >= $tad{$chrom}{$tad}{start}) { 76 | for my $g (@g) { 77 | my $overlap = 0; 78 | if ($g2pro{$g}) { 79 | for my $acc (sort keys %{$g2pro{$g}}) { 80 | if ($g2pro{$g}{$acc}{chrom} ne $chrom) { 81 | print "Wrong chromosome for gene $g.\n"; 82 | }else { 83 | if ($g2pro{$g}{$acc}{start} > $tad{$chrom}{$tad}{end}) { 84 | 1; 85 | }elsif ($g2pro{$g}{$acc}{start} >= $tad{$chrom}{$tad}{start}) { 86 | $overlap = 1; 87 | }elsif ($g2pro{$g}{$acc}{end} >= $tad{$chrom}{$tad}{start}) { 88 | $overlap = 1; 89 | }else { 90 | 1; 91 | } 92 | } 93 | } 94 | }else { 95 | print "No promoter info for $g.\n"; 96 | } 97 | if ($overlap == 1) { 98 | $targets{$g} = 1; 99 | } 100 | } 101 | } 102 | if ($pos_r <= $tad{$chrom}{$tad}{end} and $pos_r >= $tad{$chrom}{$tad}{start}) { 103 | for my $g (@g) { 104 | my $overlap = 0; 105 | if ($g2pro{$g}) { 106 | for my $acc (sort keys %{$g2pro{$g}}) { 107 | if ($g2pro{$g}{$acc}{chrom} ne $chrom) { 108 | print "Wrong chromosome for gene $g.\n"; 109 | }else { 110 | if ($g2pro{$g}{$acc}{start} > $tad{$chrom}{$tad}{end}) { 111 | 1; 112 | }elsif ($g2pro{$g}{$acc}{start} >= $tad{$chrom}{$tad}{start}) { 113 | $overlap = 1; 114 | }elsif ($g2pro{$g}{$acc}{end} >= $tad{$chrom}{$tad}{start}) { 115 | $overlap = 1; 116 | }else { 117 | 1; 118 | } 119 | } 120 | } 121 | }else { 122 | print "No promoter info for $g.\n"; 123 | } 124 | if ($overlap == 1) { 125 | $targets{$g} = 1; 126 | } 127 | } 128 | } 129 | } 130 | } 131 | my @targets = sort keys %targets; 132 | $target = join(',',@targets); 133 | if ($target) { 134 | print OUT "$target"; 135 | for my $i (2 .. $#F) { 136 | print OUT "\t$F[$i]"; 137 | } 138 | print OUT "\n"; 139 | } 140 | } 141 | close IN; 142 | close OUT; 143 | 144 | -------------------------------------------------------------------------------- /src/core/src/scan.cnv.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $ase_result = $ARGV[1]; 5 | my $ase_result_run = $ARGV[2]; 6 | my $cnv_result = $ARGV[3]; 7 | my $output = $ARGV[4]; 8 | my $win = $ARGV[5]; 9 | my $size = $ARGV[6]; 10 | my $refgene = $ARGV[7]; 11 | my $perc_overlap = $ARGV[8]; 12 | 13 | my (%genes,%chr2g,%candidates,%g2query); 14 | 15 | my $infile = $ase_result; 16 | open IN, "< $infile" or die "$infile: $!"; 17 | while() { 18 | chomp; 19 | next if $. == 1; 20 | my @F = split/\t/; 21 | $genes{$F[0]}{gsym} = $F[1]; 22 | $genes{$F[0]}{chrom} = $F[2]; 23 | $genes{$F[0]}{strand} = $F[3]; 24 | $genes{$F[0]}{start} = $F[4]; 25 | $genes{$F[0]}{end} = $F[5]; 26 | $chr2g{$F[2]}{$F[0]} = 1; 27 | $candidates{$F[1]} = 1; 28 | } 29 | close IN; 30 | 31 | $infile = $ase_result_run; 32 | open IN, "< $infile" or die "$infile: $!"; 33 | while() { 34 | chomp; 35 | next if $. == 1; 36 | my @F = split/\t/; 37 | if ($F[8]) { 38 | my @G = split(/,/,$F[8]); 39 | for my $g (@G) { 40 | next if $candidates{$g}; 41 | $g2query{$g}{tag} = 1; 42 | } 43 | } 44 | } 45 | close IN; 46 | 47 | $infile = $refgene; 48 | open IN, "< $infile" or die "$infile: $!"; 49 | while() { 50 | chomp; 51 | my @F = split/\t/; 52 | if ($g2query{$F[3]}) { 53 | my $len = $F[2] - $F[1]; 54 | if ($g2query{$F[3]}{tag} == 1 or ($len > $g2query{$F[3]}{len})) { 55 | $g2query{$F[3]}{acc} = $F[4]; 56 | $g2query{$F[3]}{chrom} = $F[0]; 57 | $g2query{$F[3]}{strand} = $F[5]; 58 | $g2query{$F[3]}{start} = $F[1]; 59 | $g2query{$F[3]}{end} = $F[2]; 60 | $g2query{$F[3]}{len} = $len; 61 | $g2query{$F[3]}{tag} = 2; 62 | }else { 63 | 1; 64 | } 65 | } 66 | } 67 | close IN; 68 | 69 | for my $g (sort keys %g2query) { 70 | if ($g2query{$g}{tag} == 1) { 71 | print "Error, $g not annotated.\n"; 72 | }else { 73 | $candidates{$g} = 1; 74 | my $acc = $g2query{$g}{acc}; 75 | $genes{$acc}{gsym} = $g; 76 | $genes{$acc}{chrom} = $g2query{$g}{chrom}; 77 | $genes{$acc}{strand} = $g2query{$g}{strand}; 78 | $genes{$acc}{start} = $g2query{$g}{start}; 79 | $genes{$acc}{end} = $g2query{$g}{end}; 80 | $chr2g{$g2query{$g}{chrom}}{$acc} = 1; 81 | } 82 | } 83 | 84 | $infile = $cnv_result; 85 | open IN, "< $infile" or die "$infile: $!"; 86 | open OUT, "> $output" or die "$output: $!"; 87 | while() { 88 | chomp; 89 | if ($. == 1) { 90 | print OUT "gsym\tdist\t$_\n"; 91 | next; 92 | } 93 | my $record = 0; 94 | my @F = split/\t/; 95 | my $length = $F[2] - $F[1]; 96 | next unless $length <= $size; 97 | my %target = (); 98 | ### check for intersection. 99 | my $chrom = $F[0]; 100 | unless ($chrom =~ /^chr/) { 101 | $chrom = "chr" . $chrom; 102 | } 103 | my $pos_left = $F[1]; 104 | my $pos_right = $F[2]; 105 | # my $left_pos = $F[1]; 106 | # my $right_pos = $F[2]; 107 | if ($chr2g{$chrom}) { 108 | my @g = keys %{$chr2g{$chrom}}; 109 | for my $g (@g) { 110 | if ($pos_left > $genes{$g}{end}+$win) { 111 | 1; 112 | }elsif ($pos_right < $genes{$g}{start}-$win) { 113 | 1; 114 | }else { 115 | my $overlap = 0; 116 | my $glen = $genes{$g}{end} - $genes{$g}{start}; 117 | if ($pos_left <= $genes{$g}{start} and $pos_right >= $genes{$g}{start}) { 118 | if ($pos_right < $genes{$g}{end}) { 119 | $overlap = $pos_right - $genes{$g}{start}; 120 | }else { 121 | $overlap = $genes{$g}{end} - $genes{$g}{start}; 122 | } 123 | }elsif ($pos_left >= $genes{$g}{start} and $pos_left <= $genes{$g}{end}) { 124 | if ($pos_right <= $genes{$g}{end}) { 125 | $overlap = $pos_right - $pos_left; 126 | }else { 127 | $overlap = $genes{$g}{end} - $pos_left; 128 | } 129 | } 130 | if ($overlap/$glen < $perc_overlap) { 131 | $record = 1; 132 | my $dist = 0; 133 | if ($genes{$g}{strand} eq "+") { 134 | $dist = $pos_left - $genes{$g}{start}; 135 | $dist = $pos_right - $genes{$g}{start} if (abs($pos_right - $genes{$g}{start}) < abs($pos_left - $genes{$g}{start})); 136 | }else { 137 | $dist = $pos_left - $genes{$g}{end}; 138 | $dist = $pos_right - $genes{$g}{end} if (abs($pos_right - $genes{$g}{end}) < abs($pos_left - $genes{$g}{end})); 139 | } 140 | $target{$genes{$g}{gsym}} = $dist; 141 | } 142 | } 143 | } 144 | } 145 | next unless $record; 146 | my $target_gsym = ""; 147 | my $target_dist = ""; 148 | for my $t (sort keys %target) { 149 | next unless $t; 150 | $target_gsym .= "$t,"; 151 | $target_dist .= "$target{$t},"; 152 | } 153 | $target_gsym =~ s/\,$//; 154 | $target_dist =~ s/\,$//; 155 | print OUT "$target_gsym\t$target_dist\t$_\n"; 156 | } 157 | close IN; 158 | close OUT; 159 | 160 | -------------------------------------------------------------------------------- /src/core/src/check.TAD.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | ### to be consistant with previous filter, the promoter was used. 4 | ### promoter was defined as 2kb upstream and 200bp downstream of tss, as in doi:10.1038/ng.3101 5 | #### TAD was combined hESC and IMR90 from Bing Ren's paper. 6 | 7 | my $sampleid = $ARGV[0]; 8 | my $tad_ref = $ARGV[1]; 9 | my $refgene = $ARGV[2]; 10 | my $input = $ARGV[3]; 11 | my $output = $ARGV[4]; 12 | 13 | my (%tad,%g2pro); 14 | 15 | my $infile = $tad_ref; 16 | open IN, "< $infile" or die "$infile: $!"; 17 | while() { 18 | chomp; 19 | my @F = split/\t/; 20 | my $id = "$F[0].$F[1].$F[2]"; 21 | $tad{$F[0]}{$id}{start} = $F[1]; 22 | $tad{$F[0]}{$id}{end} = $F[2]; 23 | $tad{$F[0]}{$id}{source} = "hESC"; 24 | } 25 | close IN; 26 | 27 | $infile = $refgene; 28 | open IN, "< $infile" or die "$infile: $!"; 29 | while() { 30 | chomp; 31 | my @F = split/\t/; 32 | next if $F[0] =~ /_random/; 33 | next if $F[0] =~ /_hap/; 34 | next if $F[0] =~ /chrUn/; 35 | if ($F[5] eq "+") { 36 | $g2pro{$F[3]}{$F[4]}{start} = $F[1] - 2000; 37 | $g2pro{$F[3]}{$F[4]}{end} = $F[1] + 200; 38 | $g2pro{$F[3]}{$F[4]}{chrom} = $F[0]; 39 | }elsif ($F[5] eq "-") { 40 | $g2pro{$F[3]}{$F[4]}{end} = $F[2] + 2000; 41 | $g2pro{$F[3]}{$F[4]}{start} = $F[2] - 200; 42 | $g2pro{$F[3]}{$F[4]}{chrom} = $F[0]; 43 | }else { 44 | print "Wrong strand info: $F[5] for $F[4].\n"; 45 | } 46 | } 47 | close IN; 48 | 49 | $infile = $input; 50 | my $outfile = $output; 51 | open OUT, "> $outfile" or die "$outfile: $!"; 52 | open IN, "< $infile" or die "$infile: $!"; 53 | while() { 54 | chomp; 55 | my @F = split/\t/; 56 | if ($. == 1) { 57 | print OUT "left.candidate.inTAD\tright.candidate.inTAD"; 58 | for my $i (4 .. $#F) { 59 | print OUT "\t$F[$i]"; 60 | } 61 | print OUT "\n"; 62 | next; 63 | } 64 | my $left = ""; 65 | my $right = ""; 66 | if ($F[0]) { 67 | my $chrom = $F[4]; 68 | unless ($chrom =~ /^chr/) { 69 | $chrom = "chr" . $chrom; 70 | } 71 | my $pos = $F[5]; 72 | my @g = split(/,/,$F[0]); 73 | for my $tad (sort keys %{$tad{$chrom}}) { 74 | if ($pos <= $tad{$chrom}{$tad}{end} and $pos >= $tad{$chrom}{$tad}{start}) { 75 | for my $g (@g) { 76 | my $overlap = 0; 77 | if ($g2pro{$g}) { 78 | for my $acc (sort keys %{$g2pro{$g}}) { 79 | if ($g2pro{$g}{$acc}{chrom} ne $chrom) { 80 | print "Wrong chromosome for gene $g.\n"; 81 | }else { 82 | if ($g2pro{$g}{$acc}{start} > $tad{$chrom}{$tad}{end}) { 83 | 1; 84 | }elsif ($g2pro{$g}{$acc}{start} >= $tad{$chrom}{$tad}{start}) { 85 | $overlap = 1; 86 | }elsif ($g2pro{$g}{$acc}{end} >= $tad{$chrom}{$tad}{start}) { 87 | $overlap = 1; 88 | }else { 89 | 1; 90 | } 91 | } 92 | } 93 | }else { 94 | print "No promoter info for $g.\n"; 95 | } 96 | if ($overlap == 1) { 97 | $left .= "$g,"; 98 | } 99 | } 100 | } 101 | } 102 | } 103 | if ($F[2]) { 104 | my $chrom = $F[7]; 105 | unless ($chrom =~ /^chr/) { 106 | $chrom = "chr" . $chrom; 107 | } 108 | my $pos = $F[8]; 109 | my @g = split(/,/,$F[2]); 110 | for my $tad (sort keys %{$tad{$chrom}}) { 111 | if ($pos <= $tad{$chrom}{$tad}{end} and $pos >= $tad{$chrom}{$tad}{start}) { 112 | for my $g (@g) { 113 | my $overlap = 0; 114 | if ($g2pro{$g}) { 115 | for my $acc (sort keys %{$g2pro{$g}}) { 116 | if ($g2pro{$g}{$acc}{chrom} ne $chrom) { 117 | print "Wrong chromosome for gene $g.\n"; 118 | }else { 119 | if ($g2pro{$g}{$acc}{start} > $tad{$chrom}{$tad}{end}) { 120 | 1; 121 | }elsif ($g2pro{$g}{$acc}{start} >= $tad{$chrom}{$tad}{start}) { 122 | $overlap = 1; 123 | }elsif ($g2pro{$g}{$acc}{end} >= $tad{$chrom}{$tad}{start}) { 124 | $overlap = 1; 125 | }else { 126 | 1; 127 | } 128 | } 129 | } 130 | }else { 131 | print "No promoter info for $g.\n"; 132 | } 133 | if ($overlap == 1) { 134 | $right .= "$g,"; 135 | } 136 | } 137 | } 138 | } 139 | } 140 | $left =~ s/\,$//; 141 | $right =~ s/\,$//; 142 | if ($left or $right) { 143 | print OUT "$left\t$right"; 144 | for my $i (4 .. $#F) { 145 | print OUT "\t$F[$i]"; 146 | } 147 | print OUT "\n"; 148 | } 149 | } 150 | close IN; 151 | close OUT; 152 | 153 | -------------------------------------------------------------------------------- /dnanexus/cis-x/dxapp.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cis-x", 3 | "title": "St. Jude cis-X", 4 | "summary": "Search for activating regulatory variants in the tumor genome", 5 | "dxapi": "1.0.0", 6 | "version": "1.5.0-2", 7 | "openSource": true, 8 | "details": { 9 | "upstreamUrl": "https://github.com/stjude/cis-x", 10 | "upstreamVersion": "1.5.0", 11 | "upstreamLicenses": ["Apache-2.0"] 12 | }, 13 | "inputSpec": [ 14 | { 15 | "name": "sample_id", 16 | "label": "Sample ID", 17 | "class": "string" 18 | }, 19 | { 20 | "name": "markers", 21 | "label": "Single nucleotide variants", 22 | "class": "file", 23 | "help": "Tab-delimited text file containing raw sequence variants" 24 | }, 25 | { 26 | "name": "cnv_loh", 27 | "label": "CNV/LOH regions", 28 | "class": "file", 29 | "help": "Tab-delimited text file containing any aneuploidy region existing in the tumor genome under analysis" 30 | }, 31 | { 32 | "name": "bam", 33 | "label": "RNA-seq BAM", 34 | "class": "file" 35 | }, 36 | { 37 | "name": "bai", 38 | "label": "RNA-seq BAM index", 39 | "class": "file" 40 | }, 41 | { 42 | "name": "fpkm_matrix", 43 | "label": "Gene expression table", 44 | "class": "file", 45 | "help": "Tab-delimited text file containing gene level expressions for the tumor under analysis in FPKM (fragments per kilobase of transcript per million mapped reads)" 46 | }, 47 | { 48 | "name": "snv_indel", 49 | "label": "Somatic SNV/indel list", 50 | "class": "file", 51 | "help": "Tab-delimited file containing somatic SNV/indels in the tumor genome" 52 | }, 53 | { 54 | "name": "sv", 55 | "label": "Somatic SV", 56 | "class": "file", 57 | "help": "Tab-delimited file containing somatic acquired structural variants in the tumor genome" 58 | }, 59 | { 60 | "name": "cna", 61 | "label": "Somatic CNV", 62 | "class": "file", 63 | "help": "Tab-delimited file containing copy number aberrations in the tumor genome" 64 | }, 65 | { 66 | "name": "disease", 67 | "label": "Disease subtype", 68 | "class": "string", 69 | "choices": ["NBL", "TALL"] 70 | }, 71 | { 72 | "name": "cnv_loh_action", 73 | "label": "CNV/LOH action", 74 | "class": "string", 75 | "choices": ["keep", "drop"], 76 | "default": "keep" 77 | }, 78 | { 79 | "name": "min_coverage_wgs", 80 | "label": "Minimum coverage in WGS", 81 | "class": "int", 82 | "default": 10 83 | }, 84 | { 85 | "name": "min_coverage_rna_seq", 86 | "label": "Minimum coverage in WGS", 87 | "class": "int", 88 | "default": 10 89 | }, 90 | { 91 | "name": "fpkm_threshold_candidate", 92 | "label": "Candidate FPKM threshold", 93 | "class": "float", 94 | "default": 5.0 95 | }, 96 | { 97 | "name": "user_annotation", 98 | "label": "User annotations", 99 | "class": "file", 100 | "optional": true 101 | }, 102 | { 103 | "name": "chr_string", 104 | "label": "Reference sequence names have 'chr' prefix", 105 | "class": "string", 106 | "choices": ["TRUE", "FALSE"], 107 | "default": "TRUE" 108 | }, 109 | { 110 | "name": "tad_info", 111 | "label": "TAD annotations", 112 | "class": "file", 113 | "optional": true 114 | } 115 | ], 116 | "outputSpec": [ 117 | { 118 | "name": "cis_activated_candidates", 119 | "label": "cis-activated candidates", 120 | "class": "file", 121 | "help": "cis-activated candidates in the tumor genome under analysis" 122 | }, 123 | { 124 | "name": "sv_candidates", 125 | "label": "SV candidates", 126 | "class": "file", 127 | "help": "Structural variants (SV) candidates predicted as the causal for the cis-activated genes in the regulatory territory" 128 | }, 129 | { 130 | "name": "cna_candidates", 131 | "label": "CNA candidates", 132 | "class": "file", 133 | "help": "Copy number aberrations (CNA) predicted as the causal for the cis-activated genes in the regulatory territory" 134 | }, 135 | { 136 | "name": "snv_indel_candidates", 137 | "label": "SNV/indel candidates", 138 | "class": "file", 139 | "help": "SNV/indel candidates predicted as functional and predicted transcription factors" 140 | }, 141 | { 142 | "name": "ohe_results", 143 | "label": "OHE results", 144 | "class": "file", 145 | "help": "Raw outlier high expression (OHE) results" 146 | }, 147 | { 148 | "name": "ase_gene_results", 149 | "label": "Gene level ASE results", 150 | "class": "file", 151 | "help": "Raw gene level allelic specific expression (ASE) results" 152 | }, 153 | { 154 | "name": "ase_marker_results", 155 | "label": "Single marker ASE results", 156 | "class": "file", 157 | "help": "Raw single marker allelic specific expression (ASE) results" 158 | } 159 | ], 160 | "runSpec": { 161 | "timeoutPolicy": { 162 | "*": { 163 | "hours": 48 164 | } 165 | }, 166 | "interpreter": "bash", 167 | "distribution": "Ubuntu", 168 | "release": "16.04", 169 | "version": "1", 170 | "file": "src/cis-x.sh" 171 | }, 172 | "regionalOptions": { 173 | "azure:westus": { 174 | "systemRequirements": { 175 | "*": { 176 | "instanceType": "azure:mem1_ssd1_x4" 177 | } 178 | } 179 | } 180 | }, 181 | "access": { 182 | "allProjects": "VIEW", 183 | "project": "CONTRIBUTE" 184 | }, 185 | "authorizedUsers": [ 186 | "org-stjude_cloud", 187 | "org-stjude_cloud_app_runners" 188 | ] 189 | } 190 | -------------------------------------------------------------------------------- /src/core/bin/cis-X-run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -e 4 | 5 | CIS_X_HOME=$(realpath $(dirname $0)/../../..) 6 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..) 7 | PATH=$CIS_X_CORE_HOME/bin:$PATH 8 | 9 | usage() { 10 | basename $0 11 | echo 12 | echo "USAGE:" 13 | echo " cis-X run -s -o -l -g -b -e -m -v -c -d -a -w -r -f -u -h -t " 14 | echo 15 | echo "ARGS:" 16 | echo " -s Sample ID" 17 | echo " -o Output directory" 18 | echo " -l Path to single nucleotide markers" 19 | echo " -g Path to CNV/LOH regions" 20 | echo " -b Path to a RNA-Seq BAM (index must be in same directory)" 21 | echo " -e Path to gene expression table" 22 | echo " -m Path to somatic SNV/indels" 23 | echo " -v Path to somatic SVs" 24 | echo " -c Path to somatic CNVs" 25 | echo " -d Disease name" 26 | echo " -a Action of markers in CNV/LOH regions, either keep or drop (default=keep)" 27 | echo " -w Minimal coverage in WGS to include a heterozygous marker (default=10)" 28 | echo " -r Minimal coverage in RNA-seq to include a heterozygous marker (default=10)" 29 | echo " -f FPKM threshold for nominate cis-activated candidate (default=5)" 30 | echo " -u User applied annotation file in BED format (default=NotSpecified)" 31 | echo " -h if the RNA-seq BAM with 'chr' in name, TRUE|FALSE (default=TRUE)" 32 | echo " -t Path to the TAD annotation file in BED format in hg19 (default=hESC)" 33 | } 34 | 35 | CNV_LOH_ACTION=keep 36 | COVG_WGS=10 37 | COVG_RNA=10 38 | THRESH_CANDIDATE_FPKM=5 39 | ANNO_USER=NotSpecified 40 | CHR_STRING=TRUE 41 | TAD_INFO=hESC 42 | 43 | while getopts s:o:l:g:b:e:m:v:c:d:a:w:r:f:u:h:t: option 44 | do 45 | case "${option}" 46 | in 47 | s) SAMPLE_ID=${OPTARG};; 48 | o) ROOTDIR=${OPTARG};; 49 | l) HIGH20=${OPTARG};; 50 | g) CNV_LOH=${OPTARG};; 51 | b) RNABAM=${OPTARG};; 52 | e) FPKM_MATRIX=${OPTARG};; 53 | m) SNVINDEL_IN=${OPTARG};; 54 | v) SV_IN=${OPTARG};; 55 | c) CNA_IN=${OPTARG};; 56 | d) DISEASE=${OPTARG};; 57 | a) CNV_LOH_ACTION=${OPTARG};; 58 | w) COVG_WGS=${OPTARG};; 59 | r) COVG_RNA=${OPTARG};; 60 | f) THRESH_CANDIDATE_FPKM=${OPTARG};; 61 | u) ANNO_USER=${OPTARG};; 62 | h) CHR_STRING=${OPTARG};; 63 | t) TAD_INFO=${OPTARG};; 64 | esac 65 | done 66 | 67 | echo "INFO: $(date): cis-X-run: checking parameters" 68 | if [[ ! $SAMPLE_ID ]]; then 69 | echo "ERROR: sample-id not specified." 70 | usage 71 | exit 1 72 | elif [[ ! $ROOTDIR ]]; then 73 | echo "ERROR: results-dir not specified." 74 | usage 75 | exit 1 76 | elif [[ ! -f $HIGH20 ]]; then 77 | echo "ERROR: marker file $HIGH20 not exist." 78 | usage 79 | exit 1 80 | elif [[ ! -f $CNV_LOH ]]; then 81 | echo "ERROR: cnv-loh file $CNV_LOH not exist." 82 | usage 83 | exit 1 84 | elif [[ ! -f $RNABAM ]]; then 85 | echo "ERROR: RNA-seq BAM file $RNABAM not exist." 86 | usage 87 | exit 1 88 | elif [[ ! -f $FPKM_MATRIX ]]; then 89 | echo "ERROR: fpkm-matrix file $FPKM_MATRIX not exist." 90 | usage 91 | exit 1 92 | elif [[ ! -f $SNVINDEL_IN ]]; then 93 | echo "ERROR: snv-indel list file $SNVINDEL_IN not exist." 94 | usage 95 | exit 1 96 | elif [[ ! -f $SV_IN ]]; then 97 | echo "ERROR: sv list file $SV_IN not exist." 98 | usage 99 | exit 1 100 | elif [[ ! -f $CNA_IN ]]; then 101 | echo "ERROR: cna list file $CNA_IN not exist." 102 | usage 103 | exit 1 104 | elif [[ ! $DISEASE ]]; then 105 | echo "ERROR: disease type not specified." 106 | usage 107 | exit 1 108 | elif [[ $ANNO_USER != NotSpecified && ! -f $ANNO_USER ]]; then 109 | echo "ERROR: user specified annotation file $ANNO_USER not exist." 110 | usage 111 | exit 1 112 | elif [[ $TAD_INFO != hESC && ! -f $TAD_INFO ]]; then 113 | echo "ERROR: user specified TAD annotation file $TAD_INFO not exist." 114 | usage 115 | exit 1 116 | fi 117 | 118 | if [[ $TAD_INFO != hESC ]]; then 119 | TAD=$TAD_INFO 120 | else 121 | TAD=$CIS_X_HOME/refs/external/hESC.combined.domain.hg19.bed 122 | fi 123 | 124 | #SAMPLE_ID=$1 125 | #ROOTDIR=$2 126 | #HIGH20=$3 127 | #CNV_LOH=$4 128 | #RNABAM=$5 129 | #FPKM_MATRIX=$6 130 | #SNVINDEL_IN=$7 131 | #SV_IN=$8 132 | #CNA_IN=$9 133 | #DISEASE=${10} 134 | #CNV_LOH_ACTION=${11} 135 | #COVG_WGS=${12} 136 | #COVG_RNA=${13} 137 | #THRESH_CANDIDATE_FPKM=${14} 138 | 139 | echo "INFO: $(date): cis-X-run: start" 140 | 141 | WORKDIR=$ROOTDIR/$SAMPLE_ID/working_space 142 | mkdir -p $WORKDIR 143 | cd $WORKDIR 144 | 145 | SNV4_OUT="$WORKDIR/$SAMPLE_ID.snv4.txt" 146 | HET_OUT="$WORKDIR/$SAMPLE_ID.heterozygous.markers.txt" 147 | 148 | echo "INFO: $(date): cis-X-run: getting markers" 149 | cis-X-mark $SAMPLE_ID $HIGH20 $CNV_LOH $SNV4_OUT $HET_OUT $COVG_WGS 150 | 151 | echo "INFO: $(date): cis-X-run: building matrices" 152 | MATRIX_OUT="$WORKDIR/matrix_combined_matrix_simple.tab" 153 | cis-X-build-matrix $RNABAM $SNV4_OUT $WORKDIR $MATRIX_OUT $CHR_STRING 154 | 155 | echo "INFO: $(date): cis-X-run: running allelic specific expression tests" 156 | ASE_RESULT_MARKER="$WORKDIR/$SAMPLE_ID.ase.combine.WGS.RNAseq.goodmarkers.binom.txt" 157 | ASE_RESULT_GENE="$WORKDIR/$SAMPLE_ID.ase.gene.model.fdr.txt" 158 | ASE_RESULT_RUN="$WORKDIR/$SAMPLE_ID.ase.candidates.runs.txt" 159 | cis-X-ase $SAMPLE_ID $WORKDIR $HET_OUT $MATRIX_OUT $ASE_RESULT_MARKER $ASE_RESULT_GENE $ASE_RESULT_RUN $CNV_LOH_ACTION $COVG_RNA 160 | 161 | echo "INFO: $(date): cis-X-run: running outlier high expression tests" 162 | OHE_RESULT="$WORKDIR/$SAMPLE_ID.OHE.results.txt" 163 | cis-X-test-outliers $SAMPLE_ID $DISEASE $FPKM_MATRIX $OHE_RESULT 164 | 165 | echo "INFO: $(date): cis-X-run: nominating candidates" 166 | CANDIDATES_RESULT="$WORKDIR/$SAMPLE_ID.cisActivated.candidates.txt" 167 | CANDIDATES_RESULT_RUN="$WORKDIR/$SAMPLE_ID.cisActivated.candidates.byRuns.txt" 168 | cis-X-nominate $SAMPLE_ID $ASE_RESULT_GENE $OHE_RESULT $CANDIDATES_RESULT $THRESH_CANDIDATE_FPKM $ASE_RESULT_RUN $CANDIDATES_RESULT_RUN 169 | 170 | echo "INFO: $(date): cis-X-run: screening candidates" 171 | SV_CAN="$SAMPLE_ID.sv.candidates.txt" 172 | CNA_CAN="$SAMPLE_ID.cna.candidates.txt" 173 | SNVINDEL_CAN="$SAMPLE_ID.snvindel.candidates.txt" 174 | cis-X-screen $SAMPLE_ID $CANDIDATES_RESULT $CANDIDATES_RESULT_RUN $SV_IN $CNA_IN $SNVINDEL_IN $FPKM_MATRIX $WORKDIR $SV_CAN $CNA_CAN $SNVINDEL_CAN $ANNO_USER $TAD 175 | 176 | RESDIR="$ROOTDIR/$SAMPLE_ID" 177 | 178 | cp $ASE_RESULT_GENE $RESDIR 179 | cp $ASE_RESULT_MARKER $RESDIR 180 | cp $OHE_RESULT $RESDIR 181 | cp $CANDIDATES_RESULT $RESDIR 182 | cp $CANDIDATES_RESULT_RUN $RESDIR 183 | cp $SNVINDEL_CAN $RESDIR 184 | cp $CNA_CAN $RESDIR 185 | cp $SV_CAN $RESDIR 186 | 187 | echo "INFO: $(date): cis-X-run: done" 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cis-X 2 | 3 | **cis-X** searches for activating regulatory variants in the tumor genome. 4 | 5 | Activating regular variants usually cause the cis-activation of target genes. 6 | To find cis-activated genes, allelic specific/imbalance expressions (ASE) and 7 | outlier high expression (OHE) signals are used. Variants in the same 8 | topologically associated domains with the candidates can then be searched, 9 | including structural variants (SV), copy number aberrations (CNA), and single 10 | nucleotide variations (SNV) and insertion/deletions (indel). 11 | 12 | A transcription factor binding analysis is also done, using motifs from 13 | [HOCOMOCO] v10 models. 14 | 15 | cis-X currently only works with hg19 (GRCh37). 16 | 17 | More details and examples on running cis-X can be found in the [user guide]. 18 | 19 | [HOCOMOCO]: http://hocomoco11.autosome.ru/ 20 | [user guide]: https://sjr-redesign.stjude.org/content/dam/research-redesign/labs/zhang-lab/cis-x-instructions.pdf 21 | 22 | ## Installation 23 | 24 | Installation is simply unpacking the source to a working directory and adding 25 | `$CIS_X_HOME/bin` to `PATH`. 26 | 27 | ### Prerequisites 28 | 29 | See [cis-X run][run] and [cis-X seed][seed] for the required tools and 30 | references. 31 | 32 | ## Usage 33 | 34 | ``` 35 | cis-X 36 | 37 | USAGE: 38 | cis-X [args...] 39 | 40 | SUBCOMMANDS: 41 | ref-exp Generate reference expression matrices 42 | run Search for activating regulatory variants in the tumor genome 43 | seed Download and generate a set of common references 44 | ``` 45 | 46 | For more details on how to run each command, see its respective README: 47 | [ref-exp], [run], and [seed]. 48 | 49 | ### Docker 50 | 51 | cis-X has a `Dockerfile` to create a [Docker] image, which sets up and installs 52 | all the required dependencies (sans references). To use this image, [install 53 | Docker] for your platform. 54 | 55 | For typical inputs, cis-X requires at least 4 GiB of RAM. This resource can 56 | be increased for the desktop version of Docker by going to Docker preferences 57 | \> Advanced \> Memory. 58 | 59 | [Docker]: https://www.docker.com/ 60 | [install Docker]: https://docs.docker.com/get-started/get-docker/ 61 | 62 | #### Build 63 | 64 | In the cis-X project directory, build the Docker image. 65 | 66 | ``` 67 | $ docker image build --tag cis-x . 68 | ``` 69 | 70 | #### Run 71 | 72 | The Docker image uses `bin/cis-X` as its entrypoint, giving access to all of its 73 | commands. 74 | 75 | The image assumes two working directories: `/data` for inputs and `/results` 76 | for outputs. `/data` can be read-only, whereas `/results` needs write access. 77 | External references (see [cis-X seed][seed]) also need to be mounted to 78 | `/opt/cis-x/refs/external`. For example, mounting to these directories requires 79 | three flags: 80 | 81 | ``` 82 | --mount type=bind,source=$HOME/research/data,target=/data,readonly \ 83 | --mount type=bind,source=/tmp/references,target=/opt/cis-x/refs/external,readonly \ 84 | --mount type=bind,source=$(pwd)/cis-x-out,target=/results \ 85 | ``` 86 | 87 | The source directives can point to any absolute path that can be accessed 88 | locally. They do not need to match their target directory. Also note that the 89 | results directory must exist before running the command. 90 | 91 | ##### Examples 92 | 93 | ###### cis-X seed 94 | 95 | > [!NOTE] 96 | > **cis-X seed** will likely not work due to [link rot](https://en.wikipedia.org/wiki/Link_rot). An alternative is to use the references included in the [demo data](#demo) instead. 97 | 98 | A basic example is running [cis-X seed][seed], which downloads and preprocesses 99 | required reference files to a directory. To run this locally, the `seed` 100 | subcommand is used, passing the destination directory of the resulting files. 101 | 102 | ``` 103 | $ cis-X seed /tmp/refs/external 104 | ``` 105 | 106 | To run this in a container using the Docker image, pass the subcommand and arguments 107 | as the command the container runs. 108 | 109 | ``` 110 | $ docker container run cis-x seed /tmp/refs/external 111 | ``` 112 | 113 | This, however, writes files to the container, rather than the host. To write 114 | files to the host from the container, mount the host destination directory to 115 | the container, e.g., `$(pwd)/refs/external` to `/opt/cis-x/refs/external`. The 116 | argument passed to the command must match the target directory. 117 | 118 | ``` 119 | $ docker container run \ 120 | --mount type=bind,source=$(pwd)/refs/external,target=/opt/cis-x/refs/external \ 121 | cis-x \ 122 | seed \ 123 | /opt/cis-x/refs/external 124 | ``` 125 | 126 | ###### cis-X run 127 | 128 | The following template shows the minimum set of arguments to execute the `run` 129 | command, with variables showing what needs to be set. 130 | 131 | ``` 132 | $ docker container run \ 133 | --mount type=bind,source=$DATA_DIR,target=/data,readonly \ 134 | --mount type=bind,source=$REFS_DIR,target=/opt/cis-x/refs/external,readonly \ 135 | --mount type=bind,source=$RESULT_DIR,target=/results \ 136 | cis-x \ 137 | run \ 138 | -s $SAMPLE_ID \ 139 | -o /results \ 140 | -l /data/$MARKERS \ 141 | -g /data/$CNV_LOH_REGIONS \ 142 | -b /data/$BAM \ 143 | -e /data/$GENE_EXPRESSION_TABLE \ 144 | -m /data/$SOMATIC_SNV_INDEL \ 145 | -v /data/$SOMATIC_SV \ 146 | -c /data/$SOMATIC_CNV \ 147 | -d $DISEASE 148 | ``` 149 | 150 | Note that pathname arguments are relative to the container's target. For 151 | example, mounting `$HOME/research` and with an input located at 152 | `$HOME/research/sample-001/markers.txt`, the corresponding argument is 153 | `/data/sample-001/markers.txt`. 154 | 155 | See the [Docker reference for `run`][docker-run] for more container run 156 | options. See [cis-X run][run] for more runtime options. 157 | 158 | [docker-run]: https://docs.docker.com/engine/containers/run/ 159 | 160 | ## Demo 161 | 162 | The next example runs cis-X with [demo data] (`cis-X-demo.tar.gz`). 163 | 164 | Set up the project home directory with the demo data. The following commands 165 | assume the demo is extracted to a `tmp` directory in the root of the project. 166 | 167 | ``` 168 | $ git clone https://github.com/stjude/cis-x.git 169 | $ cd cis-x 170 | $ docker image build --tag cis-x . 171 | $ mkdir tmp 172 | $ wget --directory-prefix tmp http://ftp.stjude.org/pub/software/cis-x/cis-X-demo.tar.gz 173 | $ tar xf tmp/cis-X-demo.tar.gz --directory tmp 174 | ``` 175 | 176 | Then run cis-X. 177 | 178 | ``` 179 | $ docker container run \ 180 | --mount type=bind,source=$(pwd)/tmp/demo/data,target=/data,readonly \ 181 | --mount type=bind,source=$(pwd)/tmp/demo/ref,target=/opt/cis-x/refs/external,readonly \ 182 | --mount type=bind,source=$(pwd)/tmp,target=/results \ 183 | cis-x \ 184 | run \ 185 | -s SJALL018373_D1 \ 186 | -o /results \ 187 | -l /data/SJALL018373_D1.test.wgs.markers.txt \ 188 | -g /data/SJALL018373_D1.test.wgs.cnvloh.txt \ 189 | -b /data/SJALL018373_D1.test.RNAseq.bam \ 190 | -e /data/SJALL018373_D1.test.RNASEQ_all_fpkm.txt \ 191 | -m /data/SJALL018373_D1.test.mut.txt \ 192 | -v /data/SJALL018373_D1.test.sv.txt \ 193 | -c /data/SJALL018373_D1.test.cna.txt \ 194 | -d TALL \ 195 | -a drop \ 196 | -w 10 \ 197 | -r 10 \ 198 | -f 5 199 | ``` 200 | 201 | [demo data]: http://ftp.stjude.org/pub/software/cis-x/cis-X-demo.tar.gz 202 | 203 | [ref-exp]: https://github.com/stjude/cis-x/tree/master/src/ref-exp 204 | [run]: https://github.com/stjude/cis-x/tree/master/src/core 205 | [seed]: https://github.com/stjude/cis-x/tree/master/src/seed 206 | -------------------------------------------------------------------------------- /src/core/src/ase.candidate.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $thresh_ase_pvalue = $ARGV[0]; 4 | my $thresh_ase_delta_di = $ARGV[1]; 5 | my $thresh_ase_delta_cnv = $ARGV[2]; 6 | my $thresh_fpkm = $ARGV[3]; 7 | my $thresh_loo_pvalue = $ARGV[4]; 8 | my $sid = $ARGV[5]; 9 | my $outfile = $ARGV[6]; 10 | my $ase_result_gene = $ARGV[7]; 11 | my $ohe_result = $ARGV[8]; 12 | my $thresh_loo_hi_perc = $ARGV[9]; 13 | my $imprinting_genes = $ARGV[10]; 14 | my $oncogenes = $ARGV[11]; 15 | 16 | my (%imprint,%g2loo,%glst,%oncog); 17 | 18 | my $infile = $imprinting_genes; 19 | open IN, "< $infile" or die "$infile: $!"; 20 | while() { 21 | chomp; 22 | next if $. == 1; 23 | my @F = split/\t/; 24 | $imprint{$F[0]} = $F[3]; 25 | } 26 | close IN; 27 | 28 | $infile = $oncogenes; 29 | open IN, "< $infile" or die "$infile: $!"; 30 | while() { 31 | chomp; 32 | next if $. == 1; 33 | $_ =~ s/\"//g; 34 | my @F = split/\t/; 35 | next unless $F[14]; 36 | next if $F[14] eq "TSG"; 37 | next if $F[0] eq "IGH" or $F[0] eq "IGK" or $F[0] eq "IGL" or $F[0] eq "HLA-A"; 38 | $oncog{$F[0]} = $F[14]; 39 | } 40 | close IN; 41 | 42 | $infile = $ohe_result; 43 | open IN, "< $infile" or die "$infile: $!"; 44 | while() { 45 | chomp; 46 | next if $. == 1; 47 | my @F = split/\t/; 48 | $g2loo{$F[0]}{fpkm} = $F[1]; 49 | if ($F[12] ne "na") { 50 | $g2loo{$F[0]}{tval} = $F[15]; 51 | $g2loo{$F[0]}{tperc} = $F[16]; 52 | $g2loo{$F[0]}{rank} = $F[14]; 53 | $g2loo{$F[0]}{size} = $F[12]; 54 | $g2loo{$F[0]}{source} = "white_list"; 55 | }elsif ($F[2] ne "na") { 56 | $g2loo{$F[0]}{tval} = $F[5]; 57 | $g2loo{$F[0]}{tperc} = $F[6]; 58 | $g2loo{$F[0]}{rank} = $F[4]; 59 | $g2loo{$F[0]}{size} = $F[2]; 60 | $g2loo{$F[0]}{source} = "bi_cohort"; 61 | }else { 62 | $g2loo{$F[0]}{tval} = $F[10]; 63 | $g2loo{$F[0]}{tperc} = $F[11]; 64 | $g2loo{$F[0]}{rank} = $F[9]; 65 | $g2loo{$F[0]}{size} = $F[7]; 66 | $g2loo{$F[0]}{source} = "entire_cohort"; 67 | } 68 | } 69 | close IN; 70 | 71 | $infile = $ase_result_gene; 72 | open OUT, "> $outfile" or die "$outfile: $!"; 73 | open IN, "< $infile" or die "$infile: $!"; 74 | while() { 75 | chomp; 76 | if ($. == 1) { 77 | print OUT "$_\tFPKM\tloo.source\tloo.cohort.size\tloo.tstatistic\tloo.qval\tloo.rank\timprinting.status\tcandidate.group\tdescription\n"; 78 | next; 79 | } 80 | my @F = split/\t/; 81 | my $imprint = ""; 82 | my $candidate_group = ""; 83 | my $description = ""; 84 | if ($imprint{$F[1]}) { 85 | $imprint = $imprint{$F[1]}; 86 | } 87 | next if $imprint eq "Imprinted"; 88 | next unless $g2loo{$F[1]}; 89 | $glst{$F[1]} = 1; 90 | ### 2018/12/25, for the known oncogenes in cosmic, keep for next step if p-value pass the threshold and fpkm >= 1. 91 | if ($g2loo{$F[1]}{fpkm} >= $thresh_fpkm) { 92 | 1; 93 | }elsif ($oncog{$F[1]} and $g2loo{$F[1]}{fpkm} >= 1) { 94 | $description = "rescued-ohe"; 95 | }else { 96 | next; 97 | } 98 | # next unless $g2loo{$F[1]}{fpkm} >= $thresh_fpkm; 99 | my $tag = $F[16]; 100 | my @tag = split(/,/,$tag); 101 | my $tagnum = scalar(@tag); 102 | my $tagcnv = 0; 103 | for my $t (@tag) { 104 | if ($t eq "cnvloh") { 105 | $tagcnv++; 106 | } 107 | } 108 | my $class = "diploid"; 109 | if ($tagcnv/$tagnum > 0.3) { 110 | $class = "cnvloh"; 111 | } 112 | # next unless $g2loo{$F[1]}{pval} < $thresh_loo_pvalue; 113 | if ($F[22] < $thresh_ase_pvalue) { 114 | if ($class eq "cnvloh") { 115 | ### use $thresh_ase_delta_cnv if > 30% of the markers sits in cnvloh region. 116 | if ($F[19] >= $thresh_ase_delta_cnv) { 117 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 118 | $candidate_group = "ase_outlier"; 119 | }elsif ($g2loo{$F[1]}{rank}/$g2loo{$F[1]}{size} <= $thresh_loo_hi_perc) { 120 | $candidate_group = "ase_high"; 121 | }else { 122 | 1; 123 | } 124 | }else { 125 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 126 | $candidate_group = "uncertain_outlier"; 127 | }else { 128 | 1; 129 | } 130 | } 131 | }else { 132 | if ($F[19] >= $thresh_ase_delta_di) { 133 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 134 | $candidate_group = "ase_outlier"; 135 | }elsif ($g2loo{$F[1]}{rank}/$g2loo{$F[1]}{size} <= $thresh_loo_hi_perc) { 136 | $candidate_group = "ase_high"; 137 | }else { 138 | 1; 139 | } 140 | }else { 141 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 142 | $candidate_group = "uncertain_outlier"; 143 | }else { 144 | 1; 145 | } 146 | } 147 | } 148 | if ($oncog{$F[1]}) { 149 | if ($description) { 150 | $description = "$oncog{$F[1]}, $description"; 151 | }else { 152 | $description = "$oncog{$F[1]}"; 153 | } 154 | } 155 | }else { 156 | ### for the known oncogenes in cosmic, rescue if raw p-value < 0.05 && over 90% of markers show mono-allelic transcription (maf-rna < 0.1 || maf-rna > 0.9). 157 | my $keep = 1; 158 | if ($oncog{$F[1]} and $F[20] < 0.05) { 159 | my @mafrna = split(/,/,$F[17]); 160 | my $rnasig = 0; 161 | my $rnatot = scalar(@mafrna); 162 | for my $f (@mafrna) { 163 | if ($f > 0.9 or $f < 0.1) { 164 | $rnasig++; 165 | } 166 | } 167 | if ($rnasig/$rnatot < 0.9) { 168 | $keep = 0; 169 | } 170 | }else { 171 | $keep = 0; 172 | } 173 | if ($keep == 1) { 174 | if ($class eq "cnvloh") { 175 | ### use $thresh_ase_delta_cnv if > 30% of the markers sits in cnvloh region. 176 | if ($F[19] >= $thresh_ase_delta_cnv) { 177 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 178 | $candidate_group = "ase_outlier"; 179 | }elsif ($g2loo{$F[1]}{rank}/$g2loo{$F[1]}{size} <= $thresh_loo_hi_perc) { 180 | $candidate_group = "ase_high"; 181 | }else { 182 | 1; 183 | } 184 | }else { 185 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 186 | $candidate_group = "uncertain_outlier"; 187 | }else { 188 | 1; 189 | } 190 | } 191 | }else { 192 | if ($F[19] >= $thresh_ase_delta_di) { 193 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 194 | $candidate_group = "ase_outlier"; 195 | }elsif ($g2loo{$F[1]}{rank}/$g2loo{$F[1]}{size} <= $thresh_loo_hi_perc) { 196 | $candidate_group = "ase_high"; 197 | }else { 198 | 1; 199 | } 200 | }else { 201 | if ($g2loo{$F[1]}{tperc} < $thresh_loo_pvalue) { 202 | $candidate_group = "uncertain_outlier"; 203 | }else { 204 | 1; 205 | } 206 | } 207 | } 208 | if ($description) { 209 | $description = "$oncog{$F[1]}, rescued-ase, $description"; 210 | }else { 211 | $description = "$oncog{$F[1]}, rescued-ase"; 212 | } 213 | } 214 | } 215 | next unless $candidate_group eq "ase_outlier"; 216 | print OUT "$_\t$g2loo{$F[1]}{fpkm}\t$g2loo{$F[1]}{source}\t$g2loo{$F[1]}{size}\t$g2loo{$F[1]}{tval}\t$g2loo{$F[1]}{tperc}\t$g2loo{$F[1]}{rank}\t$imprint\t$candidate_group\t$description\n"; 217 | } 218 | close IN; 219 | -------------------------------------------------------------------------------- /src/core/src/snvindel.prep.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $snvindel_in = $ARGV[1]; 5 | my $ase_result = $ARGV[2]; 6 | my $ase_result_run = $ARGV[3]; 7 | my $sv_result = $ARGV[4]; 8 | my $cna_result = $ARGV[5]; 9 | my $tad_ref = $ARGV[6]; 10 | my $snvindel_list = $ARGV[7]; 11 | my $seqlist = $ARGV[8]; 12 | my $snvindel_win = $ARGV[9]; 13 | my $refgene = $ARGV[10]; 14 | 15 | my (%genes,%solved,%chr2g,%g2pro,%snvindel,%tad,%g2query); 16 | 17 | my $infile = $tad_ref; 18 | open IN, "< $infile" or die "$infile: $!"; 19 | while() { 20 | chomp; 21 | my @F = split/\t/; 22 | my $id = "$F[0].$F[1].$F[2]"; 23 | $tad{$F[0]}{$id}{start} = $F[1]; 24 | $tad{$F[0]}{$id}{end} = $F[2]; 25 | $tad{$F[0]}{$id}{source} = "hESC"; 26 | } 27 | close IN; 28 | 29 | $infile = $ase_result; 30 | open IN, "< $infile" or die "$infile: $!"; 31 | while() { 32 | chomp; 33 | next if $. == 1; 34 | my @F = split/\t/; 35 | $genes{$F[1]}{gsym} = $F[1]; 36 | $genes{$F[1]}{chrom} = $F[2]; 37 | $genes{$F[1]}{strand} = $F[3]; 38 | $genes{$F[1]}{start} = $F[4]; 39 | $genes{$F[1]}{end} = $F[5]; 40 | $chr2g{$F[2]}{$F[1]} = 1; 41 | if ($F[3] eq "+") { 42 | $g2pro{$F[1]}{start} = $F[4] - 2000; 43 | $g2pro{$F[1]}{end} = $F[4] + 200; 44 | }elsif ($F[3] eq "-") { 45 | $g2pro{$F[1]}{start} = $F[5] - 200; 46 | $g2pro{$F[1]}{end} = $F[5] + 2000; 47 | } 48 | } 49 | close IN; 50 | 51 | $infile = $ase_result_run; 52 | open IN, "< $infile" or die "$infile: $!"; 53 | while() { 54 | chomp; 55 | next if $. == 1; 56 | my @F = split/\t/; 57 | if ($F[8]) { 58 | my @G = split(/,/,$F[8]); 59 | for my $g (@G) { 60 | next if $genes{$g}; 61 | $g2query{$g}{tag} = 1; 62 | } 63 | } 64 | } 65 | close IN; 66 | 67 | $infile = $refgene; 68 | open IN, "< $infile" or die "$infile: $!"; 69 | while() { 70 | chomp; 71 | my @F = split/\t/; 72 | if ($g2query{$F[3]}) { 73 | my $len = $F[2] - $F[1]; 74 | if ($g2query{$F[3]}{tag} == 1 or ($len > $g2query{$F[3]}{len})) { 75 | $g2query{$F[3]}{acc} = $F[4]; 76 | $g2query{$F[3]}{chrom} = $F[0]; 77 | $g2query{$F[3]}{strand} = $F[5]; 78 | $g2query{$F[3]}{start} = $F[1]; 79 | $g2query{$F[3]}{end} = $F[2]; 80 | $g2query{$F[3]}{len} = $len; 81 | $g2query{$F[3]}{tag} = 2; 82 | }else { 83 | 1; 84 | } 85 | } 86 | } 87 | close IN; 88 | 89 | for my $g (sort keys %g2query) { 90 | if ($g2query{$g}{tag} == 1) { 91 | print "Error, $g not annotated.\n"; 92 | }else { 93 | $genes{$g}{gsym} = $g; 94 | $genes{$g}{chrom} = $g2query{$g}{chrom}; 95 | $genes{$g}{strand} = $g2query{$g}{strand}; 96 | $genes{$g}{start} = $g2query{$g}{start}; 97 | $genes{$g}{end} = $g2query{$g}{end}; 98 | $chr2g{$g2query{$g}{chrom}}{$g} = 1; 99 | if ($g2query{$g}{strand} eq "+") { 100 | $g2pro{$g}{start} = $g2query{$g}{start} - 2000; 101 | $g2pro{$g}{end} = $g2query{$g}{start} + 200; 102 | }elsif ($g2query{$g}{strand} eq "-") { 103 | $g2pro{$g}{start} = $g2query{$g}{end} - 200; 104 | $g2pro{$g}{end} = $g2query{$g}{end} + 2000; 105 | } 106 | } 107 | } 108 | 109 | $infile = $sv_result; 110 | open IN, "< $infile" or die "$infile: $!"; 111 | while() { 112 | chomp; 113 | next if $. == 1; 114 | my @F = split/\t/; 115 | if ($F[0]) { 116 | my @g = split(/,/,$F[0]); 117 | for my $g (@g) { 118 | $solved{$g} = 1; 119 | } 120 | } 121 | if ($F[1]) { 122 | my @g = split(/,/,$F[1]); 123 | for my $g (@g) { 124 | $solved{$g} = 1; 125 | } 126 | } 127 | } 128 | close IN; 129 | 130 | $infile = $cna_result; 131 | open IN, "< $infile" or die "$infile: $!"; 132 | while() { 133 | chomp; 134 | next if $. == 1; 135 | my @F = split/\t/; 136 | if ($F[0]) { 137 | my @g = split(/,/,$F[0]); 138 | for my $g (@g) { 139 | $solved{$g} = 1; 140 | } 141 | } 142 | } 143 | close IN; 144 | 145 | $infile = $snvindel_in; 146 | open IN, "< $infile" or die "$infile: $!"; 147 | while() { 148 | chomp; 149 | next if $. == 1; 150 | my @F = split/\t/; 151 | my $chrom = $F[0]; 152 | unless ($chrom =~ /^chr/) { 153 | $chrom = "chr" . $chrom; 154 | } 155 | my $pos = $F[1]; 156 | my $target = ""; 157 | my $dist_o = ""; 158 | my @g = sort keys %{$chr2g{$chrom}}; 159 | ### filter TAD. 160 | for my $tad (sort keys %{$tad{$chrom}}) { 161 | if ($pos <= $tad{$chrom}{$tad}{end} and $pos >= $tad{$chrom}{$tad}{start}) { 162 | for my $g (@g) { 163 | next if $solved{$g}; 164 | my $overlap = 0; 165 | if ($g2pro{$g}) { 166 | if ($g2pro{$g}{start} > $tad{$chrom}{$tad}{end}) { 167 | 1; 168 | }elsif ($g2pro{$g}{start} >= $tad{$chrom}{$tad}{start}) { 169 | $overlap = 1; 170 | }elsif ($g2pro{$g}{end} >= $tad{$chrom}{$tad}{start}) { 171 | $overlap = 1; 172 | }else { 173 | 1; 174 | } 175 | }else { 176 | print "No promoter info for $g.\n"; 177 | } 178 | if ($overlap == 1) { 179 | ### require distance between target gene tss less than $snvindel_win. 180 | my $dist = abs($pos - $genes{$g}{start}); 181 | if ($genes{$g}{strand} eq "-") { 182 | $dist = abs($pos - $genes{$g}{end}); 183 | } 184 | if ($dist <= $snvindel_win) { 185 | $target .= "$g,"; 186 | $dist_o .= "$dist,"; 187 | } 188 | } 189 | } 190 | } 191 | } 192 | if ($target) { 193 | $target =~ s/\,$//; 194 | $dist_o =~ s/\,$//; 195 | my $snv4 = "$chrom.$F[1].$F[2].$F[3]"; 196 | $snvindel{$snv4}{target} = $target; 197 | $snvindel{$snv4}{type} = $F[4]; 198 | $snvindel{$snv4}{dist} = $dist_o; 199 | } 200 | } 201 | close IN; 202 | 203 | my $outfile = $snvindel_list; 204 | my $seq_out = $seqlist; 205 | open OUT, "> $outfile" or die "$outfile: $!"; 206 | open SEQLST, "> $seq_out" or die "$seq_out: $!"; 207 | print OUT "snv4\ttype\tref\tmut\tleft_name\tright_name\ttarget\tdist\tstart\tlength\n"; 208 | for my $snv4 (sort keys %snvindel) { 209 | my @s = split(/\./,$snv4); 210 | my $chrom = $s[0]; 211 | my $pos = $s[1]; 212 | my $ref = $s[2]; 213 | my $mut = $s[3]; 214 | my $target = $snvindel{$snv4}{target}; 215 | my $type = $snvindel{$snv4}{type}; 216 | my $dist = $snvindel{$snv4}{dist}; 217 | my $left = ""; 218 | my $right = ""; 219 | my $start = 21; 220 | my $length = 1; 221 | if ($type eq "snv") { 222 | $start_l = $pos - 1 - 20; 223 | $end_l = $pos - 1; 224 | $start_r = $pos - 1 + 1; 225 | $end_r = $pos - 1 + 1 + 20; 226 | $left = $chrom . ":" . $start_l . "-" . $end_l; 227 | $right = $chrom . ":" . $start_r . "-" . $end_r; 228 | }else { 229 | if ($ref eq "-") { 230 | $type = "ins"; 231 | $start_l = $pos - 1 - 20; 232 | $end_l = $pos - 1; 233 | $start_r = $pos - 1; 234 | $end_r = $pos - 1 + 20; 235 | $left = $chrom . ":" . $start_l . "-" . $end_l; 236 | $right = $chrom . ":" . $start_r . "-" . $end_r; 237 | my @seq = split(//,$mut); 238 | $length = scalar(@seq); 239 | }elsif ($mut eq "-") { 240 | $type = "del"; 241 | my @seq = split(//,$ref); 242 | my $len = scalar(@seq); 243 | $length = $len; 244 | $start_l = $pos - 1 - 20; 245 | $end_l = $pos - 1; 246 | $start_r = $pos - 1 + $len; 247 | $end_r = $pos - 1 + $len + 20; 248 | $left = $chrom . ":" . $start_l . "-" . $end_l; 249 | $right = $chrom . ":" . $start_r . "-" . $end_r; 250 | }else { 251 | $type = "complex_indel"; 252 | my @seq = split(//,$ref); 253 | my $len = scalar(@seq); 254 | $start_l = $pos - 1 - 20; 255 | $end_l = $pos - 1; 256 | $start_r = $pos - 1 + $len; 257 | $end_r = $pos - 1 + $len + 20; 258 | $left = $chrom . ":" . $start_l . "-" . $end_l; 259 | $right = $chrom . ":" . $start_r . "-" . $end_r; 260 | } 261 | } 262 | $left =~ s/^chr//; 263 | $right =~ s/^chr//; 264 | print OUT "$snv4\t$type\t$ref\t$mut\t$left\t$right\t$target\t$dist\t$start\t$length\n"; 265 | print SEQLST "$left\n$right\n"; 266 | } 267 | close OUT; 268 | close SEQLST; 269 | -------------------------------------------------------------------------------- /src/core/src/snvindel.process.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | 3 | my $sid = $ARGV[0]; 4 | my $fimo_pred = $ARGV[1]; 5 | my $fimo_acc2gsym = $ARGV[2]; 6 | my $snvindel_input = $ARGV[3]; 7 | my $fpkm_res = $ARGV[4]; 8 | my $tf_fpkm_thresh = $ARGV[5]; 9 | my $output = $ARGV[6]; 10 | my $roadmap_enh = $ARGV[7]; 11 | my $roadmap_pro = $ARGV[8]; 12 | my $roadmap_dya = $ARGV[9]; 13 | my $anno_user = $ARGV[10]; 14 | 15 | my (%tf2gsym,%tflst,%g2fpkm,%var,%var2tf,%chr2var); 16 | 17 | my $infile = $fimo_acc2gsym; 18 | open IN, "< $infile" or die "$infile: $!"; 19 | while() { 20 | chomp; 21 | next if $. == 1; 22 | my @F = split/\t/; 23 | next if $F[0] =~ /RETRACTED/; 24 | # $tf2gsym{$F[0]} = $F[2]; 25 | # $tflst{$F[2]} = 1; 26 | $tf2gsym{$F[0]} = $F[1]; 27 | $tflst{$F[1]} = 1; 28 | } 29 | close IN; 30 | 31 | ### coded with current format, may need improve. 32 | $infile = $fpkm_res; 33 | open IN, "< $infile" or die "$infile: $!"; 34 | while() { 35 | chomp; 36 | next if $. == 1; 37 | my @F = split/\t/; 38 | next unless $tflst{$F[1]}; 39 | if ($g2fpkm{$F[1]}) { 40 | if ($g2fpkm{$F[1]} < $F[7]) { 41 | $g2fpkm{$F[1]} = $F[7]; 42 | } 43 | }else { 44 | $g2fpkm{$F[1]} = $F[7]; 45 | } 46 | } 47 | close IN; 48 | 49 | $infile = $snvindel_input; 50 | open IN, "< $infile" or die "$infile: $!"; 51 | while() { 52 | chomp; 53 | next if $. == 1; 54 | my @F = split/\t/; 55 | my @f = split(/\./,$F[0]); 56 | $var{$F[0]}{target} = $F[6]; 57 | $var{$F[0]}{dist} = $F[7]; 58 | $var{$F[0]}{type} = $F[1]; 59 | $var{$F[0]}{start} = $F[8]; 60 | $var{$F[0]}{len} = $F[9]; 61 | $var{$F[0]}{mut} = $F[3]; 62 | $var{$F[0]}{ref} = $F[2]; 63 | $var{$F[0]}{chrom} = $f[0]; 64 | $var{$F[0]}{pos} = $f[1]; 65 | $chr2var{$f[0]}{$F[0]} = 1; 66 | } 67 | close IN; 68 | 69 | open IN, "< $roadmap_enh" or die "$roadmap_enh: $!"; 70 | while() { 71 | chomp; 72 | my @F = split/\t/; 73 | for my $var (sort keys %{$chr2var{$F[0]}}) { 74 | if ($var{$var}{pos} >= $F[1] and $var{$var}{pos} <= $F[2]) { 75 | $var{$var}{enh}{$F[7]} = 1; 76 | } 77 | } 78 | } 79 | close IN; 80 | 81 | open IN, "< $roadmap_pro" or die "$roadmap_pro: $!"; 82 | while() { 83 | chomp; 84 | my @F = split/\t/; 85 | for my $var (sort keys %{$chr2var{$F[0]}}) { 86 | if ($var{$var}{pos} >= $F[1] and $var{$var}{pos} <= $F[2]) { 87 | $var{$var}{pro}{$F[7]} = 1; 88 | } 89 | } 90 | } 91 | close IN; 92 | 93 | open IN, "< $roadmap_dya" or die "$roadmap_dya: $!"; 94 | while() { 95 | chomp; 96 | my @F = split/\t/; 97 | for my $var (sort keys %{$chr2var{$F[0]}}) { 98 | if ($var{$var}{pos} >= $F[1] and $var{$var}{pos} <= $F[2]) { 99 | $var{$var}{dya}{$F[7]} = 1; 100 | } 101 | } 102 | } 103 | close IN; 104 | 105 | if ($anno_user ne "NotSpecified") { 106 | open IN, "< $anno_user" or die "$anno_user: $!"; 107 | while() { 108 | chomp; 109 | my @F = split/\t/; 110 | if ($. == 1) { 111 | if ($#F < 2) { 112 | print "Error: please provide the annotation file with BED format.\n"; 113 | exit; 114 | } 115 | } 116 | my $uchrom = $F[0]; 117 | unless ($uchrom =~ /^chr/) { 118 | $uchrom = "chr" . $uchrom; 119 | } 120 | for my $var (sort keys %{$chr2var{$uchrom}}) { 121 | if ($var{$var}{pos} >= $F[1] and $var{$var}{pos} <= $F[2]) { 122 | my $uname = "$F[0].$F[1].$F[2]"; 123 | $var{$var}{usr}{$uname} = 1; 124 | } 125 | } 126 | } 127 | close IN; 128 | } 129 | 130 | my @var = sort keys %var; 131 | my $varnum = scalar(@var); 132 | if ($varnum == 0) { 133 | open OUT, "> $output" or die "$output: $!"; 134 | print OUT "chrom\tpos\tref\tmut\ttarget\tdist\ttf\tEpiRoadmap_enhancer\tEpiRoadmap_promoter\tEpiRoadmap_dyadic\tUser_Annot\n"; 135 | close OUT; 136 | }else { 137 | $infile = $fimo_pred; 138 | open IN, "< $infile" or die "$infile: $!"; 139 | while() { 140 | chomp; 141 | next if $. == 1; 142 | my @F = split/\t/; 143 | my $tf = ""; 144 | my $fpkm = 0; 145 | if ($tf2gsym{$F[0]}) { 146 | $tf = $tf2gsym{$F[0]}; 147 | } 148 | next unless $tf; 149 | if ($tf eq "MYBL1" or $tf eq "MYBL2") { ### 2019-03-11, The motif for MYB, MYBL1/2 are very similar in this version of db. May update in later version. 150 | $tf = "MYB"; 151 | } 152 | if ($g2fpkm{$tf}) { 153 | $fpkm = $g2fpkm{$tf}; 154 | } 155 | next unless $fpkm > $tf_fpkm_thresh; 156 | my $mut = 0; 157 | my $ref = 0; 158 | my @f = split(/\./,$F[1]); 159 | my $var = "$f[0].$f[1].$f[2].$f[3]"; 160 | my $type = $var{$var}{type}; 161 | my $pos = $var{$var}{start}; 162 | my $len = $var{$var}{len}; 163 | if ($type eq "snv") { 164 | if ($pos >= $F[2] and $pos <= $F[3]) { 165 | if ($f[4] eq "mut") { 166 | $mut = 1; 167 | }elsif ($f[4] eq "ref") { 168 | $ref = 1; 169 | }else { 170 | print "Wrong type for $F[1].\n"; 171 | } 172 | } 173 | }elsif ($type eq "ins") { 174 | if ($f[4] eq "mut") { 175 | my $start = $pos; 176 | my $end = $pos + $len - 1; 177 | if ($start > $F[3]) { 178 | 1; 179 | }elsif ($start >= $F[2]) { 180 | $mut = 1; 181 | }elsif ($end >= $F[2]) { 182 | $mut = 1; 183 | }else { 184 | 1; 185 | } 186 | }elsif ($f[4] eq "ref") { 187 | if ($pos >= $F[2] and $pos <= $F[3]) { 188 | $ref = 1; 189 | } 190 | }else { 191 | print "Wrong type for $F[1].\n"; 192 | } 193 | }elsif ($type eq "del") { 194 | if ($f[4] eq "mut") { 195 | if ($pos >= $F[2] and $pos <= $F[3]) { 196 | $mut = 1; 197 | } 198 | }elsif ($f[4] eq "ref") { 199 | my $start = $pos; 200 | my $end = $pos + $len - 1; 201 | if ($start > $F[3]) { 202 | 1; 203 | }elsif ($start >= $F[2]) { 204 | $ref = 1; 205 | }elsif ($end >= $F[2]) { 206 | $ref = 1; 207 | }else { 208 | 1; 209 | } 210 | }else { 211 | print "Wrong type for $F[1].\n"; 212 | } 213 | }elsif ($type eq "complex_indel") { 214 | if ($f[4] eq "mut") { 215 | my @mutseq = split(//,$var{$var}{mut}); 216 | my $seqlen = scalar(@mutseq); 217 | my $start = $pos; 218 | my $end = $pos + $seqlen - 1; 219 | if ($start > $F[3]) { 220 | 1; 221 | }elsif ($start >= $F[2]) { 222 | $mut = 1; 223 | }elsif ($end >= $F[2]) { 224 | $mut = 1; 225 | }else { 226 | 1; 227 | } 228 | }elsif ($f[4] eq "ref") { 229 | my @refseq = split(//,$var{$var}{ref}); 230 | my $seqlen = scalar(@refseq); 231 | my $start = $pos; 232 | my $end = $pos + $seqlen - 1; 233 | if ($start > $F[3]) { 234 | 1; 235 | }elsif ($start >= $F[2]) { 236 | $ref = 1; 237 | }elsif ($end >= $F[2]) { 238 | $ref = 1; 239 | }else { 240 | 1; 241 | } 242 | }else { 243 | print "Wrong type for $F[1].\n"; 244 | } 245 | }else { 246 | print "Wrong type for $type.\n"; 247 | } 248 | if ($f[4] eq "mut") { 249 | if ($var{$var}{tf}{$tf}{mut} and $var{$var}{tf}{$tf}{mut} == 1) { 250 | 1; 251 | }else { 252 | $var{$var}{tf}{$tf}{mut} = $mut; 253 | } 254 | }elsif ($f[4] eq "ref") { 255 | if ($var{$var}{tf}{$tf}{ref} and $var{$var}{tf}{$tf}{ref} == 1) { 256 | 1; 257 | }else { 258 | $var{$var}{tf}{$tf}{ref} = $ref; 259 | } 260 | }else { 261 | print "Wrong type for $F[1].\n"; 262 | } 263 | $var{$var}{tf}{$tf}{fpkm} = $fpkm; 264 | } 265 | close IN; 266 | 267 | open OUT, "> $output" or die "$output: $!"; 268 | print OUT "chrom\tpos\tref\tmut\ttype\ttarget\tdist\ttf\tEpiRoadmap_enhancer\tEpiRoadmap_promoter\tEpiRoadmap_dyadic\tUser_Annot\n"; 269 | for my $var (sort keys %var) { 270 | my $pred_tf = ""; 271 | for my $tf (sort {$var{$var}{tf}{$b}{fpkm} <=> $var{$var}{tf}{$a}{fpkm}} keys %{$var{$var}{tf}}) { 272 | my $ref = 0; 273 | my $mut = 0; 274 | $ref = $var{$var}{tf}{$tf}{ref} if $var{$var}{tf}{$tf}{ref}; 275 | $mut = $var{$var}{tf}{$tf}{mut} if $var{$var}{tf}{$tf}{mut}; 276 | if ($mut == 1 and $ref == 0) { 277 | $pred_tf .= "$tf,"; 278 | } 279 | } 280 | if ($pred_tf) { 281 | my @var = split(/\./,$var); 282 | $pred_tf =~ s/\,$//; 283 | my $enh = ""; 284 | my $pro = ""; 285 | my $dya = ""; 286 | my $user = ""; 287 | if ($var{$var}{enh}) { 288 | my @enh = sort keys %{$var{$var}{enh}}; 289 | $enh = join(',',@enh); 290 | } 291 | if ($var{$var}{pro}) { 292 | my @pro = sort keys %{$var{$var}{pro}}; 293 | $pro = join(',',@pro); 294 | } 295 | if ($var{$var}{dya}) { 296 | my @dya = sort keys %{$var{$var}{dya}}; 297 | $dya = join(',',@dya); 298 | } 299 | if ($var{$var}{usr}) { 300 | my @user = sort keys %{$var{$var}{usr}}; 301 | $user = join(',',@user); 302 | } 303 | print OUT "$var[0]\t$var[1]\t$var[2]\t$var[3]\t$var{$var}{type}\t$var{$var}{target}\t$var{$var}{dist}\t$pred_tf\t$enh\t$pro\t$dya\t$user\n"; 304 | } 305 | } 306 | close OUT; 307 | } 308 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /src/core/src/ase_runs.pl: -------------------------------------------------------------------------------- 1 | #! /usr/bin/perl -w 2 | ### updated 2019-04-08. 3 | 4 | my $infile = $ARGV[0]; 5 | my $delta_di = $ARGV[1]; 6 | my $delta_cnv = $ARGV[2]; 7 | my $outfile = $ARGV[3]; 8 | my $bedfile = $ARGV[4]; 9 | my $num_markers = $ARGV[5]; 10 | my $frac_markers = $ARGV[6]; 11 | my $dist_markers = $ARGV[7]; 12 | 13 | my $cluster = 0; 14 | 15 | print "$frac_markers\n"; 16 | 17 | #my $num_markers = 15; ### at least 4 markers to call a run. 18 | #my $frac_markers = 0.75; ### minimal 75% of markers to be "s" or "e" to call a run. 19 | #my $dist_markers = 200000; ### markers separated over 200kb will not be joined into a run. 20 | 21 | my (%runs,%buff,$last1,$last2); 22 | my $buff = 0; 23 | 24 | open IN, "< $infile" or die "$infile: $!"; 25 | while() { 26 | chomp; 27 | next if $. == 1; 28 | my @F = split/\t/; 29 | my $tag = "f"; 30 | 31 | if ($F[10] == 0 or $F[9] == 0) { 32 | $tag = "e"; ### extreme 33 | }elsif (($F[10] == 1 and $F[9] >= 4) or ($F[9] == 1 and $F[10] >= 4)) { 34 | if ($F[11] <= 0.05) { 35 | $tag = "e"; ### extreme with error but significant 36 | }else { 37 | $tag = "E"; ### extreme with error 38 | } 39 | }elsif ($F[11] <= 0.05) { 40 | if ($F[8] eq "diploid") { 41 | if ($F[12] >= $delta_di) { 42 | $tag = "s"; ### significant 43 | } 44 | }elsif ($F[8] eq "cnvloh") { 45 | if ($F[12] >= $delta_cnv) { 46 | $tag = "s"; ### significant 47 | } 48 | }else { 49 | print "Error: wrong tag for $F[8] at $F[0] $F[1].\n"; 50 | } 51 | }else { 52 | 1; 53 | } 54 | 55 | if ($tag eq "s" or $tag eq "e" or $tag eq "E") { 56 | if ($buff == 0) { 57 | $buff = 1; 58 | $buff{chrom} = $F[0]; 59 | $buff{pos} = $F[1]; 60 | $buff{tags} = $tag; 61 | }elsif ($F[0] ne $buff{chrom}) { 62 | my @t = split(//,$buff{tags}); 63 | my $num = 0; 64 | my $perc = 0; 65 | my $numE = 0; 66 | my $percE = 0; 67 | my $totm = scalar(@t); 68 | for my $m (@t) { 69 | $num++ if ($m eq "s" or $m eq "e"); 70 | $numE++ if $m eq "E"; 71 | } 72 | $perc = $num / $totm; 73 | $percE = $numE / $totm; 74 | if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) { 75 | $cluster++; 76 | if ($t[$#t] ne "f" and $t[$#t-1] ne "f") { 77 | my @pos = split(/,/,$buff{pos}); 78 | $runs{$cluster}{chrom} = $buff{chrom}; 79 | $runs{$cluster}{start} = $pos[0]; 80 | $runs{$cluster}{end} = $pos[$#pos]; 81 | $runs{$cluster}{tags} = $buff{tags}; 82 | }elsif ($t[$#t] eq "f") { 83 | my @pos = split(/,/,$buff{pos}); 84 | $runs{$cluster}{chrom} = $buff{chrom}; 85 | $runs{$cluster}{start} = $pos[0]; 86 | $runs{$cluster}{end} = $pos[$#pos-1]; 87 | for my $i (0 .. $#t-1) { 88 | $runs{$cluster}{tags} .= "$t[$i]"; 89 | } 90 | }elsif ($t[$#t-1] eq "f") { 91 | my @pos = split(/,/,$buff{pos}); 92 | $runs{$cluster}{chrom} = $buff{chrom}; 93 | $runs{$cluster}{start} = $pos[0]; 94 | $runs{$cluster}{end} = $pos[$#pos-2]; 95 | for my $i (0 .. $#t-2) { 96 | $runs{$cluster}{tags} .= "$t[$i]"; 97 | } 98 | }else { 99 | print "Warning: $buff{tags}\n"; 100 | } 101 | } 102 | %buff = (); 103 | $buff = 1; 104 | $buff{chrom} = $F[0]; 105 | $buff{pos} = $F[1]; 106 | $buff{tags} = $tag; 107 | }else { 108 | my @pos = split(/,/,$buff{pos}); 109 | if ($F[1] - $pos[$#pos] <= $dist_markers) { 110 | $buff{pos} .= ",$F[1]"; 111 | $buff{tags} .= $tag; 112 | }else { 113 | my @t = split(//,$buff{tags}); 114 | my $num = 0; 115 | my $perc = 0; 116 | my $numE = 0; 117 | my $percE = 0; 118 | my $totm = scalar(@t); 119 | for my $m (@t) { 120 | $num++ if ($m eq "s" or $m eq "e"); 121 | $numE++ if $m eq "E"; 122 | } 123 | $perc = $num / $totm; 124 | $percE = $numE / $totm; 125 | if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) { 126 | $cluster++; 127 | if ($t[$#t] ne "f" and $t[$#t-1] ne "f") { 128 | my @pos = split(/,/,$buff{pos}); 129 | $runs{$cluster}{chrom} = $buff{chrom}; 130 | $runs{$cluster}{start} = $pos[0]; 131 | $runs{$cluster}{end} = $pos[$#pos]; 132 | $runs{$cluster}{tags} = $buff{tags}; 133 | }elsif ($t[$#t] eq "f") { 134 | my @pos = split(/,/,$buff{pos}); 135 | $runs{$cluster}{chrom} = $buff{chrom}; 136 | $runs{$cluster}{start} = $pos[0]; 137 | $runs{$cluster}{end} = $pos[$#pos-1]; 138 | for my $i (0 .. $#t-1) { 139 | $runs{$cluster}{tags} .= "$t[$i]"; 140 | } 141 | }elsif ($t[$#t-1] eq "f") { 142 | my @pos = split(/,/,$buff{pos}); 143 | $runs{$cluster}{chrom} = $buff{chrom}; 144 | $runs{$cluster}{start} = $pos[0]; 145 | $runs{$cluster}{end} = $pos[$#pos-2]; 146 | for my $i (0 .. $#t-2) { 147 | $runs{$cluster}{tags} .= "$t[$i]"; 148 | } 149 | }else { 150 | print "Warning: $buff{tags}\n"; 151 | } 152 | } 153 | %buff = (); 154 | $buff = 1; 155 | $buff{chrom} = $F[0]; 156 | $buff{pos} = $F[1]; 157 | $buff{tags} = $tag; 158 | } 159 | } 160 | }else { 161 | if ($buff == 1) { 162 | if ($F[0] ne $buff{chrom}) { 163 | my @t = split(//,$buff{tags}); 164 | my $num = 0; 165 | my $perc = 0; 166 | my $numE = 0; 167 | my $percE = 0; 168 | my $totm = scalar(@t); 169 | for my $m (@t) { 170 | $num++ if ($m eq "s" or $m eq "e"); 171 | $numE++ if $m eq "E"; 172 | } 173 | $perc = $num / $totm; 174 | $percE = $numE/ $totm; 175 | if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) { 176 | $cluster++; 177 | if ($t[$#t] ne "f" and $t[$#t] ne "f") { 178 | my @pos = split(/,/,$buff{pos}); 179 | $runs{$cluster}{chrom} = $buff{chrom}; 180 | $runs{$cluster}{start} = $pos[0]; 181 | $runs{$cluster}{end} = $pos[$#pos]; 182 | $runs{$cluster}{tags} = $buff{tags}; 183 | }elsif ($t[$#t] eq "f") { 184 | my @pos = split(/,/,$buff{pos}); 185 | $runs{$cluster}{chrom} = $buff{chrom}; 186 | $runs{$cluster}{start} = $pos[0]; 187 | $runs{$cluster}{end} = $pos[$#pos-1]; 188 | for my $i (0 .. $#t-1) { 189 | $runs{$cluster}{tags} .= "$t[$i]"; 190 | } 191 | }elsif ($t[$#t-1] eq "f") { 192 | my @pos = split(/,/,$buff{pos}); 193 | $runs{$cluster}{chrom} = $buff{chrom}; 194 | $runs{$cluster}{start} = $pos[0]; 195 | $runs{$cluster}{end} = $pos[$#pos-2]; 196 | for my $i (0 .. $#t-2) { 197 | $runs{$cluster}{tags} .= "$t[$i]"; 198 | } 199 | }else { 200 | print "Warning: $buff{tags}\n"; 201 | } 202 | }else { 203 | 1; 204 | } 205 | $buff = 0; 206 | %buff = (); 207 | }elsif ($last1 ne "f" and $last2 ne "f") { 208 | my @pos = split(/,/,$buff{pos}); 209 | if ($F[1] - $pos[$#pos] <= $dist_markers) { 210 | $buff{pos} .= ",$F[1]"; 211 | $buff{tags} .= $tag; 212 | }else { 213 | my @t = split(//,$buff{tags}); 214 | my $num = 0; 215 | my $perc = 0; 216 | my $numE = 0; 217 | my $percE = 0; 218 | my $totm = scalar(@t); 219 | for my $m (@t) { 220 | $num++ if ($m eq "s" or $m eq "e"); 221 | $numE++ if $m eq "E"; 222 | } 223 | $perc = $num / $totm; 224 | $percE = $numE/ $totm; 225 | if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) { 226 | $cluster++; 227 | if ($t[$#t] ne "f" and $t[$#t] ne "f") { 228 | my @pos = split(/,/,$buff{pos}); 229 | $runs{$cluster}{chrom} = $buff{chrom}; 230 | $runs{$cluster}{start} = $pos[0]; 231 | $runs{$cluster}{end} = $pos[$#pos]; 232 | $runs{$cluster}{tags} = $buff{tags}; 233 | }elsif ($t[$#t] eq "f") { 234 | my @pos = split(/,/,$buff{pos}); 235 | $runs{$cluster}{chrom} = $buff{chrom}; 236 | $runs{$cluster}{start} = $pos[0]; 237 | $runs{$cluster}{end} = $pos[$#pos-1]; 238 | for my $i (0 .. $#t-1) { 239 | $runs{$cluster}{tags} .= "$t[$i]"; 240 | } 241 | }elsif ($t[$#t-1] eq "f") { 242 | my @pos = split(/,/,$buff{pos}); 243 | $runs{$cluster}{chrom} = $buff{chrom}; 244 | $runs{$cluster}{start} = $pos[0]; 245 | $runs{$cluster}{end} = $pos[$#pos-2]; 246 | for my $i (0 .. $#t-2) { 247 | $runs{$cluster}{tags} .= "$t[$i]"; 248 | } 249 | }else { 250 | print "Warning: $buff{tags}\n"; 251 | } 252 | }else { 253 | 1; 254 | } 255 | $buff = 0; 256 | %buff = (); 257 | } 258 | }else { 259 | my @t = split(//,$buff{tags}); 260 | my $num = 0; 261 | my $perc = 0; 262 | my $numE = 0; 263 | my $percE = 0; 264 | my $totm = scalar(@t); 265 | for my $m (@t) { 266 | $num++ if ($m eq "s" or $m eq "e"); 267 | $numE++ if $m eq "E"; 268 | } 269 | $perc = $num / $totm; 270 | $percE = $numE/ $totm; 271 | if ($totm >= ($num_markers+1) and $perc >= $frac_markers and $percE < 1) { 272 | $cluster++; 273 | if ($t[$#t] eq "f") { 274 | my @pos = split(/,/,$buff{pos}); 275 | $runs{$cluster}{chrom} = $buff{chrom}; 276 | $runs{$cluster}{start} = $pos[0]; 277 | $runs{$cluster}{end} = $pos[$#pos-1]; 278 | for my $i (0 .. $#t-1) { 279 | $runs{$cluster}{tags} .= "$t[$i]"; 280 | } 281 | }elsif ($t[$#t-1] eq "f") { 282 | my @pos = split(/,/,$buff{pos}); 283 | $runs{$cluster}{chrom} = $buff{chrom}; 284 | $runs{$cluster}{start} = $pos[0]; 285 | $runs{$cluster}{end} = $pos[$#pos-2]; 286 | for my $i (0 .. $#t-2) { 287 | $runs{$cluster}{tags} .= "$t[$i]"; 288 | } 289 | }else { 290 | print "Warning: $buff{tags}\n"; 291 | } 292 | }else { 293 | 1; 294 | } 295 | $buff = 0; 296 | %buff = (); 297 | } 298 | }else { 299 | 1; 300 | } 301 | } 302 | 303 | if ($. == 2) { 304 | $last1 = $tag; 305 | $last2 = $last1; 306 | }else { 307 | $last2 = $last1; 308 | $last1 = $tag; 309 | } 310 | } 311 | close IN; 312 | 313 | open OUT, "> $outfile" or die "$outfile: $!"; 314 | open BED, "> $bedfile" or die "$bedfile: $!"; 315 | print OUT "Run_ID\tChrom\tStart\tEnd\tLength\tNum_Markers\tTag_Markers\n"; 316 | for my $c (sort {$a <=> $b} keys %runs) { 317 | my $len = $runs{$c}{end} - $runs{$c}{start}; 318 | my $num = scalar(split(//,$runs{$c}{tags})); 319 | print OUT "$c\t$runs{$c}{chrom}\t$runs{$c}{start}\t$runs{$c}{end}\t$len\t$num\t$runs{$c}{tags}\n"; 320 | print BED "$runs{$c}{chrom}\t$runs{$c}{start}\t$runs{$c}{end}\t$c\n"; 321 | } 322 | close OUT; 323 | close BED; 324 | -------------------------------------------------------------------------------- /dnanexus/cis-x/README.md: -------------------------------------------------------------------------------- 1 | 2 | # St. Jude cis-X 3 | 4 | Search for activating regulatory variants in the tumor genome 5 | 6 | 7 | Activating regular variants usually cause the cis-activation of target genes. 8 | To find cis-activated genes, allelic specific/imbalance expressions (ASE) and 9 | outlier high expression (OHE) signals are used. Variants in the same 10 | topologically associated domains with the candidates can then be searched, 11 | including structural variants (SV), copy number aberrations (CNA), and single 12 | nucleotide variations (SNV) and insertion/deletions (indel). 13 | 14 | A transcription factor binding analysis is also done, using motifs from 15 | [HOCOMOCO] v10 models. 16 | 17 | cis-X currently only works with hg19 (GRCh37). 18 | 19 | [HOCOMOCO]: http://hocomoco11.autosome.ru/ 20 | 21 | ## Inputs 22 | 23 | * `sample-id`: The sample ID. This is primarily used as the prefix for the 24 | filenames of the results. 25 | 26 | * `results-dir`: The output directory. See "[Outputs](#outputs)" for the 27 | resulting files. 28 | 29 | * `markers`: A list of single nucleotide markers. This is a tab-delimited 30 | file with the following columns: 31 | 32 | * `Chr`: chromosome name for the marker 33 | * `Pos`: genomic start location for the marker 34 | * `Chr_Allele`: reference allele 35 | * `Alternative_Allele`: alternative allele 36 | * `reference_tumor_count`: reference allele count in the tumor genome 37 | * `alternative_tumor_count`: alternative allele count in the tumor genome 38 | * `reference_normal_count`: reference allele count in the matched normal genome 39 | * `alternative_normal_count`: alternative count in the matched normal genome 40 | 41 | This file can be generated with Bambino. 42 | 43 | * `cnv-loh`: CNV/LOH regions. It contains all the genomic regions carrying 44 | copy number variations (CNV) or loss of heterozygosity (LOH), which will be 45 | filtered out during analysis. 46 | 47 | This is a tab-delimited file in the bed format. It must have at least the 48 | following three columns: 49 | 50 | * `chrom`: chromosome name 51 | * `loc.start`: genomic start location 52 | * `loc.end`: genomic end location 53 | 54 | If no CNV/LOH are in the genome under analysis, a file with no rows (but 55 | including headers) can be provided. 56 | 57 | This file can be generated with CONSERTING. 58 | 59 | * `bam`: The RNA-Seq BAM file aligned to hg19 (GRCh37). The index file is 60 | expected to be in the same directory with the same name and extension 61 | `.bai`, e.g, `/path/to/SJ001_D1.bam` and `/path/to/SJ001_D1.bam.bai`. 62 | 63 | StrongArm or STAR can be used for RNA-Seq alignment. 64 | 65 | * `fpkm-matrix`: A gene expression table. This is a tab-delimited file 66 | containing gene level expressions for the tumor under analysis. The 67 | expressions are in FPKM (fragments per kilobase of transcript per million 68 | mapped reads). 69 | 70 | * `GeneID`: gene [Ensembl] ID 71 | * `GeneName`: gene symbol 72 | * `Type`: [transcript type](https://www.gencodegenes.org/gencode_biotypes.html) 73 | * `Status`: transcript status (must be `KNOWN`, `NOVEL`, or `PUTATIVE`) 74 | * `Chr`: chromosome name 75 | * `Start` genomic start location 76 | * `End`: genomic end location 77 | * [SampleID...]: FPKM for the given sample 78 | 79 | This file can can be generated with the output of HTseq-count 80 | preprocessed through `src/other/mergeData_geneName.pl`. The data must be 81 | able to match values in the given gene specific reference expression 82 | matrices (see [cis-X ref-exp]) generated from a larger cohort. 83 | 84 | * `snv-indel`: Somatic SNV/indels. This is a tab-delimited file containing 85 | somatic sequence mutations present in the genome under analysis. It includes 86 | both single nucleotide variants (SNV) and small insertion/deletions (indel). 87 | The file must have the following columns: 88 | 89 | * `chr`: chromosome name 90 | * `pos`: genomic start location 91 | * `ref`: reference allele genotype 92 | * `mutant`: mutant allele genotype 93 | * `type`: mutation type (either `snv` or `indel`) 94 | 95 | Note that the coordinate used for an indel is after the inserted sequence. 96 | 97 | If no SNV/indels are in the sample under analysis, a file with no rows 98 | (but including headers) can be provided. 99 | 100 | This file can can be created with Bambino and then preprocessed using the 101 | steps taken in "[The genetic basis of early T-cell precursor acute lymphoblastic leukaemia][22237106]". 102 | 103 | * `sv` Somatic SVs. This is a tab-delimited file containing somatic-acquired 104 | structural variants (SV) in the cancer genome. The file must have the 105 | following columns: 106 | 107 | * `chrA`: chromosome name of the left breakpoint 108 | * `posA`: genomic location of the left breakpoint 109 | * `ortA`: strand orientation of the left breakpoint 110 | * `chrB`: chromosome name of the right breakpoint 111 | * `posB`: genomic location of the right breakpoint 112 | * `ortB`: strand orientation of the right breakpoint 113 | 114 | Strand orientations are denoted with a `+` for a sense or coding strand 115 | and `-` for a antisense or non-coding strand. 116 | 117 | If no somatic SVs are in the sample under analysis, a file with no rows (but 118 | including headers) can be provided. 119 | 120 | This file can be generated by CREST. 121 | 122 | * `cna` Somatic CNV. This is a tab-delimited file containing the genomic 123 | regions with somatic-acquired copy number aberrations (CNA) in the cancer 124 | genome. 125 | 126 | * `chr`: chromosome name 127 | * `start`: genomic start location 128 | * `end`: genomic end location 129 | * `logR`: log2 ratio 130 | 131 | If no somatic CNVs are in the sample under analysis, a file with no rows 132 | (but including headers) can be provided. 133 | 134 | This file can be generating by CONSERTING. 135 | 136 | * `disease`: The disease name. 137 | 138 | * `cnv_loh_action`: The behavior when handling markers in CNV/LOH regions. Can 139 | be either `keep` or `drop`. 140 | 141 | * `min_coverage_wgs`: The minimum coverage in WGS to be included in the 142 | analysis. 143 | 144 | * `min_coverage_rna_seq`: The minimum coverage in RNA-seq to be included in 145 | the analysis. 146 | 147 | * `fpkm_threshold_candidate`: The FPKM threshold for the nomination of a 148 | cis-activated candidate. 149 | 150 | * `user_annotation`: Annotations for the candidate SNV/indels in BED format. 151 | 152 | * `chr_string`: Whether the names in the reference sequence dictionary are 153 | prefixed with "chr". 154 | 155 | * `tad_info`: TAD information defining the regulatory territory used in 156 | noncoding variant analysis. 157 | 158 | [cis-X ref-exp]: https://github.com/stjude/cis-x/tree/master/src/ref-exp 159 | [22237106]: https://www.ncbi.nlm.nih.gov/pubmed/22237106 160 | 161 | ## Outputs 162 | 163 | * `*.cisActivated.candidates.txt`: cis-activated candidates in the tumor 164 | genome under analysis. 165 | 166 | * `gene`: gene accession number ([RefSeq] ID) 167 | * `gsym`: gene symbol 168 | * `chrom`: chromosome name 169 | * `strand`: strand orientation 170 | * `start`: genomic start location 171 | * `end`: genomic end location 172 | * `cdsStartStat`: coding sequence (CDS) start status 173 | * `cdsEndStat`: coding sequence (CDS) end status 174 | * `markers`: number of heterozygous markers in this gene 175 | * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE) 176 | * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers 177 | * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers 178 | * `pval_all_markers`: p-value for each marker in the ASE test 179 | * `pval_ase_markers`: p-value for ASE markers in the ASE test 180 | * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygrous markers 181 | * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers 182 | * `comb.pval`: combined p-value for the ASE test 183 | * `mean.delta`: average BAF difference between RNA and DNA for all markers 184 | * `rawp`: raw p-value for the ASE test 185 | * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni) 186 | * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg) 187 | * `FPKM`: FPKM value 188 | * `loo.source`: which reference expression matrix was used in the outlier high expression (OHE) test 189 | * `loo.cohort.size`: number of cases in the reference expression matrix for this gene 190 | * `loo.pval`: p-value of the OHE test 191 | * `loo.rank`: rank of the case under analysis among the reference cases 192 | * `imprinting.status`: imprinting status of the gene 193 | * `candidate.group`: status of the gene, combining both ASE and outlier tests 194 | * `description`: status of the gene in COSMIC database 195 | 196 | Strand orientations are denoted with a `+` for a sense or coding strand 197 | and `-` for a antisense or non-coding strand. 198 | 199 | Coding sequence status is typically one of "none" (not specified), "unk" 200 | (unknown), "incmpl" (incomplete), or "cmpl" (complete). 201 | 202 | * `*.sv.candidates.txt`: Structural variant candidates predicted as the 203 | causal for the cis-activated genes in the regulatory territory. 204 | 205 | * `left.candidate.inTAD`: cis-activated candidate near the left breakpoint 206 | * `right.candidate.inTAD`: cis-activated candidate near the right breakpoint 207 | * `chrA`: chromosome name of the left breakpoint 208 | * `posA`: genomic location of the left breakpoint 209 | * `ortA`: strand orientation of the left breakpoint 210 | * `chrB`: chromosome name of the right breakpoint 211 | * `posB`: genomic location of the right breakpoint 212 | * `ortB`: strand orientation of the right breakpoint 213 | * `type`: type of translocation 214 | 215 | * `*.cna.candidates.txt`: Copy number aberrations predicted as the causal 216 | for the cis-activated genes in the regulatory territory. 217 | 218 | * `candidate.inTAD`: cis-activated candidate by the CNA 219 | * `chr`: chromosome name 220 | * `start`: genomic start position 221 | * `end`: genomic end location 222 | * `logR`: log ratio of the CNA 223 | 224 | * `*.snvindel.candidates.txt`: SNV/indel candidates predicted as functional 225 | and predicted transcription factors. The mutations are also annotated for 226 | known regulatory elements reported by the [NIH Roadmap Epigenomics Project] 227 | by collecting 111 cell lines. 228 | 229 | * `chrom`: chromosome name 230 | * `pos`: genomic start position 231 | * `ref`: reference allele genotype 232 | * `mut`: mutant allele genotype 233 | * `type`: mutation type (either `snv` or `indel`) 234 | * `target`: cis-activated candidate 235 | * `dist`: distance between the mutation and transcription start sites of the target gene 236 | * `tf`: transcription factors predicted to have the binding motif introduced by the mutation 237 | * `EpiRoadmap_enhancer`: enhancer regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project]) 238 | * `EpiRoadmap_promoter`: promoter regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project]) 239 | * `EpiRoadmap_dyadic`: dyadic regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project]) 240 | * `User_Annot`: annotation from the user-provided BED file 241 | 242 | * `*.OHE.results.txt`: Raw results for outlier high expression test. 243 | 244 | * `Gene`: gene symbol 245 | * `fpkm.raw`: FPKM value 246 | * `size.bi`: number of cases in the bi-allelic reference cohort 247 | * `p.bi`: p-value in the outlier test using the bi-allelic reference cohort 248 | * `rank.bi`: rank of the expression level in the case under analysis compared to the bi-allelic reference cohort 249 | * `size.cohort`: number of cases in the entire reference cohort 250 | * `p.cohort`: p-value in the outlier test using the entire reference cohort 251 | * `rank.cohort`: rank of the expression level in the case under analysis compared to the entire reference cohort 252 | * `size.white`: number of cases in the whitelist reference cohort 253 | * `p.white`: p-value in the outlier test using the whitelist reference cohort 254 | * `rank.white`: rank of the expression level in the case under analysis compared to the whitelist reference cohort 255 | * `tscore.white`: t-score representing if the gene showed outlier expresssion using the whitelist reference cohort 256 | * `tscore.perc.white`: percentage of the t-score compared to the null distribution 257 | 258 | * `*.ase.gene.model.fdr.txt`: Raw results for gene level allelic specific 259 | expression test. 260 | 261 | * `gene`: gene accession number ([RefSeq] ID) 262 | * `gsym`: gene symbol 263 | * `chrom`: chromosome name 264 | * `strand`: strand orientation 265 | * `start`: genomic start location 266 | * `end`: genomic end location 267 | * `cdsStartStat`: coding sequence (CDS) start status 268 | * `cdsEndStat`: coding sequence (CDS) end status 269 | * `markers`: number of heterozygous markers in this gene 270 | * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE) 271 | * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers 272 | * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers 273 | * `pval_all_markers`: p-value for each marker in the ASE test 274 | * `pval_ase_markers`: p-value for ASE markers in the ASE test 275 | * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygrous markers 276 | * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers 277 | * `comb.pval`: combined p-value for the ASE test 278 | * `mean.delta`: average BAF difference between RNA and DNA for all markers 279 | * `rawp`: raw p-value for the ASE test 280 | * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni) 281 | * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg) 282 | 283 | Strand orientations are denoted with a `+` for a sense or coding strand 284 | and `-` for a antisense or non-coding strand. 285 | 286 | Coding sequence status is typically one of "none" (not specified), "unk" 287 | (unknown), "incmpl" (incomplete), or "cmpl" (complete). 288 | 289 | * `*.ase.combine.WGS.RNAseq.goodmarkers.binom.txt`: Raw results for single 290 | marker based allelic specific expression test. 291 | 292 | * `chrom`: chromosome name 293 | * `pos`: genomic start position 294 | * `ref`: reference allele genotype 295 | * `mut`: non-reference allele genotype 296 | * `cvg_wgs`: coverage of the marker from the whole genome sequence (WGS) 297 | * `mut_freq_wgs`: non-reference allele fraction in the WGS 298 | * `cvg_rna`: coverage of the marker from the RNA-seq 299 | * `mut_freq_rna`: non-reference allele fraction in the RNA-seq 300 | * `ref.1`: read count of the reference allele in the RNA-seq 301 | * `var`: read count of the non-reference allele in the RNA-seq 302 | * `pvalue`: p-value from the binomial test 303 | * `delta.abs`: absolute difference of the non-reference allele fraction between the WGS and RNA-seq 304 | 305 | [Ensembl]: http://www.ensembl.org/ 306 | [NIH Roadmap Epigenomics Project]: https://egg2.wustl.edu/roadmap/web_portal/index.html 307 | [RefSeq]: https://www.ncbi.nlm.nih.gov/refseq/ 308 | -------------------------------------------------------------------------------- /src/core/README.md: -------------------------------------------------------------------------------- 1 | # cis-X run 2 | 3 | **cis-X run** searches for activating regulatory variants in the tumor genome. 4 | 5 | ## Prerequisites 6 | 7 | * [Perl] ^5.10.1 8 | * [Data::Compare] ~1.25 9 | * [R] ^3.1.0 10 | * [multtest] ~2.36.0 11 | * [Java SE Runtime Environment] ~1.8.0_66 12 | * [MEME Suite] =4.9.0 13 | * [twoBitToFa]\* 14 | * [variants2matrix] (See below.) 15 | 16 | \* UCSC Genome Browser binaries are not versioned. The latest versions 17 | _should_ work. 18 | 19 | [Perl]: https://www.perl.org/ 20 | [Data::Compare]: https://metacpan.org/pod/Data::Compare 21 | [R]: https://www.r-project.org/ 22 | [multtest]: https://www.bioconductor.org/packages/release/bioc/html/multtest.html 23 | [Java SE Runtime Environment]: http://www.oracle.com/technetwork/java/javase/overview/index.html 24 | [MEME Suite]: http://meme-suite.org/ 25 | [twoBitToFa]: https://genome.ucsc.edu/goldenpath/help/twoBit.html 26 | [variants2matrix]: #variants2matrix 27 | 28 | ### variants2matrix 29 | 30 | variants2matrix is a St. Jude tool that is available from [St. Jude Research] 31 | (`variants2matrix.tar.gz`). It is expected to be in `PATH`, along with its 32 | Perl library and Java class paths, e.g., 33 | 34 | ``` 35 | $ V2M_HOME=$CIS_X_HOME/vendor/variants2matrix 36 | $ wget http://ftp.stjude.org/pub/software/cis-x/variants2matrix.tar.gz 37 | $ tar xf variants2matrix.tar.gz --directory $CIS_X_HOME/vendor 38 | $ export PATH=$V2M_HOME/bin:$PATH 39 | $ export PERL5LIB=$V2M_HOME/lib/perl:$PERL5LIB 40 | $ export CLASSPATH=$V2M_HOME/lib/java/bambino-1.0.jar:$V2M_HOME/lib/java/indelxref-1.0.jar:$V2M_HOME/lib/java/picard.jar:$V2M_HOME/lib/java/samplenamelib-1.0.jar:$CLASSPATH 41 | ``` 42 | 43 | ### References 44 | 45 | Reference files are not included with the source due to their large sizes. 46 | 47 | Internal references are placed in `$CIS_X_HOME/refs`. These files can be 48 | downloaded from [St. Jude Research] (`cis-x-refs-*.tar.gz`). It includes a 49 | blacklist of problematic polymorphism markers and two reference expression 50 | matrices for T-ALL and NBL. 51 | 52 | External references are expected to be in `$CIS_X_HOME/refs/external`. These 53 | are not distributed with cis-X, but the `cis-X seed` command can download and 54 | generate them. See [cis-X seed] for more details and a list of required 55 | reference files. 56 | 57 | [cis-X seed]: https://github.com/stjude/cis-x/tree/master/src/seed 58 | 59 | ## Usage 60 | 61 | ``` 62 | cis-X-run 63 | 64 | USAGE: 65 | cis-X run -s -o -l -g -b -e -m -v -c -d -a -w -r -f -u -h -t 66 | 67 | ARGS: 68 | -s Sample ID 69 | -o Output directory 70 | -l Path to single nucleotide markers 71 | -g Path to CNV/LOH regions 72 | -b Path to a RNA-Seq BAM (index must be in same directory) 73 | -e Path to gene expression table 74 | -m Path to somatic SNV/indels 75 | -v Path to somatic SVs 76 | -c Path to somatic CNVs 77 | -d Disease name 78 | -a Action of markers in CNV/LOH regions, either keep or drop (default=keep) 79 | -w Minimal coverage in WGS to include a heterozygous marker (default=10) 80 | -r Minimal coverage in RNA-seq to include a heterozygous marker (default=10) 81 | -f FPKM threshold for nominate cis-activated candidate (default=5) 82 | -u User applied annotation file in BED format (default=NotSpecified) 83 | -h if the RNA-seq BAM with 'chr' in name, TRUE|FALSE (default=TRUE) 84 | -t Path to the TAD annotation file in BED format in hg19 (default=hESC) 85 | ``` 86 | 87 | ## Inputs 88 | 89 | Running cis-X requires quite a few inputs. 90 | 91 | * `sample-id`: The sample ID. This is primarily used as the prefix for the 92 | filenames of the results. 93 | 94 | * `results-dir`: The output directory. See "[Outputs](#outputs)" for the 95 | resulting files. 96 | 97 | * `markers`: A list of single nucleotide markers. This is a tab-delimited 98 | file with the following columns: 99 | 100 | * `Chr`: chromosome name for the marker 101 | * `Pos`: genomic start location for the marker 102 | * `Chr_Allele`: reference allele 103 | * `Alternative_Allele`: alternative allele 104 | * `reference_tumor_count`: reference allele count in the tumor genome 105 | * `alternative_tumor_count`: alternative allele count in the tumor genome 106 | * `reference_normal_count`: reference allele count in the matched normal genome 107 | * `alternative_normal_count`: alternative count in the matched normal genome 108 | 109 | This file can be generated with Bambino. 110 | 111 | * `cnv-loh`: CNV/LOH regions. It contains all the genomic regions carrying 112 | copy number variations (CNV) or loss of heterozygosity (LOH), which will be 113 | filtered out during analysis. 114 | 115 | This is a tab-delimited file in the bed format. It must have at least the 116 | following three columns: 117 | 118 | * `chrom`: chromosome name 119 | * `loc.start`: genomic start location 120 | * `loc.end`: genomic end location 121 | 122 | If no CNV/LOH are in the genome under analysis, a file with no rows (but 123 | including headers) can be provided. 124 | 125 | This file can be generated with CONSERTING. 126 | 127 | * `bam`: The RNA-Seq BAM file aligned to hg19 (GRCh37). The index file is 128 | expected to be in the same directory with the same name and extension 129 | `.bai`, e.g, `/path/to/SJ001_D1.bam` and `/path/to/SJ001_D1.bam.bai`. 130 | 131 | StrongArm or STAR can be used for RNA-Seq alignment. 132 | 133 | * `fpkm-matrix`: A gene expression table. This is a tab-delimited file 134 | containing gene level expressions for the tumor under analysis. The 135 | expressions are in FPKM (fragments per kilobase of transcript per million 136 | mapped reads). 137 | 138 | * `GeneID`: gene [Ensembl] ID 139 | * `GeneName`: gene symbol 140 | * `Type`: [transcript type](https://www.gencodegenes.org/gencode_biotypes.html) 141 | * `Status`: transcript status (must be `KNOWN`, `NOVEL`, or `PUTATIVE`) 142 | * `Chr`: chromosome name 143 | * `Start` genomic start location 144 | * `End`: genomic end location 145 | * [SampleID...]: FPKM for the given sample 146 | 147 | This file can can be generated with the output of HTseq-count 148 | preprocessed through `src/other/mergeData_geneName.pl`. The data must be 149 | able to match values in the given gene specific reference expression 150 | matrices (see [cis-X ref-exp]) generated from a larger cohort. 151 | 152 | * `snv-indel`: Somatic SNV/indels. This is a tab-delimited file containing 153 | somatic sequence mutations present in the genome under analysis. It includes 154 | both single nucleotide variants (SNV) and small insertion/deletions (indel). 155 | The file must have the following columns: 156 | 157 | * `chr`: chromosome name 158 | * `pos`: genomic start location 159 | * `ref`: reference allele genotype 160 | * `mutant`: mutant allele genotype 161 | * `type`: mutation type (either `snv` or `indel`) 162 | 163 | Note that the coordinate used for an indel is after the inserted sequence. 164 | 165 | If no SNV/indels are in the sample under analysis, a file with no rows 166 | (but including headers) can be provided. 167 | 168 | This file can can be created with Bambino and then preprocessed using the 169 | steps taken in "[The genetic basis of early T-cell precursor acute lymphoblastic leukaemia][22237106]". 170 | 171 | * `sv` Somatic SVs. This is a tab-delimited file containing somatic-acquired 172 | structural variants (SV) in the cancer genome. The file must have the 173 | following columns: 174 | 175 | * `chrA`: chromosome name of the left breakpoint 176 | * `posA`: genomic location of the left breakpoint 177 | * `ortA`: strand orientation of the left breakpoint 178 | * `chrB`: chromosome name of the right breakpoint 179 | * `posB`: genomic location of the right breakpoint 180 | * `ortB`: strand orientation of the right breakpoint 181 | 182 | Strand orientations are denoted with a `+` for a sense or coding strand 183 | and `-` for a antisense or non-coding strand. 184 | 185 | If no somatic SVs are in the sample under analysis, a file with no rows (but 186 | including headers) can be provided. 187 | 188 | This file can be generated by CREST. 189 | 190 | * `cna` Somatic CNV. This is a tab-delimited file containing the genomic 191 | regions with somatic-acquired copy number aberrations (CNA) in the cancer 192 | genome. 193 | 194 | * `chr`: chromosome name 195 | * `start`: genomic start location 196 | * `end`: genomic end location 197 | * `logR`: log2 ratio 198 | 199 | If no somatic CNVs are in the sample under analysis, a file with no rows 200 | (but including headers) can be provided. 201 | 202 | This file can be generating by CONSERTING. 203 | 204 | * `disease`: The disease name. 205 | 206 | * `cnv_loh_action`: The behavior when handling markers in CNV/LOH regions. Can 207 | be either `keep` or `drop`. 208 | 209 | * `min_coverage_wgs`: The minimum coverage in WGS to be included in the 210 | analysis. 211 | 212 | * `min_coverage_rna_seq`: The minimum coverage in RNA-seq to be included in 213 | the analysis. 214 | 215 | * `fpkm_threshold_candidate`: The FPKM threshold for the nomination of a 216 | cis-activated candidate. 217 | 218 | * `user-annotation`: Annotations for the candidate SNV/indels in BED format. 219 | 220 | * `chr-string`: Whether the names in the reference sequence dictionary are 221 | prefixed with "chr". 222 | 223 | * `tad-info`: TAD information defining the regulatory territory used in 224 | noncoding variant analysis. 225 | 226 | [cis-X ref-exp]: https://github.com/stjude/cis-x/tree/master/src/ref-exp 227 | [22237106]: https://www.ncbi.nlm.nih.gov/pubmed/22237106 228 | 229 | ## Outputs 230 | 231 | Results are saved as tab-delimited files to `$RESULTS_DIR`. 232 | 233 | * `*.cisActivated.candidates.txt`: cis-activated candidates in the tumor 234 | genome under analysis. 235 | 236 | * `gene`: gene accession number ([RefSeq] ID) 237 | * `gsym`: gene symbol 238 | * `chrom`: chromosome name 239 | * `strand`: strand orientation 240 | * `start`: genomic start location 241 | * `end`: genomic end location 242 | * `cdsStartStat`: coding sequence (CDS) start status 243 | * `cdsEndStat`: coding sequence (CDS) end status 244 | * `markers`: number of heterozygous markers in this gene 245 | * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE) 246 | * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers 247 | * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers 248 | * `pval_all_markers`: p-value for each marker in the ASE test 249 | * `pval_ase_markers`: p-value for ASE markers in the ASE test 250 | * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygrous markers 251 | * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers 252 | * `comb.pval`: combined p-value for the ASE test 253 | * `mean.delta`: average BAF difference between RNA and DNA for all markers 254 | * `rawp`: raw p-value for the ASE test 255 | * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni) 256 | * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg) 257 | * `FPKM`: FPKM value 258 | * `loo.source`: which reference expression matrix was used in the outlier high expression (OHE) test 259 | * `loo.cohort.size`: number of cases in the reference expression matrix for this gene 260 | * `loo.pval`: p-value of the OHE test 261 | * `loo.rank`: rank of the case under analysis among the reference cases 262 | * `imprinting.status`: imprinting status of the gene 263 | * `candidate.group`: status of the gene, combining both ASE and outlier tests 264 | * `description`: status of the gene in COSMIC database 265 | 266 | Strand orientations are denoted with a `+` for a sense or coding strand 267 | and `-` for a antisense or non-coding strand. 268 | 269 | Coding sequence status is typically one of "none" (not specified), "unk" 270 | (unknown), "incmpl" (incomplete), or "cmpl" (complete). 271 | 272 | * `*.sv.candidates.txt`: Structural variant candidates predicted as the 273 | causal for the cis-activated genes in the regulatory territory. 274 | 275 | * `left.candidate.inTAD`: cis-activated candidate near the left breakpoint 276 | * `right.candidate.inTAD`: cis-activated candidate near the right breakpoint 277 | * `chrA`: chromosome name of the left breakpoint 278 | * `posA`: genomic location of the left breakpoint 279 | * `ortA`: strand orientation of the left breakpoint 280 | * `chrB`: chromosome name of the right breakpoint 281 | * `posB`: genomic location of the right breakpoint 282 | * `ortB`: strand orientation of the right breakpoint 283 | * `type`: type of translocation 284 | 285 | * `*.cna.candidates.txt`: Copy number aberrations predicted as the causal 286 | for the cis-activated genes in the regulatory territory. 287 | 288 | * `candidate.inTAD`: cis-activated candidate by the CNA 289 | * `chr`: chromosome name 290 | * `start`: genomic start position 291 | * `end`: genomic end location 292 | * `logR`: log ratio of the CNA 293 | 294 | * `*.snvindel.candidates.txt`: SNV/indel candidates predicted as functional 295 | and predicted transcription factors. The mutations are also annotated for 296 | known regulatory elements reported by the [NIH Roadmap Epigenomics Project] 297 | by collecting 111 cell lines. 298 | 299 | * `chrom`: chromosome name 300 | * `pos`: genomic start position 301 | * `ref`: reference allele genotype 302 | * `mut`: mutant allele genotype 303 | * `type`: mutation type (either `snv` or `indel`) 304 | * `target`: cis-activated candidate 305 | * `dist`: distance between the mutation and transcription start sites of the target gene 306 | * `tf`: transcription factors predicted to have the binding motif introduced by the mutation 307 | * `EpiRoadmap_enhancer`: enhancer regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project]) 308 | * `EpiRoadmap_promoter`: promoter regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project]) 309 | * `EpiRoadmap_dyadic`: dyadic regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project]) 310 | * `User_Annot`: annotation from the user-provided BED file 311 | 312 | * `*.OHE.results.txt`: Raw results for outlier high expression test. 313 | 314 | * `Gene`: gene symbol 315 | * `fpkm.raw`: FPKM value 316 | * `size.bi`: number of cases in the bi-allelic reference cohort 317 | * `p.bi`: p-value in the outlier test using the bi-allelic reference cohort 318 | * `rank.bi`: rank of the expression level in the case under analysis compared to the bi-allelic reference cohort 319 | * `size.cohort`: number of cases in the entire reference cohort 320 | * `p.cohort`: p-value in the outlier test using the entire reference cohort 321 | * `rank.cohort`: rank of the expression level in the case under analysis compared to the entire reference cohort 322 | * `size.white`: number of cases in the whitelist reference cohort 323 | * `p.white`: p-value in the outlier test using the whitelist reference cohort 324 | * `rank.white`: rank of the expression level in the case under analysis compared to the whitelist reference cohort 325 | * `tscore.white`: t-score representing if the gene showed outlier expresssion using the whitelist reference cohort 326 | * `tscore.perc.white`: percentage of the t-score compared to the null distribution 327 | 328 | * `*.ase.gene.model.fdr.txt`: Raw results for gene level allelic specific 329 | expression test. 330 | 331 | * `gene`: gene accession number ([RefSeq] ID) 332 | * `gsym`: gene symbol 333 | * `chrom`: chromosome name 334 | * `strand`: strand orientation 335 | * `start`: genomic start location 336 | * `end`: genomic end location 337 | * `cdsStartStat`: coding sequence (CDS) start status 338 | * `cdsEndStat`: coding sequence (CDS) end status 339 | * `markers`: number of heterozygous markers in this gene 340 | * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE) 341 | * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers 342 | * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers 343 | * `pval_all_markers`: p-value for each marker in the ASE test 344 | * `pval_ase_markers`: p-value for ASE markers in the ASE test 345 | * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygrous markers 346 | * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers 347 | * `comb.pval`: combined p-value for the ASE test 348 | * `mean.delta`: average BAF difference between RNA and DNA for all markers 349 | * `rawp`: raw p-value for the ASE test 350 | * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni) 351 | * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg) 352 | 353 | Strand orientations are denoted with a `+` for a sense or coding strand 354 | and `-` for a antisense or non-coding strand. 355 | 356 | Coding sequence status is typically one of "none" (not specified), "unk" 357 | (unknown), "incmpl" (incomplete), or "cmpl" (complete). 358 | 359 | * `*.ase.combine.WGS.RNAseq.goodmarkers.binom.txt`: Raw results for single 360 | marker based allelic specific expression test. 361 | 362 | * `chrom`: chromosome name 363 | * `pos`: genomic start position 364 | * `ref`: reference allele genotype 365 | * `mut`: non-reference allele genotype 366 | * `cvg_wgs`: coverage of the marker from the whole genome sequence (WGS) 367 | * `mut_freq_wgs`: non-reference allele fraction in the WGS 368 | * `cvg_rna`: coverage of the marker from the RNA-seq 369 | * `mut_freq_rna`: non-reference allele fraction in the RNA-seq 370 | * `ref.1`: read count of the reference allele in the RNA-seq 371 | * `var`: read count of the non-reference allele in the RNA-seq 372 | * `pvalue`: p-value from the binomial test 373 | * `delta.abs`: absolute difference of the non-reference allele fraction between the WGS and RNA-seq 374 | 375 | [Ensembl]: http://www.ensembl.org/ 376 | [NIH Roadmap Epigenomics Project]: https://egg2.wustl.edu/roadmap/web_portal/index.html 377 | [RefSeq]: https://www.ncbi.nlm.nih.gov/refseq/ 378 | [St. Jude Research]: https://www.stjuderesearch.org/site/lab/zhang/cis-x 379 | --------------------------------------------------------------------------------