├── dnanexus
    └── cis-x
    │   ├── resources
    │       └── .gitkeep
    │   ├── README.developer.md
    │   ├── src
    │       └── cis-x.sh
    │   ├── dxapp.json
    │   └── README.md
├── .dockerignore
├── .gitignore
├── src
    ├── seed
    │   ├── bin
    │   │   ├── hg19_ref_gene_to_bed
    │   │   ├── scrape_geneimprint
    │   │   ├── merge_roadmap
    │   │   └── cis-X-seed
    │   └── README.md
    ├── other
    │   ├── hg19_refGene2bed.pl
    │   ├── realpath.c
    │   ├── meme_glam2_fix_new_gcc.patch
    │   └── mergeData_geneName.pl
    ├── ref-exp
    │   ├── bin
    │   │   ├── cis-X-ref-exp-generate
    │   │   ├── cis-X-ref-exp-prepare
    │   │   ├── cis-X-ref-exp
    │   │   └── cis-X-ref-exp-preprocess
    │   ├── src
    │   │   ├── format.precal.pl
    │   │   ├── cis-X.refexp.step1.pl
    │   │   ├── cis-X.refexp.step2.pl
    │   │   ├── refexp.gen.pl
    │   │   ├── precal.R
    │   │   ├── cleanup.bi.cases.R
    │   │   ├── filter.cohort.v2.pl
    │   │   └── collect.cohort.pl
    │   └── README.md
    └── core
    │   ├── bin
    │       ├── cis-X-mark
    │       ├── cis-X-test-outliers
    │       ├── cis-X-nominate
    │       ├── cis-X-build-matrix
    │       ├── cis-X-ase
    │       ├── cis-X-screen
    │       └── cis-X-run
    │   ├── src
    │       ├── sepCHR.pl
    │       ├── proc_ase_runs.pl
    │       ├── mergeVariantOut.pl
    │       ├── 05.merge.pl
    │       ├── binom.R
    │       ├── fdr.R
    │       ├── 02.add.count.pl
    │       ├── merge.fa.pl
    │       ├── 01.get.markder.pl
    │       ├── 07.gene.model.Oct2017.pl
    │       ├── exp.check.R
    │       ├── scan.sv.pl
    │       ├── ase.candidate.byrun.pl
    │       ├── check.TAD.cnv.pl
    │       ├── scan.cnv.pl
    │       ├── check.TAD.pl
    │       ├── ase.candidate.pl
    │       ├── snvindel.prep.pl
    │       ├── snvindel.process.pl
    │       └── ase_runs.pl
    │   └── README.md
├── bin
    └── cis-X
├── RELEASE.md
├── CHANGELOG.md
├── Dockerfile
├── README.md
└── LICENSE


/dnanexus/cis-x/resources/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | dnanexus
2 | refs/external
3 | tmp
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dnanexus/cis-x/resources/tmp
2 | refs
3 | tmp
4 | vendor
5 | 


--------------------------------------------------------------------------------
/src/seed/bin/hg19_ref_gene_to_bed:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | # frozen_string_literal: true
 3 | 
 4 | input = ARGV[0] or raise "missing input"
 5 | 
 6 | File.open(input) do |f|
 7 |   # skip header
 8 |   f.readline
 9 | 
10 |   f.each_line do |line|
11 |     r = line.split("\t")
12 |     puts "#{r[2]}\t#{r[4]}\t#{r[5]}\t#{r[12]}\t#{r[1]}\t#{r[3]}"
13 |   end
14 | end
15 | 


--------------------------------------------------------------------------------
/dnanexus/cis-x/README.developer.md:
--------------------------------------------------------------------------------
 1 | # St. Jude cis-X (dev)
 2 | 
 3 | The main script runs a container with a pre-built cis-X image. The DNAnexus
 4 | applet only executes the `run` command.
 5 | 
 6 | ## Build
 7 | 
 8 | ```
 9 | $ docker build --tag cis-x ../..
10 | $ mkdir -p resources/tmp
11 | $ docker save cis-x | gzip > resources/tmp/cis-x-latest.tar.gz
12 | $ dx build
13 | ```
14 | 


--------------------------------------------------------------------------------
/src/other/hg19_refGene2bed.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Converts the tab-separated table "hg19_refGene" in the current directory
 4 | # into "hg19_refGene.bed": for each line after the header, columns 2, 4, 5,
 5 | # 12, 1 and 3 (zero-based) are written tab-separated.
 6 | 
 7 | use strict;
 8 | 
 9 | my $infile = "hg19_refGene";
10 | my $outfile = "hg19_refGene.bed";
11 | 
12 | # Three-argument open with lexical handles: the mode is never parsed out of
13 | # the file name and the handles cannot clash with other package globals.
14 | open(my $in, '<', $infile) or die "$infile: $!";
15 | open(my $out, '>', $outfile) or die "$outfile: $!";
16 | while (my $line = <$in>) {
17 |     chomp $line;
18 |     next if $. == 1;    # header line
19 |     my @F = split /\t/, $line;
20 |     print $out "$F[2]\t$F[4]\t$F[5]\t$F[12]\t$F[1]\t$F[3]\n";
21 | }
22 | close $in;
23 | # Buffered write errors (e.g. a full disk) only surface at close.
24 | close $out or die "$outfile: $!";
25 | 
26 | 


--------------------------------------------------------------------------------
/src/seed/bin/scrape_geneimprint:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | # frozen_string_literal: true
 3 | 
 4 | require "open-uri"
 5 | 
 6 | require "nokogiri"
 7 | 
 8 | url = ARGV[0] or raise "missing url"
 9 | 
10 | document = Nokogiri::HTML(open(url))
11 | rows = document.css("table tr")
12 | 
13 | raise "missing data table" if rows.empty?
14 | 
15 | rows.each do |row|
16 |   puts row.css("th, td").map(&:text).join("\t")
17 | end
18 | 


--------------------------------------------------------------------------------
/src/other/realpath.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | 
 4 | /*
 5 |  * Minimal realpath(1) replacement: prints the canonical absolute path of
 6 |  * each operand, one per line.
 7 |  *
 8 |  * Returns EXIT_FAILURE when no operand is given or when any operand cannot
 9 |  * be resolved; EXIT_SUCCESS otherwise.
10 |  */
11 | int main(int argc, char **argv) {
12 |     int status = EXIT_SUCCESS;
13 | 
14 |     if (argc < 2) {
15 |         fprintf(stderr, "%s: missing operand\n", argv[0]);
16 |         return EXIT_FAILURE;
17 |     }
18 | 
19 |     for (int i = 1; i < argc; i++) {
20 |         /* With a NULL buffer realpath() allocates the result. It returns
21 |          * NULL and sets errno on failure; NULL must never reach "%s". */
22 |         char *rp = realpath(argv[i], NULL);
23 |         if (rp == NULL) {
24 |             perror(argv[i]);
25 |             status = EXIT_FAILURE;
26 |             continue;
27 |         }
28 |         printf("%s\n", rp);
29 |         free(rp);
30 |     }
31 | 
32 |     return status;
33 | }
34 | 


--------------------------------------------------------------------------------
/src/ref-exp/bin/cis-X-ref-exp-generate:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # cis-X ref-exp generate: forwards <config> <results-dir> <exp-matrix> to
 4 | # src/cis-X.refexp.step2.pl.
 5 | 
 6 | # Expansions are quoted so this wrapper does not word-split or glob paths
 7 | # that contain spaces or wildcard characters.
 8 | CIS_X_REF_EXP_HOME="$(realpath "$(dirname "$0")/..")"
 9 | 
10 | CONFIG=$1
11 | RESULTS_DIR=$2
12 | EXP_MATRIX=$3
13 | 
14 | if [ $# -lt 3 ]; then
15 |     basename "$0"
16 |     echo
17 |     echo "USAGE:"
18 |     echo "    cis-X ref-exp generate <config> <results-dir> <exp-matrix>"
19 |     exit 1
20 | fi
21 | 
22 | perl "$CIS_X_REF_EXP_HOME/src/cis-X.refexp.step2.pl" "$CONFIG" "$RESULTS_DIR" "$EXP_MATRIX"
23 | 


--------------------------------------------------------------------------------
/src/ref-exp/bin/cis-X-ref-exp-prepare:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # cis-X ref-exp prepare: forwards <config> <results-dir> <chr-string> to
 4 | # src/cis-X.refexp.step1.pl.
 5 | 
 6 | # Expansions are quoted so this wrapper does not word-split or glob paths
 7 | # that contain spaces or wildcard characters.
 8 | CIS_X_REF_EXP_HOME="$(realpath "$(dirname "$0")/..")"
 9 | 
10 | CONFIG=$1
11 | RESULTS_DIR=$2
12 | CHR_STRING=$3
13 | 
14 | if [ $# -lt 3 ]; then
15 |     basename "$0"
16 |     echo
17 |     echo "USAGE:"
18 |     echo "    cis-X ref-exp prepare <config> <results-dir> <chr-string>"
19 |     exit 1
20 | fi
21 | 
22 | # Only three arguments are forwarded: COVG_WGS was never assigned in this
23 | # script, so the former fourth argument came from the caller's environment
24 | # or expanded to nothing.
25 | perl "$CIS_X_REF_EXP_HOME/src/cis-X.refexp.step1.pl" "$CONFIG" "$RESULTS_DIR" "$CHR_STRING"
26 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-mark:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Wrapper around src/01.get.markder.pl.
 4 | #
 5 | # The six positional arguments are stored in SAMPLE_ID, HIGH20, CNVLOH,
 6 | # SNV4_OUT, HET_OUT and COVG_WGS and forwarded to the Perl script in that
 7 | # order, with the BADLST reference path inserted before COVG_WGS.
 8 | 
 9 | # Repository root (three levels above this file) and the core module root.
10 | CIS_X_HOME=$(realpath $(dirname $0)/../../..)
11 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..)
12 | 
13 | SAMPLE_ID=$1
14 | HIGH20=$2
15 | CNVLOH=$3
16 | SNV4_OUT=$4
17 | HET_OUT=$5
18 | COVG_WGS=$6
19 | 
20 | # Reference file shipped under <repository root>/refs.
21 | BADLST=$CIS_X_HOME/refs/SuperBad.good.bad.new
22 | 
23 | perl $CIS_X_CORE_HOME/src/01.get.markder.pl \
24 |     $SAMPLE_ID \
25 |     $HIGH20 \
26 |     $CNVLOH \
27 |     $SNV4_OUT \
28 |     $HET_OUT \
29 |     $BADLST \
30 |     $COVG_WGS
31 | 


--------------------------------------------------------------------------------
/src/other/meme_glam2_fix_new_gcc.patch:
--------------------------------------------------------------------------------
 1 | diff --git a/src/glam2_glam2.c b/src/glam2_glam2.c
 2 | index 60a9a3f..ef9c3dc 100644
 3 | --- a/src/glam2_glam2.c
 4 | +++ b/src/glam2_glam2.c
 5 | @@ -145,7 +145,7 @@ int aln_cmp(const void *a, const void *b) {
 6 |    return x < y ? +1 : x > y ? -1 : 0;
 7 |  }
 8 |  
 9 | -inline void report_external_failure(const char *prog, int status) {
10 | +void report_external_failure(const char *prog, int status) {
11 |    if (status == 0) {
12 |      // success
13 |    } if (status == -1) {
14 | 


--------------------------------------------------------------------------------
/src/core/src/sepCHR.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Copies to the output file every input line whose first dot-separated field
 4 | # equals the requested chromosome. Unless the fourth argument is "TRUE", a
 5 | # leading "chr" is removed from the copied line.
 6 | #
 7 | # Arguments: input path, chromosome name, output path, chr-string flag.
 8 | 
 9 | my ($input, $chrom, $output, $chr_string) = @ARGV[0 .. 3];
10 | 
11 | open IN, "< $input" or die "$input: $!";
12 | open OUT, "> $output" or die "$output: $!";
13 | while (my $record = <IN>) {
14 |     chomp $record;
15 |     my ($record_chrom) = split(/\./, $record);
16 |     my $emitted = $record;
17 |     $emitted =~ s/^chr// unless $chr_string eq "TRUE";
18 |     print OUT "$emitted\n" if $record_chrom eq $chrom;
19 | }
20 | close IN;
21 | close OUT;
22 | 
23 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/format.precal.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Flattens <workdir>/raw.tvalue.bicohort.txt into
 4 | # <workdir>/refexp/precal.tvalue.bin_gt1.txt: for every line after the
 5 | # header, each comma-separated value of column 1 is written on its own line,
 6 | # except values equal to "NaN".
 7 | 
 8 | my ($workdir) = @ARGV;
 9 | 
10 | my $infile = "$workdir/raw.tvalue.bicohort.txt";
11 | my $outfile = "$workdir/refexp/precal.tvalue.bin_gt1.txt";
12 | open OUT, "> $outfile" or die "$outfile: $!";
13 | open IN, "< $infile" or die "$infile: $!";
14 | while (my $row = <IN>) {
15 |     chomp $row;
16 |     next if $. == 1;
17 |     my $tvalues = (split /\t/, $row)[1];
18 |     print OUT "$_\n" for grep { $_ ne "NaN" } split(/,/, $tvalues);
19 | }
20 | close IN;
21 | close OUT;
22 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/cis-X.refexp.step1.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Writes <working dir>/cis-X.refexp.step1.commands.sh with one
 4 | # "cis-X ref-exp preprocess" command per line of the config file after its
 5 | # header. Each command is built from the tab-separated config columns as:
 6 | # column 0, working dir, column 1, column 2, column 3, chr-string.
 7 | 
 8 | my $config  = $ARGV[0];
 9 | my $workdir = $ARGV[1];
10 | my $chr_string = $ARGV[2];
11 | 
12 | # All three arguments are required (and must be true values).
13 | unless ($config and $workdir and $chr_string) {
14 |     die("Usage: cis-X.refexp.step1.pl [config file] [working dir] [chr-string]");
15 | }
16 | 
17 | my $outfile = "$workdir/cis-X.refexp.step1.commands.sh";
18 | open OUT, "> $outfile" or die "$outfile: $!";
19 | open IN, "< $config" or die "$config: $!";
20 | while(<IN>) {
21 |     chomp;
22 |     # Line 1 of the config file is a header.
23 |     next if $. == 1;
24 |     my @F = split/\t/;
25 |     print OUT "cis-X ref-exp preprocess $F[0] $workdir $F[1] $F[2] $F[3] $chr_string\n";
26 | }
27 | close IN;
28 | close OUT;
29 | 
30 | 


--------------------------------------------------------------------------------
/bin/cis-X:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Top-level cis-X dispatcher: runs the ref-exp, run or seed entry point with
 4 | # the remaining arguments, or prints usage and exits 1 for anything else.
 5 | 
 6 | # Expansions are quoted so an installation path containing spaces still
 7 | # resolves to a single word.
 8 | CIS_X_HOME="$(realpath "$(dirname "$0")/..")"
 9 | 
10 | COMMAND=$1
11 | 
12 | usage() {
13 |     basename "$0"
14 |     echo
15 |     echo "USAGE:"
16 |     echo "    cis-X <SUBCOMMAND> [args...]"
17 |     echo ""
18 |     echo "SUBCOMMANDS:"
19 |     echo "    ref-exp  Generate reference expression matrices"
20 |     echo "    run      Search for activating regulatory variants in the tumor genome"
21 |     echo "    seed     Download and generate a set of common references"
22 |     exit 1
23 | }
24 | 
25 | # "${@:2}" forwards every argument after the subcommand, each kept intact.
26 | case "$COMMAND" in
27 |     ref-exp) "$CIS_X_HOME/src/ref-exp/bin/cis-X-ref-exp" "${@:2}" ;;
28 |     run) "$CIS_X_HOME/src/core/bin/cis-X-run" "${@:2}" ;;
29 |     seed) "$CIS_X_HOME/src/seed/bin/cis-X-seed" "${@:2}" ;;
30 |     *) usage ;;
31 | esac
32 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/cis-X.refexp.step2.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Step 2 of reference-expression generation. Runs, in order, the sibling
 4 | # scripts collect.cohort.pl, filter.cohort.v2.pl, cleanup.bi.cases.R,
 5 | # refexp.gen.pl, precal.R and format.precal.pl from this file's directory.
 6 | #
 7 | # Arguments: [config file] [working dir] [exp matrix]
 8 | 
 9 | use strict;
10 | 
11 | use Cwd qw(abs_path);
12 | use File::Basename qw(dirname);
13 | 
14 | my $config  = $ARGV[0];
15 | my $workdir = $ARGV[1];
16 | my $expfile = $ARGV[2];
17 | 
18 | unless ($config and $workdir and $expfile) {
19 |     die("Usage: cis-X.refexp.step2.pl [config file] [working dir] [exp matrix]");
20 | }
21 | 
22 | # Directory holding the sibling scripts, resolved without spawning a shell.
23 | my $codedir = dirname(abs_path($0));
24 | print "$codedir\n";
25 | 
26 | # Runs one pipeline step and aborts when it fails, so a later step never
27 | # consumes missing or partial output. The list form of system() bypasses the
28 | # shell, so arguments with spaces or metacharacters are passed intact.
29 | sub run_step {
30 |     my @cmd = @_;
31 |     system(@cmd) == 0
32 |         or die "cis-X.refexp.step2.pl: step failed (wait status $?): @cmd\n";
33 | }
34 | 
35 | run_step("perl", "-w", "$codedir/collect.cohort.pl", $config, $workdir, $expfile);
36 | run_step("perl", "-w", "$codedir/filter.cohort.v2.pl", $codedir, $workdir);
37 | run_step("Rscript", "$codedir/cleanup.bi.cases.R", $workdir);
38 | run_step("perl", "-w", "$codedir/refexp.gen.pl", $workdir, $expfile);
39 | run_step("Rscript", "$codedir/precal.R", $workdir);
40 | run_step("perl", "-w", "$codedir/format.precal.pl", $workdir);
41 | 


--------------------------------------------------------------------------------
/src/ref-exp/bin/cis-X-ref-exp:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # "cis-X ref-exp" dispatcher: runs the generate, prepare or preprocess
 4 | # script from this directory with the remaining arguments, or prints usage
 5 | # and exits 1 for anything else.
 6 | 
 7 | # Expansions are quoted so an installation path containing spaces still
 8 | # resolves to a single word.
 9 | CIS_X_REF_EXP_HOME="$(realpath "$(dirname "$0")/..")"
10 | 
11 | COMMAND=$1
12 | 
13 | usage() {
14 |     basename "$0"
15 |     echo
16 |     echo "USAGE:"
17 |     echo "    cis-X ref-exp <SUBCOMMAND> [args...]"
18 |     echo
19 |     echo "SUBCOMMANDS:"
20 |     echo "    generate    Generate a biallelic reference expression matrix"
21 |     echo "    prepare     Create a batch script for preprocessing inputs"
22 |     echo "    preprocess  Runs allelic specific expression (ASE) tests on inputs"
23 |     exit 1
24 | }
25 | 
26 | # "${@:2}" forwards every argument after the subcommand, each kept intact.
27 | case "$COMMAND" in
28 |     generate) "$CIS_X_REF_EXP_HOME/bin/cis-X-ref-exp-generate" "${@:2}" ;;
29 |     prepare) "$CIS_X_REF_EXP_HOME/bin/cis-X-ref-exp-prepare" "${@:2}" ;;
30 |     preprocess) "$CIS_X_REF_EXP_HOME/bin/cis-X-ref-exp-preprocess" "${@:2}" ;;
31 |     *) usage ;;
32 | esac
33 | 


--------------------------------------------------------------------------------
/src/core/src/proc_ase_runs.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Appends a "Genes_overlap_hc" column to the ASE runs table.
 4 | #
 5 | # Arguments:
 6 | #   0  ASE runs table (tab-separated, header on line 1, run key in column 0)
 7 | #   1  run/gene table (tab-separated, every line read): run key in column 3,
 8 | #      gene in column 7; rows whose column 10 equals 0 are ignored
 9 | #      (presumably an overlap size -- confirm against the producer)
10 | #   2  output path
11 | 
12 | my $ase_runs = $ARGV[0];
13 | my $ase_runs_gene = $ARGV[1];
14 | my $outfile = $ARGV[2];
15 | 
16 | # run key -> set of genes, stored as hash keys to deduplicate.
17 | my (%run2g);
18 | 
19 | open IN, "< $ase_runs_gene" or die "$ase_runs_gene: $!";
20 | while(<IN>) {
21 |     chomp;
22 |     my @F = split/\t/;
23 |     next if $F[10] == 0;
24 |     $run2g{$F[3]}{$F[7]} = 1;
25 | }
26 | close IN;
27 | 
28 | open IN, "< $ase_runs" or die "$ase_runs: $!";
29 | open OUT, "> $outfile" or die "$outfile: $!";
30 | while(<IN>) {
31 |     chomp;
32 |     if ($. == 1) {
33 |         print OUT "$_\tGenes_overlap_hc\n";
34 |         next;
35 |     }
36 |     my @F = split/\t/;
37 |     my $gene = "";
38 |     if ($run2g{$F[0]}) {
39 |         # Perl hash order varies between runs; sort so that identical input
40 |         # always produces identical output.
41 |         $gene = join(',', sort keys %{$run2g{$F[0]}});
42 |     }
43 |     print OUT "$_\t$gene\n";
44 | }
45 | close IN;
46 | close OUT;
47 | 


--------------------------------------------------------------------------------
/src/core/src/mergeVariantOut.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Concatenates <workdir>/matrix_chr<N>_simple.tab for N = 1..22 into a single
 4 | # table with one header line.
 5 | #
 6 | # Arguments: working dir, output path, chr-string flag. When the flag is
 7 | # "TRUE" data lines are copied unchanged; otherwise only the first four
 8 | # columns are written and "chr" is prepended to column 1.
 9 | 
10 | my $workdir = $ARGV[0];
11 | my $outfile = $ARGV[1];
12 | my $chr_string = $ARGV[2];
13 | 
14 | # The header comes from the first file that exists. Tying it to chromosome 1
15 | # would leave the output without a header whenever that file is missing.
16 | my $header_written = 0;
17 | 
18 | open OUT, "> $outfile" or die "$outfile: $!";
19 | for my $i (1 .. 22) {
20 |     my $infile = "$workdir/matrix_chr" . $i . "_simple.tab";
21 |     if (! -e $infile) {
22 |         print "$infile not exist.\n";
23 |         next;
24 |     }
25 |     open IN, "< $infile" or die "$infile: $!";
26 |     while(<IN>) {
27 |         chomp;
28 |         # $. restarts at 1 for each file because IN is closed explicitly.
29 |         if ($. == 1) {
30 |             unless ($header_written) {
31 |                 print OUT "$_\n";
32 |                 $header_written = 1;
33 |             }
34 |             next;
35 |         }
36 |         if ($chr_string eq "TRUE") {
37 |             print OUT "$_\n";
38 |         }else {
39 |             my @F = split/\t/;
40 |             my $snv4 = "chr" . $F[1];
41 |             print OUT "$F[0]\t$snv4\t$F[2]\t$F[3]\n";
42 |         }
43 |     }
44 |     close IN;
45 | }
46 | close OUT;
47 | 
48 | 


--------------------------------------------------------------------------------
/src/core/src/05.merge.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | ### Only the longest transcript was used.
 3 | 
 4 | # Reads the tab-separated table named by the first argument and writes to the
 5 | # second: the header, then one row per distinct column-1 value (sorted by
 6 | # that value), namely the row with the largest (column 5 - column 4). The
 7 | # earliest row wins ties. Rows whose column 8 is missing, empty or 0 are
 8 | # ignored.
 9 | 
10 | my ($infile, $outfile) = @ARGV[0, 1];
11 | 
12 | my $head = "";
13 | my (%dat, %g2len);
14 | 
15 | open IN, "< $infile" or die "$infile: $!";
16 | while (my $row = <IN>) {
17 |     chomp $row;
18 |     if ($. == 1) {
19 |         $head = $row;
20 |         next;
21 |     }
22 |     my @F = split /\t/, $row;
23 |     next unless $F[8];
24 |     my $key = $F[1];
25 |     my $len = $F[5] - $F[4];
26 |     # Keep the stored row unless this one is strictly longer.
27 |     next if $dat{$key} and not $len > $g2len{$key};
28 |     $dat{$key} = $row;
29 |     $g2len{$key} = $len;
30 | }
31 | close IN;
32 | 
33 | open OUT, "> $outfile" or die "$outfile: $!";
34 | print OUT "$head\n";
35 | print OUT "$dat{$_}\n" for sort keys %dat;
36 | close OUT;
37 | 
38 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/refexp.gen.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Writes the reference expression tables under <workdir>/refexp:
 4 | #   exp.ref.bi.txt     built from the cleared bi-allelic sample table
 5 | #   exp.ref.white.txt  header line only
 6 | #
 7 | # Arguments: working dir, expression file (the second is stored but not
 8 | # used in this script).
 9 | 
10 | my $workdir = $ARGV[0];
11 | my $expfile = $ARGV[1];
12 | 
13 | my $outdir = "$workdir/refexp";
14 | # Runs through the shell; the return status is not checked.
15 | system "mkdir -p $outdir";
16 | 
17 | my $infile = "$workdir/cis-X.refexp.step2.collect.filtered.bi.samples.cleared.txt";
18 | my $outfile = "$outdir/exp.ref.bi.txt";
19 | open OUT, "> $outfile" or die "$outfile: $!";
20 | print OUT "Gene\tnum.cases\tSJID\tfpkm\n";
21 | open IN, "< $infile" or die "$infile: $!";
22 | while(<IN>) {
23 |     chomp;
24 |     # Line 1 of the input is a header.
25 |     next if $. == 1;
26 |     my @F = split/\t/;
27 |     # Only rows with column 5 >= 10 are kept.
28 |     if ($F[5]>=10) {
29 |       # Column 15 == 1 selects columns 16-18 instead of columns 5-7
30 |       # (presumably the values left after clean-up -- confirm against
31 |       # cleanup.bi.cases.R).
32 |       if ($F[15] == 1) {
33 |           print OUT "$F[0]\t$F[16]\t$F[17]\t$F[18]\n";
34 |       }else {
35 |           print OUT "$F[0]\t$F[5]\t$F[6]\t$F[7]\n";
36 |       }
37 |     }
38 | }
39 | close IN;
40 | close OUT;
41 | 
42 | # The white-list table is created with its header and no data rows.
43 | $outfile = "$outdir/exp.ref.white.txt";
44 | open OUT, "> $outfile" or die "$outfile: $!";
45 | print OUT "Gene\tnum.cases\tSID\tfpkm\n";
46 | close OUT;
47 | 
48 | 


--------------------------------------------------------------------------------
/src/core/src/binom.R:
--------------------------------------------------------------------------------
 1 | # Adds a p-value and an absolute allele-fraction deviation to every row of a
 2 | # tab-separated marker table.
 3 | #
 4 | # Arguments: input table path, output table path.
 5 | #
 6 | # Per row: covg is column 7; the expected fraction ep is 0.5, or column 6
 7 | # when column 9 equals "cnvloh". The binomial pmf over 0..covg is convolved
 8 | # with a zero-mean normal density over -1000..1000 whose sd depends on covg,
 9 | # and one tail of the result is summed, chosen by whether column 11 exceeds
10 | # covg*ep. delta.abs is |col11/(col11+col10) - ep|.
11 | 
12 | argv <- commandArgs(TRUE)
13 | 
14 | infile <- argv[1]
15 | outfile <- argv[2]
16 | 
17 | dat <- read.table(infile,sep="\t",header=T,quote="")
18 | 
19 | # Preallocate the result: this avoids growing a matrix with rbind on every
20 | # iteration and keeps the column names valid for a table without data rows.
21 | out <- matrix(NA_real_, nrow=nrow(dat), ncol=2,
22 |               dimnames=list(NULL, c("pvalue","delta.abs")))
23 | 
24 | # seq_len() yields an empty sequence for zero rows, where 1:nrow(dat) would
25 | # iterate over 1 and 0.
26 | for (i in seq_len(nrow(dat))) {
27 |   p_corr <- NULL
28 |   covg <- dat[i,7]
29 |   sigma <- 10.8*(1-exp(-1*covg/105))
30 |   ep <- 0.5
31 |   if (dat[i,9] == "cnvloh") {
32 |     ep <- dat[i,6]
33 |   }
34 |   p_binom <- dbinom(seq(0,covg),covg,ep)
35 |   p_norm  <- dnorm(seq(-1000,1000),mean=0,sd=sigma)
36 |   p_conv  <- convolve(p_binom,p_norm,type="open")
37 |   y <- abs(dat[i,11]/(dat[i,11]+dat[i,10]) - ep)
38 |   # The offset 1001 is the index of 0 in seq(-1000,1000).
39 |   if (dat[i,11] > covg*ep) {
40 |     p_corr <- sum(p_conv[(1001+dat[i,11]):length(p_conv)])
41 |   }else {
42 |     p_corr <- sum(p_conv[1:(1001+dat[i,11])])
43 |   }
44 |   # The convolution can come out slightly negative; clamp to zero.
45 |   if (p_corr < 0) {
46 |     p_corr <- 0
47 |   }
48 |   out[i,] <- c(p_corr,y)
49 | }
50 | out <- cbind(dat,out)
51 | 
52 | write.table(out,file=outfile,sep="\t",quote=F,row.names=F)
53 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-test-outliers:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Runs src/exp.check.R for one sample against the reference files of a
 4 | # disease, after checking that those files exist under
 5 | # <repository root>/refs/diseases/<disease>.
 6 | #
 7 | # Positional arguments: sample id, disease id, FPKM matrix, output path.
 8 | 
 9 | # Expansions are quoted so paths containing spaces stay single words.
10 | CIS_X_HOME="$(realpath "$(dirname "$0")/../../..")"
11 | CIS_X_CORE_HOME="$(realpath "$(dirname "$0")/..")"
12 | 
13 | SAMPLE_ID=$1
14 | DISEASE=$2
15 | FPKM_MATRIX=$3
16 | OHE_RESULT=$4
17 | 
18 | # Ensure reference expression matrices exist for the given disease ID.
19 | BILIST="$CIS_X_HOME/refs/diseases/$DISEASE/exp.ref.bi.txt"
20 | WHITELIST="$CIS_X_HOME/refs/diseases/$DISEASE/exp.ref.white.txt"
21 | WHOLELIST="$CIS_X_HOME/refs/diseases/$DISEASE/exp.ref.entire.txt"
22 | PRECALT="$CIS_X_HOME/refs/diseases/$DISEASE/precal.tvalue.bin_gt1.txt"
23 | 
24 | if [ ! -f "$BILIST" ] || [ ! -f "$WHITELIST" ] || [ ! -f "$WHOLELIST" ]; then
25 |     echo "ERROR: $(date): cis-X-test-outliers: reference expression matrices missing for $DISEASE"
26 |     exit 1
27 | fi
28 | 
29 | # Name the file that is actually missing; this branch used to repeat the
30 | # message about the expression matrices.
31 | if [ ! -f "$PRECALT" ]; then
32 |     echo "ERROR: $(date): cis-X-test-outliers: precalculated t-values (precal.tvalue.bin_gt1.txt) missing for $DISEASE"
33 |     exit 1
34 | fi
35 | 
36 | Rscript "$CIS_X_CORE_HOME/src/exp.check.R" \
37 |     "$SAMPLE_ID" \
38 |     "$FPKM_MATRIX" \
39 |     "$BILIST" \
40 |     "$WHOLELIST" \
41 |     "$WHITELIST" \
42 |     "$OHE_RESULT" \
43 |     "$PRECALT"
44 | 


--------------------------------------------------------------------------------
/src/core/src/fdr.R:
--------------------------------------------------------------------------------
 1 | # Combines the per-marker values of every row into one p-value and one mean
 2 | # delta, then appends multiple-testing adjusted p-values.
 3 | #
 4 | # Arguments: input table path, output table path (both tab-separated).
 5 | suppressMessages(library(multtest))
 6 | 
 7 | argv <- commandArgs(TRUE)
 8 | 
 9 | infile <- argv[1]
10 | outfile <- argv[2]
11 | 
12 | dat <- read.table(infile,sep="\t",header=T,quote="",stringsAsFactor=F)
13 | 
14 | out <- NULL
15 | pval <- NULL
16 | ai <- NULL
17 | 
18 | # Columns 13 and 15 hold comma-separated numbers. comb.pval is the geometric
19 | # mean of column 13, mean.delta the arithmetic mean of column 15.
20 | for (i in 1:nrow(dat)) {
21 |     x <- as.numeric(unlist(strsplit(as.character(dat[i,13]),",",perl=T)))
22 |     y <- as.numeric(unlist(strsplit(as.character(dat[i,15]),",",perl=T)))
23 |     x.geom <- exp(sum(log(x))/length(x))
24 |     y.m <- mean(y)
25 |     out <- rbind(out, c(x.geom,y.m))
26 | }
27 | 
28 | colnames(out) <- c("comb.pval","mean.delta")
29 | rownames(out) <- dat[,1]
30 | 
31 | if (nrow(dat) == 1) {
32 |     # Single row: no adjustment is run; comb.pval is copied into the three
33 |     # p-value columns.
34 |     out <- cbind(out,out[,1],out[,1],out[,1])
35 |     colnames(out) <- c(colnames(out)[1:2],c("rawp","Bonferroni","ABH"))
36 |     out <- cbind(dat,out)
37 | }else {
38 |     raw.p <- out[,1]
39 |     # NOTE(review): `$adj` relies on partial matching of the list element
40 |     # name -- presumably the `adjp` element of mt.rawp2adjp(); confirm.
41 |     adj.p <- mt.rawp2adjp(raw.p,c("Bonferroni","ABH"))$adj
42 |     # adj.p rows are labelled with the names of raw.p in ascending p-value
43 |     # order (presumably the order mt.rawp2adjp returns them in -- confirm) so
44 |     # that they can be re-indexed by row name below.
45 |     rownames(adj.p) <- names(raw.p[order(raw.p)])
46 |     out <- cbind(out,adj.p[rownames(out),])
47 |     out <- cbind(dat,out)
48 | }
49 | 
50 | write.table(out,file=outfile,sep="\t",quote=F,row.names=F)
51 | 


--------------------------------------------------------------------------------
/src/seed/bin/merge_roadmap:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | # frozen_string_literal: true
 3 | 
 4 | require "csv"
 5 | 
 6 | EPIGENOME_ID_PATTERN = /E\d{3}/
 7 | 
 8 | # This expects a csv-formatted version of `jul2013.roadmapData.qc`:
 9 | # https://docs.google.com/spreadsheets/d/1yikGx4MsO9Ei36b64yOy9Vb6oPC5IBGlFbYEt-N6gOM/view
10 | def read_groups(path)
11 |   File.open(path) do |f|
12 |     # skip meta
13 |     6.times { f.readline }
14 | 
15 |     csv = CSV.new(f)
16 | 
17 |     csv.each.reduce({}) do |map, row|
18 |       map[row[1]] = row[3]
19 |       map
20 |     end
21 |   end
22 | end
23 | 
24 | def parse_eid(s)
25 |   matches = s.match(EPIGENOME_ID_PATTERN)
26 |   raise if !matches
27 |   matches[0]
28 | end
29 | 
30 | input_dir = ARGV[0] or raise "missing input_dir"
31 | meta_file = ARGV[1] or raise "missing meta_file"
32 | 
33 | groups = read_groups(meta_file)
34 | 
35 | pathnames = Dir[File.join(input_dir, "*.bed")].sort
36 | 
37 | pathnames.each do |pathname|
38 |   basename = File.basename(pathname)
39 |   eid = parse_eid(basename)
40 |   group = groups[eid]
41 | 
42 |   raise if !group
43 | 
44 |   File.foreach(pathname) do |line|
45 |     puts "#{line.chomp}\t#{eid}\t#{group}"
46 |   end
47 | end
48 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/precal.R:
--------------------------------------------------------------------------------
 1 | # Computes leave-one-out t statistics for every gene of
 2 | # <workdir>/refexp/exp.ref.bi.txt and writes them, comma-joined per gene, to
 3 | # <workdir>/raw.tvalue.bicohort.txt.
 4 | #
 5 | # Argument: working directory.
 6 | 
 7 | argv <- commandArgs(TRUE)
 8 | 
 9 | workdir <- argv[1]
10 | 
11 | infile <- paste(workdir,"/refexp/exp.ref.bi.txt",sep="")
12 | outfile <- paste(workdir,"/raw.tvalue.bicohort.txt",sep="")
13 | # The first input column becomes the row names, so the indices below count
14 | # from the second input column.
15 | dat <- read.table(infile,sep="\t",header=T,row.names=1,quote="",stringsAsFactors=F)
16 | rawt <- NULL
17 | 
18 | for (i in 1:nrow(dat)) {
19 |   rawt.i <- NULL
20 |   y.in  <- NULL
21 |   y.raw <- NULL
22 |   y.size <- NULL
23 |   y.median <- NULL
24 |   # Column 3 is a comma-separated list of numbers; y.raw is log10(value+0.1).
25 |   y.in  <- as.numeric(unlist(strsplit(dat[i,3],",",perl=T)))
26 |   y.raw <- log10(as.numeric(unlist(strsplit(dat[i,3],",",perl=T)))+0.1)
27 |   y.median <- median(y.in)
28 |   y.size <- length(y.in)
29 | 
30 |   # Only genes with at least 20 values and a median of at least 1 are used.
31 |   if (y.size >= 20 && y.median >= 1) {
32 |     for (j in 1:length(y.raw)) {
33 |       # Leave value j out and score it against the remaining values.
34 |       y.white <- y.raw[-j]
35 |       x.i <- y.raw[j]
36 |       t.white <- (x.i-mean(y.white))/((1+(length(y.white)-2)^-1)*(sd(y.white)^2))^0.5
37 |       # p.white is computed but not used later in this script.
38 |       p.white <- pt(t.white,length(y.white)-2,lower.tail=F)
39 |       if (j == 1) {
40 |         rawt.i <- t.white
41 |       }else {
42 |         rawt.i <- paste(rawt.i,t.white,sep=",") 
43 |       }
44 |     }
45 |     rawt <- rbind(rawt,c(rownames(dat)[i],rawt.i))
46 |   }
47 | }
48 | write.table(rawt,file=outfile,row.names=F,quote=F,sep="\t")
49 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-nominate:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Runs the two candidate-nomination scripts, src/ase.candidate.pl and
 4 | # src/ase.candidate.byrun.pl, with the thresholds fixed below.
 5 | #
 6 | # The seven positional arguments are stored in SAMPLE_ID, ASE_RESULT_GENE,
 7 | # OHE_RESULT, CANDIDATES_RESULT, THRESH_FPKM, ASE_RESULT_RUN and
 8 | # CANDIDATES_RESULT_RUN.
 9 | 
10 | # Repository root (three levels above this file) and the core module root.
11 | CIS_X_HOME=$(realpath $(dirname $0)/../../..)
12 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/..)
13 | 
14 | SAMPLE_ID=$1
15 | ASE_RESULT_GENE=$2
16 | OHE_RESULT=$3
17 | CANDIDATES_RESULT=$4
18 | THRESH_FPKM=$5
19 | ASE_RESULT_RUN=$6
20 | CANDIDATES_RESULT_RUN=$7
21 | 
22 | # Fixed thresholds. THRESH_FPKM is no longer fixed here: it is the fifth
23 | # positional argument, and the old assignment is kept commented out.
24 | THRESH_AI_DI=0.3
25 | THRESH_AI_CNV=0.2
26 | THRESH_PVALUE_ASE=0.05
27 | THRESH_PVALUE_LOO=0.05
28 | #THRESH_FPKM=5
29 | THRESH_LOO_Hi_Perc=0.1
30 | NUM_MARKERS=4
31 | 
32 | # Reference lists expected under <repository root>/refs/external.
33 | IMPRINTING_GENES=$CIS_X_HOME/refs/external/ImprintGenes.txt
34 | ONCOGENES=$CIS_X_HOME/refs/external/cancer_gene_census.txt
35 | 
36 | # First script: writes to CANDIDATES_RESULT.
37 | perl $CIS_X_CORE_HOME/src/ase.candidate.pl \
38 |     $THRESH_PVALUE_ASE \
39 |     $THRESH_AI_DI \
40 |     $THRESH_AI_CNV \
41 |     $THRESH_FPKM \
42 |     $THRESH_PVALUE_LOO \
43 |     $SAMPLE_ID \
44 |     $CANDIDATES_RESULT \
45 |     $ASE_RESULT_GENE \
46 |     $OHE_RESULT \
47 |     $THRESH_LOO_Hi_Perc \
48 |     $IMPRINTING_GENES \
49 |     $ONCOGENES
50 | 
51 | # Second script: the argument order differs from the first call.
52 | perl $CIS_X_CORE_HOME/src/ase.candidate.byrun.pl \
53 |     $SAMPLE_ID \
54 |     $THRESH_FPKM \
55 |     $THRESH_PVALUE_LOO \
56 |     $ASE_RESULT_RUN \
57 |     $OHE_RESULT \
58 |     $CANDIDATES_RESULT_RUN \
59 |     $IMPRINTING_GENES \
60 |     $ONCOGENES \
61 |     $NUM_MARKERS \
62 |     $ASE_RESULT_GENE \
63 |     $THRESH_AI_DI \
64 |     $THRESH_AI_CNV 
65 | 


--------------------------------------------------------------------------------
/src/ref-exp/bin/cis-X-ref-exp-preprocess:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # cis-X ref-exp preprocess: for one sample, runs cis-X-mark,
 4 | # cis-X-build-matrix and cis-X-ase in sequence inside
 5 | # <results-dir>/<sample-id>/working_space.
 6 | 
 7 | # Abort on the first failing command.
 8 | set -e
 9 | 
10 | # The core tools are found through PATH.
11 | CIS_X_CORE_HOME=$(realpath $(dirname $0)/../../core)
12 | PATH=$CIS_X_CORE_HOME/bin:$PATH
13 | 
14 | SAMPLE_ID=$1
15 | ROOTDIR=$2
16 | HIGH20=$3
17 | RNABAM=$4
18 | CNV_LOH=$5
19 | CHR_STRING=$6
20 | 
21 | # Values fixed for reference-expression preprocessing.
22 | COVG_WGS=10
23 | COVG_RNA=10
24 | CNV_LOH_ACTION=drop
25 | 
26 | if [ $# -lt 6 ]; then
27 |     basename $0
28 |     echo
29 |     echo "USAGE:"
30 |     echo "    cis-X ref-exp preprocess <sample-id> <results-dir> <markers> <bam> <cnv-loh> <chr-string>"
31 |     exit 1
32 | fi
33 | 
34 | # All steps run from the per-sample working directory.
35 | WORKDIR=$ROOTDIR/$SAMPLE_ID/working_space
36 | mkdir -p $WORKDIR
37 | cd $WORKDIR
38 | 
39 | # The commented-out lines below are earlier forms of each call, kept next to
40 | # the current call that takes additional arguments.
41 | SNV4_OUT="$WORKDIR/$SAMPLE_ID.snv4.txt"
42 | HET_OUT="$WORKDIR/$SAMPLE_ID.heterozygous.markers.txt"
43 | #cis-X-mark $SAMPLE_ID $HIGH20 $CNV_LOH $SNV4_OUT $HET_OUT
44 | cis-X-mark $SAMPLE_ID $HIGH20 $CNV_LOH $SNV4_OUT $HET_OUT $COVG_WGS
45 | 
46 | MATRIX_OUT="$WORKDIR/matrix_combined_matrix_simple.tab"
47 | #cis-X-build-matrix $RNABAM $SNV4_OUT $WORKDIR $MATRIX_OUT
48 | cis-X-build-matrix $RNABAM $SNV4_OUT $WORKDIR $MATRIX_OUT $CHR_STRING
49 | 
50 | ASE_RESULT_MARKER="$WORKDIR/$SAMPLE_ID.ase.combine.WGS.RNAseq.goodmarkers.binom.txt"
51 | ASE_RESULT_GENE="$WORKDIR/$SAMPLE_ID.ase.gene.model.fdr.txt"
52 | ASE_RESULT_RUN="$WORKDIR/$SAMPLE_ID.ase.candidates.runs.txt"
53 | #cis-X-ase $SAMPLE_ID $WORKDIR $HET_OUT $MATRIX_OUT $ASE_RESULT_MARKER $ASE_RESULT_GENE
54 | cis-X-ase $SAMPLE_ID $WORKDIR $HET_OUT $MATRIX_OUT $ASE_RESULT_MARKER $ASE_RESULT_GENE $ASE_RESULT_RUN $CNV_LOH_ACTION $COVG_RNA
55 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-build-matrix:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Builds the combined RNA allele-count matrix for one BAM file: for each of
 4 | # chr1..chr22 the matching markers are extracted with sepCHR.pl and counted
 5 | # with variants2matrix, then mergeVariantOut.pl merges the per-chromosome
 6 | # tables into MATRIX_OUT.
 7 | #
 8 | # Positional arguments: RNA BAM, SNV4 marker file, working dir, matrix output
 9 | # path, chr-string flag.
10 | #
11 | # NOTE(review): the scratch files (snv4.seqchr.txt, commands.txt,
12 | # commands.sh, commands.err) are relative paths while mergeVariantOut.pl is
13 | # pointed at WORKDIR; this looks like it expects the caller to have changed
14 | # into WORKDIR -- confirm.
15 | 
16 | # Expansions are quoted so paths containing spaces stay single words.
17 | CIS_X_CORE_HOME="$(realpath "$(dirname "$0")/..")"
18 | 
19 | RNABAM=$1
20 | SNV4_OUT=$2
21 | WORKDIR=$3
22 | MATRIX_OUT=$4
23 | CHR_STRING=$5
24 | 
25 | LOG="$WORKDIR/log.txt"
26 | 
27 | RNABAMLST="$WORKDIR/bam.lst"
28 | 
29 | echo "$RNABAM" > "$RNABAMLST"
30 | 
31 | for i in $(seq 1 22); do
32 |     CHROM="chr$i"
33 |     SNV4_CHR=snv4.seqchr.txt
34 | 
35 |     # Remove scratch files left by the previous chromosome.
36 |     rm -f "$SNV4_CHR" commands.txt commands.sh
37 | 
38 |     perl "$CIS_X_CORE_HOME/src/sepCHR.pl" "$SNV4_OUT" "$CHROM" "$SNV4_CHR" "$CHR_STRING"
39 | 
40 |     # Reading from stdin makes wc print the count alone; tr drops the padding
41 |     # some wc implementations add. An empty result counts as 0.
42 |     LINE_TEMP=$(wc -l < "$SNV4_CHR" | tr -d ' ')
43 | 
44 |     if [ "${LINE_TEMP:-0}" -gt 0 ]; then
45 |         echo "INFO: $(date): cis-X-build-matrix: processing $CHROM"
46 | 
47 |         variants2matrix -now -bam-list "$RNABAMLST" -variant-file "$SNV4_CHR" -snv4 -flat -name "$CHROM" -step1 commands.txt 2>> "$LOG"
48 |         # Strip the leading "/bin/env " from every generated command.
49 |         sed 's/^\/bin\/env //' commands.txt > commands.sh
50 |         sh commands.sh 2>commands.err
51 |         variants2matrix -now -bam-list "$RNABAMLST" -variant-file "$SNV4_CHR" -snv4 -flat -name "$CHROM" -step2 -clean 2>> "$LOG"
52 |     fi
53 | done
54 | 
55 | perl "$CIS_X_CORE_HOME/src/mergeVariantOut.pl" "$WORKDIR" "$MATRIX_OUT" "$CHR_STRING" >> "$LOG"
56 | 
57 | MATRIX_OUT_LINE=$(wc -l < "$MATRIX_OUT" | tr -d ' ')
58 | 
59 | # Fewer than two lines means no data row was merged.
60 | if [ "${MATRIX_OUT_LINE:-0}" -lt 2 ]; then
61 |     echo "No output from variants2matrix. Exiting."
62 |     exit 1
63 | fi
64 | 


--------------------------------------------------------------------------------
/src/core/src/02.add.count.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | 
 3 | # Joins heterozygous WGS markers with RNA allele counts.
 4 | #
 5 | # Arguments:
 6 | #   0  sample id (only used as a key of the in-memory count table)
 7 | #   1  heterozygous WGS marker table (tab-separated, header on line 1)
 8 | #   2  RNA count matrix (tab-separated, header on line 1): column 1 is the
 9 | #      marker key "chrom.pos.ref.mut", columns 2 and 3 the ref/mut counts
10 | #   3  output path
11 | #
12 | # A marker is written only when its RNA coverage (ref + mut) is at least 5.
13 | 
14 | my $sid      = $ARGV[0];
15 | my $het_wgs  = $ARGV[1];
16 | my $geno_rna = $ARGV[2];
17 | my $output   = $ARGV[3];
18 | #my $cvg_cut  = $ARGV[4];  ### output everything with minimal 3 reads support for at least one allele. Further filter will apply. 2019-04-08.
19 | 
20 | my %count = ();
21 | my $infile = $geno_rna;
22 | open IN, "< $infile" or die "$infile: $!";
23 | while(<IN>) {
24 |     chomp;
25 |     next if $. == 1;
26 |     my @F = split/\t/;
27 |     $count{$sid}{$F[1]}{ref} = $F[2];
28 |     $count{$sid}{$F[1]}{mut} = $F[3];
29 |     $count{$sid}{$F[1]}{cvg} = $F[2] + $F[3];
30 | }
31 | close IN;
32 | 
33 | $infile = $het_wgs;
34 | my $outfile = $output;
35 | open IN, "< $infile" or die "$infile: $!";
36 | open OUT, "> $outfile" or die "$outfile: $!";
37 | while(<IN>) {
38 |     chomp;
39 |     if ($. == 1) {
40 |         print OUT "chrom\tpos\tref\tmut\tcvg_wgs\tmut_freq_wgs\tcvg_rna\tmut_freq_rna\tcnvlohTag\tref\tvar\n";
41 |         next;
42 |     }
43 |     my @F = split/\t/;
44 |     my $snv4 = "$F[0].$F[1].$F[2].$F[3]";
45 | 
46 |     # Markers absent from the RNA matrix have no coverage and were never
47 |     # written. Testing with exists skips them without comparing undef or
48 |     # autovivifying an empty entry for the marker.
49 |     next unless exists $count{$sid}{$snv4};
50 |     my $rna = $count{$sid}{$snv4};
51 | 
52 |     my $cvg_wgs = $F[4] + $F[6];
53 |     # A zero WGS coverage would abort the whole run with a division by zero.
54 |     if ($cvg_wgs == 0) {
55 |         warn "02.add.count.pl: skipping $snv4: zero WGS coverage\n";
56 |         next;
57 |     }
58 |     my $freq_wgs = sprintf("%.3f",$F[6]/$cvg_wgs);
59 |     my $cvg_rna = $rna->{cvg};
60 | #    if ($cvg_rna >= $cvg_cut) {
61 | #    if ($count{$sid}{$snv4}{ref} >= 3 or $count{$sid}{$snv4}{mut} >= 3) {
62 |     if ($cvg_rna >= 5) {
63 |         my $freq_rna = sprintf("%.3f",$rna->{mut}/$cvg_rna);
64 |         print OUT "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$cvg_wgs\t$freq_wgs\t$cvg_rna\t$freq_rna\t$F[8]\t$rna->{ref}\t$rna->{mut}\n";
65 |     }
66 | }
67 | close IN;
68 | close OUT;
69 | 


--------------------------------------------------------------------------------
/src/core/src/merge.fa.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | #
 3 | # merge.fa.pl - build mutant/reference sequence pairs from flanking sequences.
 4 | #
 5 | # Usage: merge.fa.pl <sample_id> <var_list> <fasta_in> <fasta_out>
 6 | #
 7 | #   sample_id  Kept for interface compatibility; not used here.
 8 | #   var_list   Tab-delimited variant list with a header line. Columns: variant
 9 | #              id, type (snv, ins, del or complex_indel), reference allele,
10 | #              mutant allele, name of the left flank, name of the right flank.
11 | #   fasta_in   FASTA file holding the flanks, named as in var_list.
12 | #   fasta_out  Output FASTA; each variant yields "<id>.mut" then "<id>.ref".
13 | #
14 | # Variants with an unknown type or a missing flank are reported on STDERR and
15 | # left out of the output.
16 | 
17 | use strict;
18 | 
19 | my ($sid, $varlst, $fa_in, $fa_out) = @ARGV;
20 | die "Usage: $0 <sample_id> <var_list> <fasta_in> <fasta_out>\n" unless defined $fa_out;
21 | 
22 | # Read the flanks. Sequence lines are appended so that records wrapped over
23 | # several lines are kept whole; a repeated header starts its record afresh.
24 | my %name2fa;
25 | my $name;
26 | open(my $fa_fh, '<', $fa_in) or die "$fa_in: $!";
27 | while (my $line = <$fa_fh>) {
28 |     chomp $line;
29 |     if ($line =~ /^>(.*)/) {
30 |         $name = $1;
31 |         $name2fa{$name} = "";
32 |         next;
33 |     }
34 |     next unless defined $name;    # text before the first header
35 |     $name2fa{$name} .= $line;
36 | }
37 | close $fa_fh;
38 | 
39 | open(my $out_fh, '>', $fa_out) or die "$fa_out: $!";
40 | open(my $var_fh, '<', $varlst) or die "$varlst: $!";
41 | while (my $line = <$var_fh>) {
42 |     next if $. == 1;    # header
43 |     chomp $line;
44 |     my @F = split /\t/, $line;
45 |     if (@F < 6) {
46 |         warn "$varlst line $.: expected 6 columns; skipped.\n";
47 |         next;
48 |     }
49 |     my ($id, $type, $ref, $mut) = @F[0 .. 3];
50 |     my ($left, $right) = @name2fa{ @F[4, 5] };
51 |     unless (defined $left and defined $right) {
52 |         warn "Missing flanking sequence for $id.\n";
53 |         next;
54 |     }
55 | 
56 |     my ($mut_seq, $ref_seq);
57 |     if ($type eq "snv" or $type eq "complex_indel") {
58 |         $mut_seq = $left . $mut . $right;
59 |         $ref_seq = $left . $ref . $right;
60 |     } elsif ($type eq "ins") {
61 |         $mut_seq = $left . $mut . $right;
62 |         $ref_seq = $left . $right;
63 |     } elsif ($type eq "del") {
64 |         $mut_seq = $left . $right;
65 |         $ref_seq = $left . $ref . $right;
66 |     } else {
67 |         warn "Wrong var type of $type for $id.\n";
68 |         next;
69 |     }
70 |     print $out_fh ">$id.mut\n$mut_seq\n>$id.ref\n$ref_seq\n";
71 | }
72 | close $var_fh;
73 | close $out_fh or die "$fa_out: $!";
74 | 


--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
 1 | # Release
 2 | 
 3 |   * [ ] Update `CHANGELOG.md` with version and publication date.
 4 |   * [ ] Update version in `dnanexus/cis-x/dxapp.json`.
 5 |   * [ ] Stage changes: `git add dnanexus/cis-x/dxapp.json CHANGELOG.md`
 6 |   * [ ] Create git commit: `git commit -m "Bump version to $VERSION"`
 7 |   * [ ] Create git tag: `git tag -m "" -a v$VERSION`
 8 |   * [ ] Push release: `git push --follow-tags`
 9 | 
10 | ## DNAnexus
11 | 
12 |   * [ ] Build Docker image: `docker image build --tag cis-x .`
13 |   * [ ] Save Docker image: `docker image save cis-x | gzip > dnanexus/cis-x/resources/tmp/cis-x-latest.tar.gz`
14 |   * [ ] Check security context: `dx whoami`
15 |   * [ ] Build DNAnexus applet: `dx build --destination cis-x:/cis-x-$VERSION dnanexus/cis-x`
16 |   * [ ] Verify expected results:
17 | 
18 |     ```
19 |     dx run cis-x:/cis-x-$VERSION \
20 |       --input sample_id=SJALL018373_D1 \
21 |       --input markers=cis-x:/data/SJALL018373_D1.test.wgs.markers.txt \
22 |       --input cnv_loh=cis-x:/data/SJALL018373_D1.test.wgs.cnvloh.txt \
23 |       --input bam=cis-x:/data/SJALL018373_D1.test.RNAseq.bam \
24 |       --input bai=cis-x:/data/SJALL018373_D1.test.RNAseq.bam.bai \
25 |       --input fpkm_matrix=cis-x:/data/SJALL018373_D1.test.RNASEQ_all_fpkm.txt \
26 |       --input snv_indel=cis-x:/data/SJALL018373_D1.test.mut.txt \
27 |       --input sv=cis-x:/data/SJALL018373_D1.test.sv.txt \
28 |       --input cna=cis-x:/data/SJALL018373_D1.test.cna.txt \
29 |       --input disease=TALL \
30 |       --input cnv_loh_action=drop \
31 |       --input min_coverage_wgs=10 \
32 |       --input min_coverage_rna_seq=10 \
33 |       --destination cis-x:/results/$VERSION
34 |     ```
35 | 
36 |   * [ ] Publish DNAnexus app: `dx build --app --publish dnanexus/cis-x`
37 |   * [ ] Build St. Jude Cloud production workflow.
38 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-ase:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # cis-X-ase: allele-specific expression (ASE) analysis for a single sample.
 4 | #
 5 | # Usage: cis-X-ase <sample-id> <workdir> <het-out> <matrix-out> \
 6 | #            <ase-result-marker> <ase-result-gene> <ase-result-run> \
 7 | #            <cnv-loh-action> <covg-rna>
 8 | #
 9 | # Intermediate files are written to <workdir>. Every expansion is quoted so
10 | # that paths containing spaces reach the tools as single arguments.
11 | 
12 | CIS_X_HOME=$(realpath "$(dirname "$0")/../../..")
13 | CIS_X_CORE_HOME=$(realpath "$(dirname "$0")/..")
14 | 
15 | SAMPLE_ID=$1
16 | WORKDIR=$2
17 | HET_OUT=$3
18 | MATRIX_OUT=$4
19 | ASE_RESULT_MARKER=$5
20 | ASE_RESULT_GENE=$6
21 | ASE_RESULT_RUN=$7
22 | CNV_LOH_ACTION=$8
23 | COVG_RNA=$9
24 | 
25 | THRESH_AI_DI=0.3
26 | THRESH_AI_CNV=0.2
27 | THRESH_PVALUE_ASE=0.05
28 | 
29 | GENE_MODEL="$CIS_X_HOME/refs/external/hg19_refGene"
30 | GENE_MODEL_BED="$CIS_X_HOME/refs/external/hg19_refGene.bed"
31 | 
32 | WGS_RNA_COUNT="$WORKDIR/$SAMPLE_ID.combine.WGS.RNAseq.goodmarkers.txt"
33 | ASE_RUNS="$WORKDIR/$SAMPLE_ID.ase.runs.txt"
34 | ASE_RUNS_BED="$WORKDIR/$SAMPLE_ID.ase.runs.bed"
35 | ASE_RUNS_GENE="$WORKDIR/$SAMPLE_ID.ase.runs.genes.txt"
36 | GENE_MODEL_Temp1="$WORKDIR/$SAMPLE_ID.combine.WGS.RNAseq.goodmarkers.binom.genemodel.summary.txt"
37 | GENE_MODEL_Temp2="$WORKDIR/$SAMPLE_ID.combine.WGS.RNAseq.goodmarkers.binom.genemodel.summary.merged.txt"
38 | 
39 | # Marker level: join the WGS and RNA-seq counts, then run the binomial test.
40 | perl "$CIS_X_CORE_HOME/src/02.add.count.pl" "$SAMPLE_ID" "$HET_OUT" "$MATRIX_OUT" "$WGS_RNA_COUNT"
41 | Rscript "$CIS_X_CORE_HOME/src/binom.R" "$WGS_RNA_COUNT" "$ASE_RESULT_MARKER"
42 | 
43 | # Gene level: summarise markers per gene model, merge, then apply fdr.R.
44 | perl "$CIS_X_CORE_HOME/src/07.gene.model.Oct2017.pl" \
45 |     "$SAMPLE_ID" \
46 |     "$ASE_RESULT_MARKER" \
47 |     "$GENE_MODEL" \
48 |     "$THRESH_AI_DI" \
49 |     "$THRESH_AI_CNV" \
50 |     "$THRESH_PVALUE_ASE" \
51 |     "$CNV_LOH_ACTION" \
52 |     "$GENE_MODEL_Temp1" \
53 |     "$COVG_RNA"
54 | 
55 | perl "$CIS_X_CORE_HOME/src/05.merge.pl" "$GENE_MODEL_Temp1" "$GENE_MODEL_Temp2"
56 | Rscript "$CIS_X_CORE_HOME/src/fdr.R" "$GENE_MODEL_Temp2" "$ASE_RESULT_GENE"
57 | 
58 | # Run level: call ASE runs, intersect them with the gene models, then report.
59 | perl "$CIS_X_CORE_HOME/src/ase_runs.pl" "$ASE_RESULT_MARKER" "$THRESH_AI_DI" "$THRESH_AI_CNV" "$ASE_RUNS" "$ASE_RUNS_BED" 4 0.6 500000000
60 | bedtools intersect -a "$ASE_RUNS_BED" -b "$GENE_MODEL_BED" -wao > "$ASE_RUNS_GENE"
61 | perl "$CIS_X_CORE_HOME/src/proc_ase_runs.pl" "$ASE_RUNS" "$ASE_RUNS_GENE" "$ASE_RESULT_RUN"
62 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/cleanup.bi.cases.R:
--------------------------------------------------------------------------------
 1 | # cleanup.bi.cases.R - flag biallelic samples with outlying high expression.
 2 | #
 3 | # Usage: Rscript cleanup.bi.cases.R <workdir>
 4 | #
 5 | # Reads <workdir>/cis-X.refexp.step2.collect.filtered.txt. For every gene with
 6 | # at least min.bi.samples biallelic samples, each sample's log10(FPKM + 0.1) is
 7 | # compared with the remaining biallelic samples of that gene using a one-sided
 8 | # (upper tail) t statistic; samples with p < p.cutoff are moved to the
 9 | # "excluded" lists. The input columns plus seven result columns are written to
10 | # <workdir>/cis-X.refexp.step2.collect.filtered.bi.samples.cleared.txt.
11 | 
12 | argvs <- commandArgs(TRUE)
13 | 
14 | infile <- paste(argvs[1],"cis-X.refexp.step2.collect.filtered.txt",sep="/")
15 | outfile <- paste(argvs[1],"cis-X.refexp.step2.collect.filtered.bi.samples.cleared.txt",sep="/")
16 | dat <- read.table(infile,sep="\t",header=T,row.names=1,quote="",stringsAsFactors=F)
17 | 
18 | min.bi.samples <- 10
19 | p.cutoff <- 0.05
20 | 
21 | # Upper-tail p-value of every element of y against all other elements of y.
22 | loo.pvalue <- function(y) {
23 | 	vapply(seq_along(y), function(j) {
24 | 		rest <- y[-j]
25 | 		df <- length(rest)-2
26 | 		t.j <- (y[j]-mean(rest))/((1+df^-1)*(sd(rest)^2))^0.5
27 | 		pt(t.j,df,lower.tail=F)
28 | 	}, numeric(1))
29 | }
30 | 
31 | # One row per gene, filled in place. seq_len() keeps the loop from running on
32 | # an empty table, where 1:nrow(dat) would count down from 1 to 0.
33 | out <- matrix("",nrow=nrow(dat),ncol=7)
34 | for (i in seq_len(nrow(dat))) {
35 | 	trim <- 0
36 | 	s.count <- ""
37 | 	s.i <- f.i <- s.clear <- f.clear <- ""
38 | 	tot.i <- dat[i,2]+dat[i,5]
39 | 	if (dat[i,5]>=min.bi.samples) {
40 | 		bi.i <- unlist(strsplit(as.character(dat[i,6]),",",perl=T))
41 | 		bi.fpkm <- as.numeric(unlist(strsplit(as.character(dat[i,7]),",",perl=T)))
42 | 		y.i <- log10(bi.fpkm+0.1)
43 | 		# Genes whose biallelic FPKM values are all identical, or all below
44 | 		# 0.9 (log10(FPKM + 0.1) < 0), are left untested.
45 | 		if (length(unique(bi.fpkm)) > 1 && !all(y.i < 0 | is.na(y.i))) {
46 | 			p.i <- loo.pvalue(y.i)
47 | 			# A missing p-value is treated as "not an outlier".
48 | 			high <- !is.na(p.i) & p.i < p.cutoff
49 | 			s.i <- bi.i[high]
50 | 			f.i <- bi.fpkm[high]
51 | 			s.clear <- bi.i[!high]
52 | 			f.clear <- bi.fpkm[!high]
53 | 			if (any(high)) {
54 | 				trim <- 1
55 | 				s.count <- length(s.clear)
56 | 			}
57 | 		}
58 | 	}
59 | 	out[i,] <- c(tot.i,trim,s.count,
60 | 		paste(s.clear,collapse=","),paste(f.clear,collapse=","),
61 | 		paste(s.i,collapse=","),paste(f.i,collapse=","))
62 | }
63 | colnames(out) <- c("num.total.samples","trim","num.bi.samples.cleared","bi.samples.cleared","bi.fpkm.cleared","bi.samples.excluded","bi.fpkm.excluded")
64 | out.r <- cbind(rownames(dat),dat,out)
65 | colnames(out.r)[1] <- "Gene"
66 | write.table(out.r,file=outfile,sep="\t",row.names=F,quote=F)
67 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/filter.cohort.v2.pl:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/perl -w
 2 | #
 3 | # filter.cohort.v2.pl - filter and annotate the cohort-level gene table.
 4 | #
 5 | # Usage: filter.cohort.v2.pl <codedir> <workdir>
 6 | #
 7 | #   codedir  Anchor for the reference files, which are read from
 8 | #            <codedir>/../../../refs/external (hg19_refGene, ImprintGenes.txt).
 9 | #   workdir  Directory holding cis-X.refexp.step2.collect.txt; the result,
10 | #            cis-X.refexp.step2.collect.filtered.txt, is written next to it.
11 | #
12 | # Rows whose second column is not 1 and genes on "hap" contigs are dropped.
13 | # Kept rows gain the columns chrom, imprinting, ase_fpkm_max,
14 | # ase_fpkm_gt1_count, bi_fpkm_max and bi_fpkm_gt1_count.
15 | 
16 | use strict;
17 | 
18 | my ($codedir, $workdir) = @ARGV;
19 | die "Usage: $0 <codedir> <workdir>\n" unless defined $workdir;
20 | 
21 | my $refdir = "$codedir/../../../refs/external";
22 | my (%gene, %imprint);
23 | 
24 | # Summarise a comma-separated FPKM list. Returns the largest value, spelled as
25 | # in the input, and the number of values >= 1 (the *_gt1_count columns).
26 | sub fpkm_summary {
27 |     my ($csv) = @_;
28 |     my @fpkm = split /,/, $csv;
29 |     my ($max, $n_ge1) = ($fpkm[0], 0);
30 |     for my $v (@fpkm) {
31 |         $max = $v if $v > $max;
32 |         $n_ge1++ if $v >= 1;
33 |     }
34 |     return ($max, $n_ge1);
35 | }
36 | 
37 | # Gene symbol (column 13) -> chromosome (column 3) from the refGene table.
38 | my $infile = "$refdir/hg19_refGene";
39 | open(my $ref_fh, '<', $infile) or die "$infile: $!";
40 | while (my $line = <$ref_fh>) {
41 |     next if $. == 1;    # header
42 |     chomp $line;
43 |     my @F = split /\t/, $line;
44 |     next if @F < 13;
45 |     $gene{$F[12]}{chrom} = $F[2];
46 | }
47 | close $ref_fh;
48 | 
49 | # Gene symbol (column 1) -> imprinting annotation (column 4).
50 | $infile = "$refdir/ImprintGenes.txt";
51 | open(my $imp_fh, '<', $infile) or die "$infile: $!";
52 | while (my $line = <$imp_fh>) {
53 |     next if $. == 1;    # header
54 |     chomp $line;
55 |     my @F = split /\t/, $line;
56 |     next unless @F;
57 |     $imprint{$F[0]} = $F[3];
58 | }
59 | close $imp_fh;
60 | 
61 | $infile = "$workdir/cis-X.refexp.step2.collect.txt";
62 | my $outfile = "$workdir/cis-X.refexp.step2.collect.filtered.txt";
63 | open(my $in_fh, '<', $infile) or die "$infile: $!";
64 | open(my $out_fh, '>', $outfile) or die "$outfile: $!";
65 | while (my $line = <$in_fh>) {
66 |     chomp $line;
67 |     if ($. == 1) {
68 |         print $out_fh "$line\tchrom\timprinting\tase_fpkm_max\tase_fpkm_gt1_count\tbi_fpkm_max\tbi_fpkm_gt1_count\n";
69 |         next;
70 |     }
71 |     my @F = split /\t/, $line;
72 |     next unless defined $F[1] and $F[1] == 1;
73 |     my $g = $F[0];
74 |     my ($ase_max, $ase_count) = $F[2] > 0 ? fpkm_summary($F[4]) : ("na", 0);
75 |     my ($bi_max, $bi_count)   = $F[5] > 0 ? fpkm_summary($F[7]) : ("na", 0);
76 |     my $imprint = $imprint{$g} ? $imprint{$g} : "";
77 |     # Genes missing from refGene keep an empty chrom column instead of
78 |     # feeding an undefined value to the match and the print below.
79 |     my $chrom = exists $gene{$g} ? $gene{$g}{chrom} : "";
80 |     next if $chrom =~ /hap/;
81 |     print $out_fh "$line\t$chrom\t$imprint\t$ase_max\t$ase_count\t$bi_max\t$bi_count\n";
82 | }
83 | close $in_fh;
84 | close $out_fh or die "$outfile: $!";
85 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## [1.5.0] - 2020-05-19
 4 | 
 5 | ### Added
 6 | 
 7 |   * core: Add option to use user-specified TAD (topologically associating
 8 |     domain) annotations. See `t` option. By default, this still uses hESC
 9 |     (Human ES Cell).
10 | 
11 |   * ref-exp: Add option to handle reference sequence names prefixed with "chr".
12 |     Set `chr-string` to either `TRUE` or `FALSE`.
13 | 
14 |   * ref-exp: Build normalized t-values for a gene across samples. See
15 |     `precal.tvalue.bin_gt1.txt`.
16 | 
17 | ### Changed
18 | 
19 |   * core: `cis-X-run` now uses short name arguments instead of unnamed
20 |     arguments. For example, instead of `cis-X run $SAMPLE_ID ...`, run `cis-X
21 |     run -s $SAMPLE_ID ...`.
22 | 
23 |   * Synced with 2020-02-08 and 2020-03-13 revisions.
24 | 
25 | ## [1.4.0] - 2019-07-10
26 | 
27 | ### Added
28 | 
29 |   * core: Identify regions with consecutive markers that exhibit ASE.
30 | 
31 | ### Changed
32 | 
33 |   * core: Tighten scoring method for when known oncogenes should be reevaluated.
34 | 
35 |   * core: Updated binomial distribution statistical model.
36 | 
37 |   * seed: `cancer_gene_census.txt` no longer has a version in its filename.
38 | 
39 | ## [1.3.0] - 2019-03-28
40 | 
41 | ### Added
42 | 
43 |   * core: Known oncogenes in the [COSMIC Cancer Gene Census] are used to
44 |     reevaluate cis-activated candidates.
45 | 
46 | [COSMIC Cancer Gene Census]: https://cancer.sanger.ac.uk/census
47 | 
48 | ### Changed
49 | 
50 |   * core: Increased default transcription factor FPKM value to 10 for
51 |     screening.
52 | 
53 |   * core: The motif for MYB (MYBL1 and MYBL2) are similar and treated as the
54 |     same gene.
55 | 
56 |   * core: SNV/indel candidates are sorted by FPKM value.
57 | 
58 | ## [1.2.0] - 2019-01-08
59 | 
60 | ### Added
61 | 
62 |   * core: Added argument to set the FPKM threshold for the nomination of
63 |     a cis-activated candidate.
64 | 
65 | ## [1.1.0] - 2018-12-17
66 | 
67 | ### Added
68 | 
69 |   * core: Added argument to handle markers in CNV/LOH regions. This can either
70 |     be `keep` or `drop`.
71 | 
72 |   * core: Added arguments to set the threshold for the minimal coverage in WGS
73 |     and RNA-seq when selecting heterozygous markers.
74 | 
75 | ### Fixed
76 | 
77 |   * seed: Update download location for `GRCh37-lite.fa.gz`.
78 | 
79 | ## 1.0.0 - 2018-07-23
80 | 
81 |   * Initial release
82 | 
83 | [1.5.0]: https://github.com/stjude/cis-x/compare/v1.4.0...v1.5.0
84 | [1.4.0]: https://github.com/stjude/cis-x/compare/v1.3.0...v1.4.0
85 | [1.3.0]: https://github.com/stjude/cis-x/compare/v1.2.0...v1.3.0
86 | [1.2.0]: https://github.com/stjude/cis-x/compare/v1.1.0...v1.2.0
87 | [1.1.0]: https://github.com/stjude/cis-x/compare/v1.0.0...v1.1.0
88 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-screen:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # cis-X-screen: screen SV, CNA and SNV/indel calls for a single sample.
 4 | #
 5 | # Usage: cis-X-screen <sample-id> <candidates-result> <candidates-result-run> \
 6 | #            <sv-in> <cna-in> <snvindel-in> <fpkm-matrix> <workdir> <sv-can> \
 7 | #            <cna-can> <snvindel-can> <anno-user> <tad>
 8 | #
 9 | # Needs perl, twoBitToFa and fimo on PATH and the reference files under
10 | # $CIS_X_HOME/refs/external. Every expansion is quoted so that paths containing
11 | # spaces reach the tools as single arguments.
12 | 
13 | CIS_X_HOME=$(realpath "$(dirname "$0")/../../..")
14 | CIS_X_CORE_HOME=$(realpath "$(dirname "$0")/..")
15 | 
16 | SAMPLE_ID=$1
17 | CANDIDATES_RESULT=$2
18 | CANDIDATES_RESULT_RUN=$3
19 | SV_IN=$4
20 | CNA_IN=$5
21 | SNVINDEL_IN=$6
22 | FPKM_MATRIX=$7
23 | WORKDIR=$8
24 | SV_CAN=$9
25 | CNA_CAN=${10}
26 | SNVINDEL_CAN=${11}
27 | ANNO_USER=${12}
28 | TAD=${13}
29 | 
30 | LOG="$WORKDIR/log.txt"
31 | 
32 | SV_WIN=1000000
33 | CNA_WIN=1000000
34 | CNA_SIZE=5000000
35 | SNVINDEL_WIN=200000
36 | TF_FPKM_THRESH=10
37 | PERC_OVERLAP=0.3
38 | 
39 | REFGENE="$CIS_X_HOME/refs/external/hg19_refGene.bed"
40 | REF_2BIT="$CIS_X_HOME/refs/external/GRCh37-lite.2bit"
41 | MOTIF="$CIS_X_HOME/refs/external/HOCOMOCOv10_HUMAN_mono_meme_format.meme"
42 | ROADMAP_ENH="$CIS_X_HOME/refs/external/roadmapData.enhancer.merged.111.bed"
43 | ROADMAP_PRO="$CIS_X_HOME/refs/external/roadmapData.promoter.merged.111.bed"
44 | ROADMAP_DYA="$CIS_X_HOME/refs/external/roadmapData.dyadic.merged.111.bed"
45 | 
46 | SV_TEMP1="$WORKDIR/$SAMPLE_ID.sv.candidates.temp1.txt"
47 | CNA_TEMP1="$WORKDIR/$SAMPLE_ID.cna.candidates.temp1.txt"
48 | SNVINDEL_VAR="$WORKDIR/$SAMPLE_ID.snvindel.varlist.txt"
49 | SNVINDEL_SEQLIST="$WORKDIR/$SAMPLE_ID.snvindel.seqlist.txt"
50 | SNVINDEL_FA="$WORKDIR/$SAMPLE_ID.snvindel.fa"
51 | FIMO_FA_IN="$WORKDIR/$SAMPLE_ID.snvindel.fimo.input.fa"
52 | FIMO_OUT_DIR="$WORKDIR/fimo_out"
53 | FIMO_OUT="$FIMO_OUT_DIR/fimo.txt"
54 | FIMO_ACC2GSYM="$CIS_X_HOME/refs/external/HOCOMOCOv10_annotation_HUMAN_mono.tsv"
55 | 
56 | perl "$CIS_X_CORE_HOME/src/scan.sv.pl" "$SAMPLE_ID" "$CANDIDATES_RESULT" "$CANDIDATES_RESULT_RUN" "$SV_IN" "$SV_TEMP1" "$SV_WIN" "$REFGENE"
57 | perl "$CIS_X_CORE_HOME/src/check.TAD.pl" "$SAMPLE_ID" "$TAD" "$REFGENE" "$SV_TEMP1" "$SV_CAN"
58 | perl "$CIS_X_CORE_HOME/src/scan.cnv.pl" "$SAMPLE_ID" "$CANDIDATES_RESULT" "$CANDIDATES_RESULT_RUN" "$CNA_IN" "$CNA_TEMP1" "$CNA_WIN" "$CNA_SIZE" "$REFGENE" "$PERC_OVERLAP"
59 | perl "$CIS_X_CORE_HOME/src/check.TAD.cnv.pl" "$SAMPLE_ID" "$TAD" "$REFGENE" "$CNA_TEMP1" "$CNA_CAN"
60 | 
61 | perl "$CIS_X_CORE_HOME/src/snvindel.prep.pl" "$SAMPLE_ID" \
62 |     "$SNVINDEL_IN" \
63 |     "$CANDIDATES_RESULT" \
64 |     "$CANDIDATES_RESULT_RUN" \
65 |     "$SV_CAN" \
66 |     "$CNA_CAN" \
67 |     "$TAD" \
68 |     "$SNVINDEL_VAR" \
69 |     "$SNVINDEL_SEQLIST" \
70 |     "$SNVINDEL_WIN" \
71 |     "$REFGENE"
72 | 
73 | twoBitToFa -seqList="$SNVINDEL_SEQLIST" "$REF_2BIT" "$SNVINDEL_FA"
74 | perl "$CIS_X_CORE_HOME/src/merge.fa.pl" "$SAMPLE_ID" "$SNVINDEL_VAR" "$SNVINDEL_FA" "$FIMO_FA_IN"
75 | 
76 | # fimo writes to ./fimo_out unless told otherwise, while the next step reads
77 | # $FIMO_OUT under $WORKDIR. Name the directory explicitly so the two agree
78 | # whatever the current directory is.
79 | fimo --verbosity 1 --thresh 1e-3 --oc "$FIMO_OUT_DIR" "$MOTIF" "$FIMO_FA_IN" 2>> "$LOG"
80 | 
81 | # An empty ANNO_USER is omitted altogether, as it was when unquoted.
82 | perl "$CIS_X_CORE_HOME/src/snvindel.process.pl" \
83 |     "$SAMPLE_ID" \
84 |     "$FIMO_OUT" \
85 |     "$FIMO_ACC2GSYM" \
86 |     "$SNVINDEL_VAR" \
87 |     "$FPKM_MATRIX" \
88 |     "$TF_FPKM_THRESH" \
89 |     "$SNVINDEL_CAN" \
90 |     "$ROADMAP_ENH" \
91 |     "$ROADMAP_PRO" \
92 |     "$ROADMAP_DYA" \
93 |     ${ANNO_USER:+"$ANNO_USER"}
94 | 


--------------------------------------------------------------------------------
/src/ref-exp/src/collect.cohort.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | #
  3 | # collect.cohort.pl - gather per-sample ASE gene calls into one cohort table.
  4 | #
  5 | # Usage: collect.cohort.pl <config> <workdir> <expfile>
  6 | #
  7 | #   config   Tab-delimited sample list with a header line; column 1 is the
  8 | #            sample ID.
  9 | #   workdir  Directory holding <sid>/working_space/<sid>.ase.gene.model.fdr.txt
 10 | #            for each sample. cis-X.refexp.step2.collect.txt is written here.
 11 | #   expfile  Tab-delimited expression matrix with a header line. Column 2 is
 12 | #            the gene; columns 8 onwards hold one value per sample and are
 13 | #            labelled with the sample ID in the header.
 14 | #
 15 | # A sample is recorded under "ase" for a gene when column 23 of its result row
 16 | # is < 0.05 and column 20 is >= 0.3, and under "bi" when column 23 is >= 0.05
 17 | # and column 10 equals 0 (criteria updated on Dec 27, 2017). Other rows, and
 18 | # genes missing from the expression matrix, are ignored.
 19 | 
 20 | use strict;
 21 | 
 22 | my ($config, $workdir, $expfile) = @ARGV;
 23 | die "Usage: $0 <config> <workdir> <expfile>\n" unless defined $expfile;
 24 | 
 25 | my (%sid, %dat, %gene, %g2fpkm, %checksid);
 26 | 
 27 | # Strip the trailing comma from the accumulated sample/FPKM lists of one
 28 | # class and count its samples. Returns (count, samples, fpkm values).
 29 | sub class_summary {
 30 |     my ($entry) = @_;
 31 |     return (0, "", "") unless $entry;
 32 |     my ($sids, $fpkm) = ($entry->{sid}, $entry->{fpkm});
 33 |     $sids =~ s/,$//;
 34 |     $fpkm =~ s/,$//;
 35 |     my @ids = split /,/, $sids;
 36 |     return (scalar(@ids), $sids, $fpkm);
 37 | }
 38 | 
 39 | # Samples to collect.
 40 | open(my $cfg_fh, '<', $config) or die "$config: $!";
 41 | while (my $line = <$cfg_fh>) {
 42 |     next if $. == 1;    # header
 43 |     chomp $line;
 44 |     my @F = split /\t/, $line;
 45 |     next unless @F;     # blank line
 46 |     $sid{$F[0]} = 1;
 47 | }
 48 | close $cfg_fh;
 49 | 
 50 | # Expression matrix: gene -> sample -> value.
 51 | my @col2sid;
 52 | open(my $exp_fh, '<', $expfile) or die "$expfile: $!";
 53 | while (my $line = <$exp_fh>) {
 54 |     chomp $line;
 55 |     my @F = split /\t/, $line;
 56 |     if ($. == 1) {
 57 |         @col2sid = @F;
 58 |         $checksid{$F[$_]} = 1 for 7 .. $#F;
 59 |         next;
 60 |     }
 61 |     next if @F < 2;
 62 |     $gene{$F[1]} = 1;
 63 |     $g2fpkm{$F[1]}{$col2sid[$_]} = $F[$_] for 7 .. $#F;
 64 | }
 65 | close $exp_fh;
 66 | 
 67 | # Every configured sample must have a column in the expression matrix.
 68 | my @missing = grep { !$checksid{$_} } sort keys %sid;
 69 | print "$_ not exist in the exp matrix $expfile.\n" for @missing;
 70 | die("Error: SID printed above not exist in the expression matrix.") if @missing;
 71 | 
 72 | for my $s (sort keys %sid) {
 73 |     my $ase_file = "$workdir/$s/working_space/$s.ase.gene.model.fdr.txt";
 74 |     unless (-e $ase_file) {
 75 |         print "$ase_file not exist.\n";
 76 |         next;
 77 |     }
 78 |     open(my $ase_fh, '<', $ase_file) or die "$ase_file: $!";
 79 |     while (my $line = <$ase_fh>) {
 80 |         next if $. == 1;    # header
 81 |         chomp $line;
 82 |         my @F = split /\t/, $line;
 83 |         # split drops trailing empty fields, so a row whose column 23 is
 84 |         # empty would otherwise be read as 0 and pass the "< 0.05" test.
 85 |         next if @F < 23;
 86 |         my $g = $F[1];
 87 |         next unless $g2fpkm{$g};
 88 |         my $class;
 89 |         if ($F[22] < 0.05 and $F[19] >= 0.3) {
 90 |             $class = "ase";
 91 |         } elsif ($F[22] >= 0.05 and $F[9] == 0) {
 92 |             $class = "bi";
 93 |         } else {
 94 |             next;
 95 |         }
 96 |         $dat{$g}{$class}{sid}  .= "$s,";
 97 |         $dat{$g}{$class}{fpkm} .= "$g2fpkm{$g}{$s},";
 98 |     }
 99 |     close $ase_fh;
100 | }
101 | 
102 | my $outfile = "$workdir/cis-X.refexp.step2.collect.txt";
103 | open(my $out_fh, '>', $outfile) or die "$outfile: $!";
104 | print $out_fh "gene\tpresent.TARGET\tnum.ase.samples\tase.samples\tfpkm.ase.samples\tnum.bi.samples\tbi.samples\tfpkm.bi.samples\n";
105 | for my $g (sort keys %dat) {
106 |     my $present = $gene{$g} ? 1 : 0;
107 |     my ($ase_count, $ase_sid, $fpkm_ase) = class_summary($dat{$g}{ase});
108 |     my ($bi_count, $bi_sid, $fpkm_bi)    = class_summary($dat{$g}{bi});
109 |     print $out_fh "$g\t$present\t$ase_count\t$ase_sid\t$fpkm_ase\t$bi_count\t$bi_sid\t$fpkm_bi\n";
110 | }
111 | close $out_fh or die "$outfile: $!";
112 | 


--------------------------------------------------------------------------------
/src/seed/README.md:
--------------------------------------------------------------------------------
 1 | # cis-X seed
 2 | 
 3 | `cis-X seed` downloads and generates a set of common reference files required
 4 | by cis-X.
 5 | 
 6 | ## Prerequisites
 7 | 
 8 |   * [Ruby] ^2.2.2
 9 |     * [nokogiri] ~1.8.3
10 |   * [faToTwoBit]\*
11 |   * [liftOver]\*
12 | 
13 | \* UCSC Genome Browser binaries are not versioned. The latest versions
14 | _should_ work.
15 | 
16 | [Ruby]: http://ruby-lang.org/
17 | [nokogiri]: http://www.nokogiri.org/
18 | [faToTwoBit]: https://genome.ucsc.edu/goldenpath/help/twoBit.html
19 | [liftOver]: https://genome.ucsc.edu/cgi-bin/hgLiftOver
20 | 
21 | ## Usage
22 | 
23 | ```
24 | $ cis-X seed <out-dir> [tmp-dir]
25 | ```
26 | 
27 | ## References
28 | 
29 | The following files (sans CGC) are created by `cis-X seed`. They are all
30 | required to run cis-X.
31 | 
32 |   * `cancer_gene_census.txt`: Tiers 1 and 2 [Cancer Gene Census] (CGC) from
33 |     COSMIC in TSV. While cis-X is tested with version 87, newer versions should
34 |     work. This file requires an account and is not automatically downloaded.
35 | 
36 |   * `GRCh37-lite.2bit`: Converted from [`GRCh37-lite.fa`] to 2bit using
37 |     [faToTwoBit].
38 | 
39 |   * `hESC.combined.domain.hg19.bed`: Extracted from Hi-C's "[Human ES Cell (H1) topological domains]"
40 |     and preprocessed from hg18 to hg19 using [liftOver].
41 | 
42 |   * `hg19_refGene`: Downloaded from the [UCSC Table Browser] (assembly: Feb.
43 |     2009 (GRCH37/hg19), track: NCBI RefSeq, table: UCSC RefSeq (refGene)).
44 | 
45 |   * `hg19_refGene.bed`: Converted from `hg19_refGene` using
46 |     `bin/hg19_ref_gene_to_bed`.
47 | 
48 |   * `HOCOMOCOv10_HUMAN_mono_meme_format.meme`: Downloaded from [HOCOMOCO v10]
49 |     (Matrices in other formats > MEME).
50 | 
51 |   * `HOCOMOCOv10_annotation_HUMAN_mono.tsv`: Downloaded from [HOCOMOCO v10]
52 |     (Complete model annotation)
53 | 
54 |   * `ImprintGenes.txt`: Copied from Geneimprint [Human Imprinted Genes] as a
55 |     tab-delimited file.
56 | 
57 |   * `roadmapData.dyadic.merged.111.bed`: Downloaded from the [NIH Roadmap Epigenomics Project]
58 |     (Delineation of DNaseI-accessible regulatory regions > Dyadic). All files
59 |     are merged with two extra columns: cell line name and tissue of origin.
60 | 
61 |   * `roadmapData.enhancer.merged.111.bed`: Downloaded from the [NIH Roadmap Epigenomics Project]
62 |     (Delineation of DNaseI-accessible regulatory regions > Enhancer). All files
63 |     are merged with two extra columns: cell line name and tissue of origin.
64 | 
65 |   * `roadmapData.promoter.merged.111.bed`: Downloaded from the [NIH Roadmap Epigenomics Project]
66 |     (Delineation of DNaseI-accessible regulatory regions > Promoter). All files
67 |     are merged with two extra columns: cell line name and tissue of origin.
68 | 
69 | [Cancer Gene Census]: https://cancer.sanger.ac.uk/census
70 | [`GRCh37-lite.fa`]: https://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/special_requests/
71 | [Human ES Cell (H1) topological domains]: http://chromosome.sdsc.edu/mouse/hi-c/download.html
72 | [UCSC Table Browser]: http://genome.ucsc.edu/cgi-bin/hgTables
73 | [HOCOMOCO v10]: http://hocomoco11.autosome.ru/downloads_v10
74 | [Human Imprinted Genes]: http://www.geneimprint.com/site/genes-by-species.Homo+sapiens
75 | [NIH Roadmap Epigenomics Project]: https://egg2.wustl.edu/roadmap/web_portal/index.html
76 | 
77 | ## Example
78 | 
79 | `cis-X seed` will commonly be used to seed the `$CIS_X_HOME/refs/external`
80 | directory.
81 | 
82 | ```
83 | $ cis-X seed $CIS_X_HOME/refs/external
84 | ```
85 | 


--------------------------------------------------------------------------------
/dnanexus/cis-x/src/cis-x.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # DNAnexus applet entry point. Downloads the job inputs and the shared cis-X
 4 | # reference data, runs `cis-X run` inside the bundled Docker image, then
 5 | # uploads the result tables and registers them as job outputs.
 6 | #
 7 | # Inputs are read from shell variables that this script does not set itself
 8 | # (markers, cnv_loh, bam, ...); user_annotation and tad_info are optional.
 9 | main() {
10 |     set -x
11 | 
12 |     REFERENCE_DATA_PROJECT_ID=project-F5444K89PZxXjBqVJ3Pp79B4
13 |     DATA_DIR=$HOME/data
14 |     REFS_DIR=$HOME/refs
15 |     RESULTS_DIR=$HOME/results
16 |     # An array keeps each optional flag and its path as separate words even
17 |     # when the path contains spaces.
18 |     CIS_X_EXTRA_ARGS=()
19 | 
20 |     mkdir -p "$DATA_DIR" "$REFS_DIR" "$RESULTS_DIR"
21 | 
22 |     gzip --decompress --stdout "$RESOURCES/tmp/cis-x-latest.tar.gz" | docker load
23 | 
24 |     dx download --output "$DATA_DIR/wgs.markers.txt" "$markers"
25 |     dx download --output "$DATA_DIR/wgs.cnvloh.txt" "$cnv_loh"
26 |     dx download --output "$DATA_DIR/RNAseq.bam" "$bam"
27 |     dx download --output "$DATA_DIR/RNAseq.bam.bai" "$bai"
28 |     dx download --output "$DATA_DIR/RNAseq_all_fpkm.txt" "$fpkm_matrix"
29 |     dx download --output "$DATA_DIR/mut.txt" "$snv_indel"
30 |     dx download --output "$DATA_DIR/sv.txt" "$sv"
31 |     dx download --output "$DATA_DIR/cna.txt" "$cna"
32 | 
33 |     if [[ -n "$user_annotation" ]]; then
34 |       dx download --output "$DATA_DIR/user_annotation.bed" "$user_annotation"
35 |       CIS_X_EXTRA_ARGS+=(-u /data/user_annotation.bed)
36 |     fi
37 | 
38 |     if [[ -n "$tad_info" ]]; then
39 |       dx download --output "$DATA_DIR/tad_info.bed" "$tad_info"
40 |       CIS_X_EXTRA_ARGS+=(-t /data/tad_info.bed)
41 |     fi
42 | 
43 |     dx download --recursive --output "$REFS_DIR" "$REFERENCE_DATA_PROJECT_ID:/pipeline/cis-X/*"
44 | 
45 |     docker run \
46 |         --mount "type=bind,source=$DATA_DIR,target=/data,readonly" \
47 |         --mount "type=bind,source=$REFS_DIR,target=/opt/cis-x/refs/external,readonly" \
48 |         --mount "type=bind,source=$RESULTS_DIR,target=/results" \
49 |         cis-x \
50 |         run \
51 |         -s "$sample_id" \
52 |         -o /results \
53 |         -l /data/wgs.markers.txt \
54 |         -g /data/wgs.cnvloh.txt \
55 |         -b /data/RNAseq.bam \
56 |         -e /data/RNAseq_all_fpkm.txt \
57 |         -m /data/mut.txt \
58 |         -v /data/sv.txt \
59 |         -c /data/cna.txt \
60 |         -d "$disease" \
61 |         -a "$cnv_loh_action" \
62 |         -w "$min_coverage_wgs" \
63 |         -r "$min_coverage_rna_seq" \
64 |         -f "$fpkm_threshold_candidate" \
65 |         -h "$chr_string" "${CIS_X_EXTRA_ARGS[@]}"
66 | 
67 |     local sample_results="$RESULTS_DIR/$sample_id"
68 | 
69 |     cis_activated_candidates=$(dx upload --brief "$sample_results/$sample_id.cisActivated.candidates.txt")
70 |     sv_candidates=$(dx upload --brief "$sample_results/$sample_id.sv.candidates.txt")
71 |     cna_candidates=$(dx upload --brief "$sample_results/$sample_id.cna.candidates.txt")
72 |     snv_indel_candidates=$(dx upload --brief "$sample_results/$sample_id.snvindel.candidates.txt")
73 |     ohe_results=$(dx upload --brief "$sample_results/$sample_id.OHE.results.txt")
74 |     ase_gene_results=$(dx upload --brief "$sample_results/$sample_id.ase.gene.model.fdr.txt")
75 |     ase_marker_results=$(dx upload --brief "$sample_results/$sample_id.ase.combine.WGS.RNAseq.goodmarkers.binom.txt")
76 | 
77 |     dx-jobutil-add-output --class file cis_activated_candidates "$cis_activated_candidates"
78 |     dx-jobutil-add-output --class file sv_candidates "$sv_candidates"
79 |     dx-jobutil-add-output --class file cna_candidates "$cna_candidates"
80 |     dx-jobutil-add-output --class file snv_indel_candidates "$snv_indel_candidates"
81 |     dx-jobutil-add-output --class file ohe_results "$ohe_results"
82 |     dx-jobutil-add-output --class file ase_gene_results "$ase_gene_results"
83 |     dx-jobutil-add-output --class file ase_marker_results "$ase_marker_results"
84 | }
85 | 


--------------------------------------------------------------------------------
/src/other/mergeData_geneName.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | #
  3 | # mergeData_geneName.pl - convert a per-gene read-count file into count and
  4 | # FPKM tables, overall and split by gene type.
  5 | #
  6 | # Usage: mergeData_geneName.pl <counts_file> <gene_table>
  7 | #
  8 | #   counts_file  Tab-delimited "feature<TAB>count" file. Reading stops at the
  9 | #                "__no_feature" row, whose count is reported separately.
 10 | #   gene_table   Tab-delimited gene annotation. Column 1 is the feature name
 11 | #                used in counts_file, column 3 the gene ID, column 4 the gene
 12 | #                type, columns 5-8 status, chr, start and end. The last column
 13 | #                is the per-gene divisor of the FPKM formula (presumably the
 14 | #                gene length in bases - confirm against the annotation).
 15 | #
 16 | # Writes, in the current directory, RNAseq_GENCODEv19_<set>_count.txt and
 17 | # RNAseq_GENCODEv19_<set>_fpkm.txt for <set> in all, mRNA, lincRNA and
 18 | # antisense, plus nofeature-summary.txt and file.lst.
 19 | use strict;
 20 | 
 21 | my ($input, $GTF) = @ARGV;
 22 | die "Usage: $0 <counts_file> <gene_table>\n" unless defined $GTF;
 23 | 
 24 | my $flst = "file.lst";
 25 | my $fbase = "RNAseq_GENCODEv19";
 26 | 
 27 | # Gene-type subsets: output file label and the pattern matched against the
 28 | # type column.
 29 | my @subsets = (
 30 |     [ 'mRNA',      qr/protein_coding/ ],
 31 |     [ 'lincRNA',   qr/lincRNA/ ],
 32 |     [ 'antisense', qr/antisense/ ],
 33 | );
 34 | 
 35 | # Open a file for writing or die with the reason.
 36 | sub open_out {
 37 |     my ($path) = @_;
 38 |     open(my $fh, '>', $path) or die "$path: $!";
 39 |     return $fh;
 40 | }
 41 | 
 42 | # file.lst was previously produced with `echo $input > file.lst` through the
 43 | # shell. Write it directly so the path is never interpreted by a shell.
 44 | my $lst_fh = open_out($flst);
 45 | print $lst_fh "$input\n";
 46 | close $lst_fh or die "$flst: $!";
 47 | my @flst = ($input);
 48 | 
 49 | my (%sum, %count, @samples);
 50 | foreach my $f (@flst) {
 51 |     # Sample name: file name without "counts", a leading dot and ".txt".
 52 |     my $nm = (split /\//, $f)[-1];
 53 |     $nm =~ s/counts//g;
 54 |     $nm =~ s/^\.//g;
 55 |     $nm =~ s/\.txt//g;
 56 |     push @samples, $nm;
 57 |     $sum{$nm} = 0;
 58 |     open(my $in, '<', $f) or die "$f: $!";
 59 |     while (my $line = <$in>) {
 60 |         chomp $line;
 61 |         my @col = split /\t/, $line;
 62 |         next if @col < 2;
 63 |         if ($col[0] eq '__no_feature') {
 64 |             $count{$nm}{'no_feature'} = $col[1];
 65 |             last;
 66 |         }
 67 |         $count{$nm}{$col[0]} = $col[1];
 68 |         $sum{$nm} += $col[1];
 69 |     }
 70 |     close $in;
 71 | }
 72 | 
 73 | # Open the eight result tables and write their headers.
 74 | my $sample_cols = join("\t", @samples);
 75 | my %out;
 76 | for my $kind ('count', 'fpkm') {
 77 |     $out{$kind}{all} = open_out("${fbase}_all_${kind}.txt");
 78 |     print { $out{$kind}{all} } "GeneID\tGeneName\tType\tStatus\tChr\tStart\tEnd\t$sample_cols\n";
 79 |     for my $subset (@subsets) {
 80 |         my $label = $subset->[0];
 81 |         $out{$kind}{$label} = open_out("${fbase}_${label}_${kind}.txt");
 82 |         print { $out{$kind}{$label} } "GeneID\tGeneName\tStatus\tChr\tStart\tEnd\t$sample_cols\n";
 83 |     }
 84 | }
 85 | 
 86 | open(my $gtf_fh, '<', $GTF) or die "$GTF: $!";   ### for v19.
 87 | while (my $line = <$gtf_fh>) {
 88 |     chomp $line;
 89 |     my @col = split /\t/, $line;
 90 |     next if @col < 8;
 91 |     my $feature = $col[0];
 92 | 
 93 |     # Missing features count as 0; genes without reads in any sample are skipped.
 94 |     my @counts = map { my $c = $count{$_}{$feature}; $c ? $c : 0 } @samples;
 95 |     next unless grep { $_ > 0 } @counts;
 96 | 
 97 |     # FPKM = 1e9 * count / (divisor * total counted reads of the sample).
 98 |     # A zero denominator yields 0 instead of aborting with a division error.
 99 |     my $divisor = $col[$#col];
100 |     my @fpkm = map {
101 |         my $denom = $divisor * $sum{ $samples[$_] };
102 |         sprintf("%.4f", $denom ? 1000000000 * $counts[$_] / $denom : 0);
103 |     } 0 .. $#samples;
104 | 
105 |     my $all_cols    = join("\t", @col[2, 0, 3 .. 7]);
106 |     my $subset_cols = join("\t", @col[2, 0, 4 .. 7]);
107 |     my %cells = (count => join("\t", @counts), fpkm => join("\t", @fpkm));
108 |     for my $kind ('count', 'fpkm') {
109 |         print { $out{$kind}{all} } "$all_cols\t$cells{$kind}\n";
110 |         for my $subset (@subsets) {
111 |             my ($label, $pattern) = @$subset;
112 |             print { $out{$kind}{$label} } "$subset_cols\t$cells{$kind}\n" if $col[3] =~ $pattern;
113 |         }
114 |     }
115 | }
116 | close $gtf_fh;
117 | 
118 | for my $kind (sort keys %out) {
119 |     for my $label (sort keys %{ $out{$kind} }) {
120 |         close $out{$kind}{$label} or die "${fbase}_${label}_${kind}.txt: $!";
121 |     }
122 | }
123 | 
124 | my $summary_fh = open_out("nofeature-summary.txt");
125 | print $summary_fh "Sample\tNo-Feature\tMapped\n";
126 | foreach my $s (@samples) {
127 |     my $no_feature = defined $count{$s}{'no_feature'} ? $count{$s}{'no_feature'} : "";
128 |     print $summary_fh "$s\t$no_feature\t$sum{$s}\n";
129 | }
130 | close $summary_fh or die "nofeature-summary.txt: $!";
131 | 


--------------------------------------------------------------------------------
/src/ref-exp/README.md:
--------------------------------------------------------------------------------
  1 | # cis-X ref-exp
  2 | 
  3 | **cis-X ref-exp** generates reference expression matrices used for outlier
  4 | high expression (OHE) tests.
  5 | 
  6 | cis-X uses precalculated, disease-specific reference expression matrices to
  7 | find outlier high expression (OHE) signals. cis-X includes references for
  8 | pediatric neuroblastoma (NBL) and T-cell acute lymphoblastic leukaemia
  9 | (T-ALL), but user-defined references can be added as well.
 10 | 
 11 | This command helps generate the biallelic expression cases as described
 12 | below (`exp.ref.bi.txt`).
 13 | 
 14 | ## Usage
 15 | 
 16 | ```
 17 | cis-X-ref-exp
 18 | 
 19 | USAGE:
 20 |     cis-X ref-exp <SUBCOMMAND> [args...]
 21 | 
 22 | SUBCOMMANDS:
 23 |     generate    Generate a biallelic reference expression matrix
 24 |     prepare     Create a batch script for preprocessing inputs
 25 |     preprocess  Runs allelic specific expression (ASE) tests on inputs
 26 | ```
 27 | 
 28 | ## Reference matrices
 29 | 
 30 | cis-X performs independent tests with three reference expression matrices per
 31 | disease:
 32 | 
 33 |   * `exp.ref.entire.txt`: the unfiltered cohort.
 34 |   * `exp.ref.bi.txt`: cases with a biallelic expression for a given gene.
 35 |   * `exp.ref.white.txt`: cases without known noncoding regulatory variants for
 36 |     a given gene.
 37 | 
 38 | Reference matrices are tab-delimited files, including a header.
 39 | 
 40 |   * `exp.ref.entire.txt`: gene_id, gene_name, type, status, chr, start, end, id...
 41 |   * `exp.ref.bi.txt`: gene_name, num_cases, id, fpkm
 42 |   * `exp.ref.white.txt`: gene_name, num_cases, id, fpkm
 43 | 
 44 | An extra file `precal.tvalue.bin_gt1.txt` is also used, which contains a
 45 | line-delimited vector of normalized t-values across samples.
 46 | 
 47 | If there is no prior knowledge, it is valid to create a reference matrix with
 48 | no rows (but include the header). Note that having both empty biallelic
 49 | expression and whitelist matrices will result in higher false negative rates
 50 | for cis-activated candidates during analysis.
 51 | 
 52 | Each disease under `$CIS_X_HOME/refs/diseases/$DISEASE` must have these four
 53 | files. `$DISEASE` is the name given when running `cis-X run`. See the `NBL` and
 54 | `TALL` directories for examples of the reference expression matrices and
 55 | normalized t-values vector.
 56 | 
 57 | ## Example
 58 | 
 59 | `cis-X ref-exp` will commonly be used in a three step process: prepare,
 60 | preprocess, and generate.
 61 | 
 62 | ### `prepare`
 63 | 
 64 | The preparation stage creates a batch script from a list of inputs.
 65 | 
 66 | ```
 67 | $ cis-X ref-exp prepare /path/to/config.txt /results true
 68 | ```
 69 | 
 70 | It requires a tab-delimited configuration file with four columns:
 71 | 
 72 |   * sample_id
 73 |   * markers: path to a list of single nucleotide markers
 74 |   * rna_bam: path to an RNA-Seq BAM file
 75 |   * cnv_loh: path to CNV/LOH regions
 76 | 
 77 | The last argument `chr-string` is a boolean (`TRUE` or `FALSE`) for whether
 78 | the reference sequence dictionary names start with "chr".
 79 | 
 80 | The resulting batch script is saved to
 81 | `$RESULTS_DIR/cis-X.refexp.step1.commands.sh`.
 82 | 
 83 | ### `preprocess`
 84 | 
 85 | It is unlikely that the `preprocess` subcommand will be called manually, as
 86 | the batch script produced by the `prepare` stage contains a list of commands
 87 | that call it with inputs from the configuration file. This batch script can
 88 | be submitted to a job runner or executed as a normal script.
 91 | $ bash /results/cis-X.refexp.step1.commands.sh
 92 | ```
 93 | 
 94 | ### `generate`
 95 | 
 96 | The final generation stage creates a biallelic expression matrix from the
 97 | preprocessed outputs.
 98 | 
 99 | ```
100 | $ cis-X ref-exp generate /path/to/config.txt /results /path/to/gene-exp-table.txt
101 | ```
102 | 
103 | To use this with cis-X, copy the output in `$RESULTS_DIR/refexp` to
104 | `$CIS_X_HOME/refs/diseases/$DISEASE`, where `$DISEASE` is any name. Then copy
105 | `$GENE_EXP_TABLE` (the unfiltered cohort) to the same disease directory and
106 | name it `exp.ref.entire.txt`.
107 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
  1 | FROM ubuntu:18.04
  2 | 
  3 | # System packages, grouped below by the component that needs them:
  4 | #   fimo: libxml2-dev libxslt1-dev zlib1g-dev
  5 | #   multtest: r-base build-essential
  6 | #   nokogiri: build-essential ruby-dev zlib1g-dev liblzma-dev
  7 | #   twoBitToFa: libkrb5-3
  8 | # zlib1g-dev is shared by fimo and nokogiri, so it is listed once under "common".
  9 | # Set the timezone before updating to avoid having to interact with tzdata (r-base dep).
 10 | RUN ln -fs /usr/share/zoneinfo/UTC /etc/localtime \
 11 |     && apt-get update \
 12 |     && apt-get install -y \
 13 |         # common
 14 |         build-essential \
 15 |         libkrb5-3 \
 16 |         wget \
 17 |         zlib1g-dev \
 18 |         # bedtools
 19 |         python-minimal \
 20 |         # variants2matrix
 21 |         openjdk-8-jre-headless \
 22 |         # ase, test-outliers
 23 |         r-base \
 24 |         # screen
 25 |         libxml2-dev \
 26 |         libxslt1-dev \
 27 |         # seed
 28 |         liblzma-dev \
 29 |         ruby \
 30 |         ruby-dev \
 31 |     && rm -r /var/lib/apt/lists/*
 32 | 
 33 | # core
 34 | 
 35 | RUN wget -O - https://cpanmin.us | perl - App::cpanminus \
 36 |     && cpanm Data::Compare \
 37 |     && chown --recursive root:root /root/.cpanm
 38 | 
 39 | COPY src/other/meme_glam2_fix_new_gcc.patch /tmp/
 40 | 
 41 | RUN cd /tmp \
 42 |     && wget http://meme-suite.org/meme-software/4.9.0/meme_4.9.0_4.tar.gz \
 43 |     && echo "3feed2e28a5d17aa5fc04e226b7473a0d5a443055993365bf2116708be68c7fe *meme_4.9.0_4.tar.gz" | sha256sum --check \
 44 |     && tar xf meme_4.9.0_4.tar.gz \
 45 |     && cd meme_4.9.0 \
 46 |     && patch -p1 < /tmp/meme_glam2_fix_new_gcc.patch \
 47 |     && ./configure --prefix=/opt/meme \
 48 |     && make -j$(nproc) \
 49 |     && make install \
 50 |     && rm -r /tmp/meme*
 51 | 
 52 | RUN cd /tmp \
 53 |     && wget https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools-2.28.0.tar.gz \
 54 |     && echo "15af6d10ed28fb3113cd3edce742fd4275f224bc06ecb98d70d869940220bc32 *bedtools-2.28.0.tar.gz" | sha256sum --check \
 55 |     && tar xf bedtools-2.28.0.tar.gz \
 56 |     && cd bedtools2 \
 57 |     && make -j$(nproc) \
 58 |     && cp bin/* /usr/local/bin \
 59 |     && rm -r /tmp/bedtools*
 60 | 
 61 | RUN echo 'source("https://raw.githubusercontent.com/Bioconductor/LegacyInstall/827129e25128453f19a61ce0e8f99d903155ad01/biocLite.R"); biocLite("multtest")' \
 62 |     | R --vanilla
 63 | 
 64 | RUN cd /usr/local/bin \
 65 |     && wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v385/twoBitToFa \
 66 |     && chmod +x twoBitToFa
 67 | 
 68 | # seed
 69 | 
 70 | RUN gem install --no-document nokogiri --version 1.12.5
 71 | 
 72 | RUN cd /usr/local/bin \
 73 |     && wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v385/faToTwoBit \
 74 |     && chmod +x faToTwoBit
 75 | 
 76 | RUN cd /usr/local/bin \
 77 |     && wget https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v385/liftOver \
 78 |     && chmod +x liftOver
 79 | 
 80 | # variants2matrix
 81 | 
 82 | ENV V2M_HOME=/opt/variants2matrix
 83 | ENV PERL5LIB=${V2M_HOME}/lib/perl
 84 | ENV CLASSPATH=${V2M_HOME}/lib/java/bambino-1.0.jar:${V2M_HOME}/lib/java/indelxref-1.0.jar:${V2M_HOME}/lib/java/picard.jar:${V2M_HOME}/lib/java/samplenamelib-1.0.jar
 85 | 
 86 | RUN cd /tmp \
 87 |     && wget http://ftp.stjude.org/pub/software/cis-x/variants2matrix.tar.gz \
 88 |     && echo "6502f1bd5d8ec64d357092c21b5eb3b9cefc135a41b8b0d0d3124c2ba2f80311 *variants2matrix.tar.gz" | sha256sum --check \
 89 |     && tar xf variants2matrix.tar.gz --directory /opt --no-same-owner \
 90 |     && rm variants2matrix.tar.gz
 91 | 
 92 | RUN cd /tmp \
 93 |     && wget https://sjr-redesign.stjude.org/content/dam/research-redesign/labs/zhang-lab/cis-x-refs-20200212.tar.gz \
 94 |     && echo "1074dd48157cd00dc407ff06e0bca01c0546d1886e6c1f6fb7d25e1d42b060c0 *cis-x-refs-20200212.tar.gz" | sha256sum --check \
 95 |     && mkdir -p /opt/cis-x/refs \
 96 |     && tar xf cis-x-refs-20200212.tar.gz --strip-components 1 --directory /opt/cis-x/refs --no-same-owner \
 97 |     && rm cis-x-refs-20200212.tar.gz
 98 | 
 99 | # set for ruby
100 | ENV LANG=C.UTF-8
101 | 
102 | ENV PATH=/opt/cis-x/bin:/opt/meme/bin:${V2M_HOME}/bin:${PATH}
103 | 
104 | COPY bin /opt/cis-x/bin
105 | COPY src /opt/cis-x/src
106 | 
107 | ENTRYPOINT ["/opt/cis-x/bin/cis-X"]
108 | 


--------------------------------------------------------------------------------
/src/core/src/01.get.markder.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | #
  3 | # Select heterozygous SNP markers for one sample from a variant table.
  4 | #
  5 | # Positional arguments:
  6 | #   1. sample id (not used by this script)
  7 | #   2. variant table ("high20" from WGS): tab-delimited with a header row
  8 | #   3. CNV/LOH regions: tab-delimited (chrom, start, end); first row skipped
  9 | #   4. output: one "chrom.pos.ref.mut" id per selected marker
 10 | #   5. output: selected markers with read counts and a cnvloh/diploid tag
 11 | #   6. marker ids to drop, one per line; first row skipped
 12 | #   7. minimum tumor read depth (reference + alternative reads)
 13 | #
 14 | # A marker is kept when both alleles are single bases, it lies on chr1-chr22,
 15 | # it is not in the drop list, it has at least <covg> tumor reads with at least
 16 | # 3 per allele, and its tumor alternative-allele fraction is within
 17 | # [$lower, $upper].
 18 | use strict;
 19 | 
 20 | my $sid       = $ARGV[0];
 21 | my $high20    = $ARGV[1];  ### high20 from WGS.
 22 | my $cnvloh_in = $ARGV[2];
 23 | my $snv4_out  = $ARGV[3];
 24 | my $het_out   = $ARGV[4];
 25 | my $bad_lst   = $ARGV[5];
 26 | my $covg      = $ARGV[6];
 27 | 
 28 | my $upper   = 0.7;
 29 | my $lower   = 0.3;
 30 | my %badlst  = ();
 31 | my %chrom   = ();
 32 | my %col2snv = ();
 33 | my %cnvloh  = ();
 34 | 
 35 | # Header name in the variant table => key used in %col2snv.
 36 | my %header2key = (
 37 |     "Chr"                      => "chr",
 38 |     "Pos"                      => "pos",
 39 |     "reference_tumor_count"    => "ref_tum_num",
 40 |     "alternative_tumor_count"  => "mut_tum_num",
 41 |     "Chr_Allele"               => "ref_g",
 42 |     "Alternative_Allele"       => "mut_g",
 43 |     "reference_normal_count"   => "ref_norm_num",
 44 |     "alternative_normal_count" => "mut_norm_num",
 45 | );
 46 | 
 47 | # Only the autosomes are considered.
 48 | for my $i (1 .. 22) {
 49 |     $chrom{"chr" . $i} = 1;
 50 | }
 51 | 
 52 | open my $bad_fh, "<", $bad_lst or die "$bad_lst: $!";
 53 | while (<$bad_fh>) {
 54 |     chomp;
 55 |     next if $. == 1;
 56 |     $badlst{$_} = 1;
 57 | }
 58 | close $bad_fh;
 59 | 
 60 | open my $cnv_fh, "<", $cnvloh_in or die "$cnvloh_in: $!";
 61 | while (<$cnv_fh>) {
 62 |     chomp;
 63 |     next if $. == 1;
 64 |     my @F = split /\t/;
 65 |     my $chr = $F[0];
 66 |     unless ($chr =~ /^chr/) {
 67 |         $chr = "chr" . $chr;
 68 |     }
 69 |     my $id = "$chr.$F[1].$F[2]";
 70 |     $cnvloh{chrom}{$chr}{$id}   = 1;
 71 |     $cnvloh{region}{$id}{chr}   = $chr;
 72 |     $cnvloh{region}{$id}{start} = $F[1];
 73 |     $cnvloh{region}{$id}{end}   = $F[2];
 74 | }
 75 | close $cnv_fh;
 76 | 
 77 | open my $het_fh,  ">", $het_out  or die "$het_out: $!";
 78 | open my $snv4_fh, ">", $snv4_out or die "$snv4_out: $!";
 79 | print $het_fh "chrom\tposition\tref\tmut\tref_T\tref_G\tmut_T\tmut_G\tcnvlohTag\n";
 80 | open my $h20_fh, "<", $high20 or die "$high20: $!";
 81 | while (<$h20_fh>) {
 82 |     chomp;
 83 |     my @F = split /\t/;
 84 |     if ($. == 1) {
 85 |         for my $i (0 .. $#F) {
 86 |             $col2snv{$header2key{$F[$i]}} = $i if exists $header2key{$F[$i]};
 87 |         }
 88 |         # Without these columns every later lookup would silently read column 0.
 89 |         my @missing = grep { !exists $col2snv{$header2key{$_}} } sort keys %header2key;
 90 |         die "$high20: missing column(s): @missing\n" if @missing;
 91 |         next;
 92 |     }
 93 |     my ($pos, $ref, $mut)  = @F[@col2snv{qw(pos ref_g mut_g)}];
 94 |     my ($ref_tum, $mut_tum) = @F[@col2snv{qw(ref_tum_num mut_tum_num)}];
 95 |     ### make sure only SNP was included: both alleles must be exactly one base.
 96 |     next unless ($ref =~ /^[ACGT]$/ and $mut =~ /^[ACGT]$/);
 97 |     my $chrom_i = $F[$col2snv{chr}];
 98 |     unless ($chrom_i =~ /^chr/) {
 99 |         $chrom_i = "chr" . $chrom_i;
100 |     }
101 |     next unless $chrom{$chrom_i};
102 |     my $cvg = $ref_tum + $mut_tum;
103 |     next unless $cvg >= $covg;
104 |     next unless ($ref_tum >= 3 and $mut_tum >= 3); ### Require minimal 3 reads for each allele. This is for low covg sites, like 8. 2019-04-08.
105 |     my $snv4 = "$chrom_i.$pos.$ref.$mut";
106 |     next if $badlst{$snv4};  ## drop the BAD markers.
107 | 
108 |     ### Markers in cnv-loh regions are not filtered here (updated 2018-12-04); they only get a tag.
109 |     my $tag = "diploid";
110 |     for my $id (keys %{$cnvloh{chrom}{$chrom_i}}) {
111 |         if ($pos >= $cnvloh{region}{$id}{start} and $pos <= $cnvloh{region}{$id}{end}) {
112 |             $tag = "cnvloh";
113 |             last;  # one overlapping region is enough to set the tag
114 |         }
115 |     }
116 | 
117 |     my $maf = $mut_tum / $cvg;
118 |     if ($lower <= $maf and $maf <= $upper) {
119 |         print $het_fh join("\t", $chrom_i, $pos, $ref, $mut, $ref_tum, $F[$col2snv{ref_norm_num}], $mut_tum, $F[$col2snv{mut_norm_num}], $tag), "\n";
120 |         print $snv4_fh "$snv4\n";
121 |     }
122 | }
123 | close $h20_fh;
124 | close $het_fh  or die "$het_out: $!";
125 | close $snv4_fh or die "$snv4_out: $!";
126 | 


--------------------------------------------------------------------------------
/src/core/src/07.gene.model.Oct2017.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | #
  3 | # Summarize per-marker allelic imbalance results into one row per gene model.
  4 | #
  5 | # Positional arguments:
  6 | #   1. sample id (not used by this script)
  7 | #   2. per-marker table: tab-delimited with a header row. Columns read
  8 | #      (0-based): 0 chrom, 1 position, 0-3 joined as the marker id, 6 compared
  9 | #      against the RNA coverage cutoff, 7 RNA MAF, 8 tag (diploid/cnvloh),
 10 | #      11 p-value, 12 allelic imbalance (AI)
 11 | #   3. gene model table: tab-delimited with a header row. Columns read
 12 | #      (0-based): 1 gene id, 2 chrom, 3 strand, 4 start, 5 end, 12 name,
 13 | #      13 cdsStartStat, 14 cdsEndStat
 14 | #   4. AI threshold for markers tagged diploid
 15 | #   5. AI threshold for markers tagged cnvloh
 16 | #   6. p-value cutoff; a marker is ASE when AI >= threshold and p < cutoff
 17 | #   7. CNV/LOH action; "drop" discards markers tagged cnvloh
 18 | #   8. output file
 19 | #   9. minimum RNA coverage per marker (values below 3, or a missing value, become 3)
 20 | use strict;
 21 | 
 22 | my $sid       = $ARGV[0];
 23 | my $input     = $ARGV[1];
 24 | my $refgene   = $ARGV[2];
 25 | my $ai_thresh_di = $ARGV[3];
 26 | my $ai_thresh_cnv = $ARGV[4];
 27 | my $pvalue    = $ARGV[5];
 28 | my $cnv_loh_action = $ARGV[6];
 29 | my $output    = $ARGV[7];
 30 | my $covg_rna = $ARGV[8];
 31 | 
 32 | $covg_rna = 3 if !defined $covg_rna or $covg_rna < 3;  ### 2019-04-08.
 33 | 
 34 | # Join a list with commas, or return "na" for an empty list.
 35 | sub join_or_na {
 36 |     return @_ ? join(",", @_) : "na";
 37 | }
 38 | 
 39 | my (%gene, @gene, %chr2g, %g2ase);
 40 | 
 41 | open my $ref_fh, "<", $refgene or die "$refgene: $!";
 42 | while (<$ref_fh>) {
 43 |     chomp;
 44 |     next if $. == 1;
 45 |     my @F = split /\t/;
 46 |     my $acc = $F[1];
 47 |     push @gene, $acc;   # keeps the input order for the output
 48 |     $chr2g{$F[2]}{$acc} = 1;
 49 |     @{$gene{$acc}}{qw(chrom strand start end name cdsstartstat cdsendstat)} = @F[2, 3, 4, 5, 12, 13, 14];
 50 | }
 51 | close $ref_fh;
 52 | 
 53 | open my $in_fh, "<", $input or die "$input: $!";
 54 | while (<$in_fh>) {
 55 |     chomp;
 56 |     next if $. == 1;
 57 |     my @F = split /\t/;
 58 |     my ($chrom, $pos) = @F[0, 1];
 59 |     my ($maf_rna, $tag, $pval, $ai) = @F[7, 8, 11, 12];
 60 |     my $snv4 = "$F[0].$F[1].$F[2].$F[3]";
 61 |     next unless $F[6] >= $covg_rna;  ### 2019-04-08.
 62 |     next if $cnv_loh_action eq "drop" and $tag eq "cnvloh";
 63 | 
 64 |     my $ase = "no";
 65 |     if ($tag eq "diploid") {
 66 |         $ase = "yes" if $ai >= $ai_thresh_di and $pval < $pvalue;
 67 |     } elsif ($tag eq "cnvloh") {
 68 |         $ase = "yes" if $ai >= $ai_thresh_cnv and $pval < $pvalue;
 69 |     } else {
 70 |         # Diagnostics go to STDERR; the marker is still recorded as non-ASE.
 71 |         warn "Error: wrong tag $tag for $snv4.\n";
 72 |     }
 73 | 
 74 |     # Attach the marker to every gene model on this chromosome that spans it.
 75 |     for my $g (keys %{$chr2g{$chrom}}) {
 76 |         next unless $pos >= $gene{$g}{start} and $pos <= $gene{$g}{end};
 77 |         $g2ase{$g}{$snv4} = {
 78 |             ai     => $ai,
 79 |             ase    => $ase,
 80 |             pval   => $pval,
 81 |             tag    => $tag,
 82 |             mafrna => $maf_rna,
 83 |         };
 84 |     }
 85 | }
 86 | close $in_fh;
 87 | 
 88 | open my $out_fh, ">", $output or die "$output: $!";
 89 | print $out_fh "gene\tgsym\tchrom\tstrand\tstart\tend\tcdsStartStat\tcdsEndStat\tmarkers\tase_markers\taverage_ai_all\taverage_ai_ase\tpval_all_markers\tpval_ase_markers\tai_all_markers\tai_ase_markers\ttag_all_markers\tmaf_rna_all_markers\n";
 90 | for my $g (@gene) {
 91 |     my (@p_all, @ai_all, @tag_all, @maf_all, @p_ase, @ai_ase);
 92 |     my ($sum_all, $sum_ase) = (0, 0);
 93 |     # "|| {}" avoids creating an entry for genes without markers.
 94 |     for my $m (sort keys %{$g2ase{$g} || {}}) {
 95 |         my $rec = $g2ase{$g}{$m};
 96 |         $sum_all += $rec->{ai};
 97 |         push @p_all,   $rec->{pval};
 98 |         push @ai_all,  $rec->{ai};
 99 |         push @tag_all, $rec->{tag};
100 |         push @maf_all, $rec->{mafrna};
101 |         next unless $rec->{ase} eq "yes";
102 |         $sum_ase += $rec->{ai};
103 |         push @p_ase,  $rec->{pval};
104 |         push @ai_ase, $rec->{ai};
105 |     }
106 |     my $markers     = scalar @p_all;
107 |     my $ase_markers = scalar @p_ase;
108 |     my $avg_all = $markers     ? sprintf("%.3f", $sum_all / $markers)     : "na";
109 |     my $avg_ase = $ase_markers ? sprintf("%.3f", $sum_ase / $ase_markers) : "na";
110 |     print $out_fh join("\t",
111 |         $g,
112 |         @{$gene{$g}}{qw(name chrom strand start end cdsstartstat cdsendstat)},
113 |         $markers,
114 |         $ase_markers,
115 |         $avg_all,
116 |         $avg_ase,
117 |         join_or_na(@p_all),
118 |         join_or_na(@p_ase),
119 |         join_or_na(@ai_all),
120 |         join_or_na(@ai_ase),
121 |         join_or_na(@tag_all),
122 |         join_or_na(@maf_all),
123 |     ), "\n";
124 | }
125 | close $out_fh or die "$output: $!";
126 | 


--------------------------------------------------------------------------------
/src/seed/bin/cis-X-seed:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | # Download and build cis-X reference files into <out-dir>; files already present are kept.
  3 | set -e
  4 | 
  5 | CIS_X_SEED_HOME=$(realpath "$(dirname "$0")/..")
  6 | 
  7 | OUT_DIR=$1
  8 | TMP_DIR=${2:-/tmp}
  9 | 
 10 | if [ $# -lt 1 ]; then
 11 |     basename "$0"
 12 |     echo
 13 |     echo "USAGE:"
 14 |     echo "    cis-X seed <out-dir> [tmp-dir]"
 15 |     exit 1
 16 | fi
 17 | 
 18 | if [ ! -f "$OUT_DIR/GRCh37-lite.2bit" ]; then
 19 |     wget --directory-prefix "$TMP_DIR" \
 20 |         https://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/special_requests/GRCh37-lite.fa.gz
 21 |     # wget --directory-prefix $TMP_DIR \
 22 |     #     https://ftp.ncbi.nih.gov/genomes/archive/old_genbank/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37/special_requests/GRCh37-lite.md5.checksum
 23 |     # md5sum --check $TMP_DIR/GRCh37-lite.fa.gz.md5
 24 |     gzip --decompress "$TMP_DIR/GRCh37-lite.fa.gz"
 25 |     faToTwoBit "$TMP_DIR/GRCh37-lite.fa" "$OUT_DIR/GRCh37-lite.2bit"
 26 | fi
 27 | 
 28 | if [ ! -f "$OUT_DIR/hg19_refGene" ]; then
 29 |     wget --output-document "$OUT_DIR/hg19_refGene" \
 30 |         --post-data 'clade=mammal&org=Human&db=hg19&hgta_group=genes&hgta_track=refSeqComposite&hgta_table=refGene&hgta_regionType=genome&position=chr21%3A33031597-33041570&hgta_outputType=primaryTable&boolshad.sendToGalaxy=0&boolshad.sendToGreat=0&boolshad.sendToGenomeSpace=0&hgta_outFileName=hg19_refGene&hgta_compressType=none&hgta_doTopSubmit=get+output' \
 31 |         http://genome.ucsc.edu/cgi-bin/hgTables
 32 | fi
 33 | 
 34 | if [ ! -f "$OUT_DIR/hg19_refGene.bed" ]; then
 35 |     "$CIS_X_SEED_HOME/bin/hg19_ref_gene_to_bed" "$OUT_DIR/hg19_refGene" > "$OUT_DIR/hg19_refGene.bed"
 36 | fi
 37 | 
 38 | if [ ! -f "$OUT_DIR/HOCOMOCOv10_HUMAN_mono_meme_format.meme" ]; then
 39 |     wget --directory-prefix "$OUT_DIR" \
 40 |         http://hocomoco10.autosome.ru/final_bundle/HUMAN/mono/HOCOMOCOv10_HUMAN_mono_meme_format.meme
 41 | fi
 42 | 
 43 | if [ ! -f "$OUT_DIR/HOCOMOCOv10_annotation_HUMAN_mono.tsv" ]; then
 44 |     wget --directory-prefix "$OUT_DIR" \
 45 |         http://hocomoco10.autosome.ru/final_bundle/HUMAN/mono/HOCOMOCOv10_annotation_HUMAN_mono.tsv
 46 | fi
 47 | 
 48 | if [ ! -f "$OUT_DIR/hESC.combined.domain.hg19.bed" ]; then
 49 |     wget --directory-prefix "$TMP_DIR" http://chromosome.sdsc.edu/mouse/hi-c/hESC.domain.tar.gz
 50 |     tar xf "$TMP_DIR/hESC.domain.tar.gz" --directory "$TMP_DIR"
 51 | 
 52 |     wget --directory-prefix "$TMP_DIR" \
 53 |         http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz
 54 | 
 55 |     liftOver \
 56 |         "$TMP_DIR/hESC/combined/total.combined.domain" \
 57 |         "$TMP_DIR/hg18ToHg19.over.chain.gz" \
 58 |         "$OUT_DIR/hESC.combined.domain.hg19.bed" \
 59 |         /dev/null
 60 | fi
 61 | 
 62 | if [ ! -f "$OUT_DIR/ImprintGenes.txt" ]; then
 63 |     "$CIS_X_SEED_HOME/bin/scrape_geneimprint" \
 64 |         http://www.geneimprint.com/site/genes-by-species.Homo+sapiens \
 65 |         > "$OUT_DIR/ImprintGenes.txt"
 66 | fi
 67 | 
 68 | # The QC spreadsheet is passed to every roadmap merge below, so fetch it when
 69 | # any of the three merged outputs is still missing.
 70 | if [ ! -f "$OUT_DIR/roadmapData.promoter.merged.111.bed" ] \
 71 |     || [ ! -f "$OUT_DIR/roadmapData.enhancer.merged.111.bed" ] \
 72 |     || [ ! -f "$OUT_DIR/roadmapData.dyadic.merged.111.bed" ]
 73 | then
 74 |     wget --output-document "$TMP_DIR/jul2013.roadmapData.qc.csv" \
 75 |         'https://docs.google.com/spreadsheets/d/1yikGx4MsO9Ei36b64yOy9Vb6oPC5IBGlFbYEt-N6gOM/export?format=csv'
 76 | fi
 77 | 
 78 | if [ ! -f "$OUT_DIR/roadmapData.promoter.merged.111.bed" ]; then
 79 |     wget --no-directories --no-parent --recursive --level 1 --accept bed.gz --directory-prefix "$TMP_DIR/prom" \
 80 |         https://egg2.wustl.edu/roadmap/data/byDataType/dnase/BED_files_prom/
 81 |     gzip --decompress "$TMP_DIR"/prom/*.bed.gz
 82 |     "$CIS_X_SEED_HOME/bin/merge_roadmap" \
 83 |         "$TMP_DIR/prom" \
 84 |         "$TMP_DIR/jul2013.roadmapData.qc.csv" \
 85 |         > "$OUT_DIR/roadmapData.promoter.merged.111.bed"
 86 | fi
 87 | 
 88 | if [ ! -f "$OUT_DIR/roadmapData.enhancer.merged.111.bed" ]; then
 89 |     wget --no-directories --no-parent --recursive --level 1 --accept bed.gz --directory-prefix "$TMP_DIR/enh" \
 90 |         https://egg2.wustl.edu/roadmap/data/byDataType/dnase/BED_files_enh/
 91 |     gzip --decompress "$TMP_DIR"/enh/*.bed.gz
 92 |     "$CIS_X_SEED_HOME/bin/merge_roadmap" \
 93 |         "$TMP_DIR/enh" \
 94 |         "$TMP_DIR/jul2013.roadmapData.qc.csv" \
 95 |         > "$OUT_DIR/roadmapData.enhancer.merged.111.bed"
 96 | fi
 97 | 
 98 | if [ ! -f "$OUT_DIR/roadmapData.dyadic.merged.111.bed" ]; then
 99 |     wget --no-directories --no-parent --recursive --level 1 --accept bed.gz --directory-prefix "$TMP_DIR/dyadic" \
100 |         https://egg2.wustl.edu/roadmap/data/byDataType/dnase/BED_files_dyadic/
101 |     gzip --decompress "$TMP_DIR"/dyadic/*.bed.gz
102 |     # Merge the dyadic files downloaded above (not the enhancer directory).
103 |     "$CIS_X_SEED_HOME/bin/merge_roadmap" \
104 |         "$TMP_DIR/dyadic" \
105 |         "$TMP_DIR/jul2013.roadmapData.qc.csv" \
106 |         > "$OUT_DIR/roadmapData.dyadic.merged.111.bed"
107 | fi
108 | 


--------------------------------------------------------------------------------
/src/core/src/exp.check.R:
--------------------------------------------------------------------------------
  1 | # Outlier high expression test for a single sample.
  2 | #
  3 | # For every gene with FPKM > 0 in the sample, log10(FPKM + 0.1) is compared
  4 | # with three reference sets (biallelic cases, entire cohort, whitelist cases).
  5 | # For each set the output holds the reference size, the upper-tail probability
  6 | # of the t statistic, the rank of the sample, the t statistic itself, and the
  7 | # fraction of precalculated t values above it. All five are "na" when the gene
  8 | # is absent from that reference.
  9 | #
 10 | # Arguments: sample id, FPKM matrix, biallelic reference, cohort reference,
 11 | #            whitelist reference, output file, precalculated t values.
 12 | 
 13 | argv <- commandArgs(TRUE)
 14 | 
 15 | sample <- argv[1]
 16 | fpkm_in <- argv[2]
 17 | ref_bi_in <- argv[3]
 18 | ref_cohort_in <- argv[4]
 19 | ref_white_in <- argv[5]
 20 | outfile <- argv[6]
 21 | precalt <- argv[7]
 22 | 
 23 | out.cols <- c("Gene","fpkm.raw","size.bi","p.bi","rank.bi","tstatistic.bi","qval.bi","size.cohort","p.cohort","rank.cohort","tstatistic.cohort","qval.cohort","size.white","p.white","rank.white","tstatistic.white","qval.white")
 24 | 
 25 | fpkm.raw <- read.table(fpkm_in,sep="\t",header=T,row.names=2,quote="",stringsAsFactors=F)
 26 | if (!sample %in% colnames(fpkm.raw)) {
 27 |     stop("sample not present in fpkm matrix.")
 28 | }
 29 | fpkm.sid <- fpkm.raw[,sample]
 30 | names(fpkm.sid) <- rownames(fpkm.raw)
 31 | 
 32 | ref.bi <- read.table(ref_bi_in,sep="\t",row.names=1,quote="",header=T,stringsAsFactors=F)
 33 | ref.cohort <- read.table(ref_cohort_in,sep="\t",header=T,row.names=2,quote="",stringsAsFactors=F)
 34 | ref.white <- read.table(ref_white_in,sep="\t",row.names=1,quote="",header=T,stringsAsFactors=F)
 35 | ref.rawtval <- read.table(precalt,sep="\t",stringsAsFactors=F)
 36 | ref.t <- ref.rawtval[,1]
 37 | ref.t.total <- length(ref.t)
 38 | 
 39 | # Score x (log10 scale) against the reference values y.raw (log10 scale) whose
 40 | # sample ids are y.sid. The tested sample is removed from the reference first.
 41 | # Returns a character vector: size, p, rank, t statistic, t percentile.
 42 | score.ref <- function(x, y.raw, y.sid) {
 43 |   if (sample %in% y.sid) {
 44 |     y <- y.raw[!y.sid %in% sample]
 45 |   } else {
 46 |     y <- y.raw
 47 |   }
 48 |   n <- length(y)
 49 |   t.val <- (x-mean(y))/((1+(n-2)^-1)*(sd(y)^2))^0.5
 50 |   p.val <- pt(t.val,n-2,lower.tail=F)
 51 |   rank <- length(y[y>x])+1
 52 |   t.perc <- (length(ref.t[ref.t>t.val])+1)/ref.t.total
 53 |   # n stays an integer so it is printed without exponent notation.
 54 |   c(as.character(n), as.character(c(p.val, rank, t.val, t.perc)))
 55 | }
 56 | 
 57 | # Split the comma-separated field in column j of the row for gene g. The
 58 | # as.character() guards against read.table typing the column as numeric.
 59 | split.field <- function(ref, g, j) {
 60 |   unlist(strsplit(as.character(ref[g,j]),",",perl=T))
 61 | }
 62 | 
 63 | na5 <- rep("na", 5)
 64 | # Rows are collected in a list and bound once, instead of growing a matrix.
 65 | rows <- vector("list", length(fpkm.sid))
 66 | for (i in seq_along(fpkm.sid)) {
 67 |   x.i.raw <- unname(fpkm.sid[i])
 68 |   # Genes without a positive FPKM (including missing values) are not reported.
 69 |   if (is.na(x.i.raw) || x.i.raw <= 0) next
 70 |   g.i <- names(fpkm.sid)[i]
 71 |   x.i <- log10(x.i.raw+0.1)
 72 |   res.bi <- na5
 73 |   res.cohort <- na5
 74 |   res.white <- na5
 75 |   if (g.i %in% rownames(ref.bi)) {
 76 |     res.bi <- score.ref(x.i,
 77 |                         log10(as.numeric(split.field(ref.bi, g.i, 3))+0.1),
 78 |                         split.field(ref.bi, g.i, 2))
 79 |   }
 80 |   if (g.i %in% rownames(ref.white)) {
 81 |     res.white <- score.ref(x.i,
 82 |                            log10(as.numeric(split.field(ref.white, g.i, 3))+0.1),
 83 |                            split.field(ref.white, g.i, 2))
 84 |   }
 85 |   if (g.i %in% rownames(ref.cohort)) {
 86 |     # Columns from the 7th onward are read as per-sample values, as in the original code.
 87 |     res.cohort <- score.ref(x.i,
 88 |                             log10(as.numeric(ref.cohort[g.i,7:ncol(ref.cohort)])+0.1),
 89 |                             colnames(ref.cohort)[7:ncol(ref.cohort)])
 90 |   }
 91 |   rows[[i]] <- c(g.i, x.i.raw, res.bi, res.cohort, res.white)
 92 | }
 93 | 
 94 | rows <- rows[!vapply(rows, is.null, logical(1))]
 95 | if (length(rows) > 0) {
 96 |   out <- do.call(rbind, rows)
 97 | } else {
 98 |   # No gene passed the filter: still write a header-only table.
 99 |   out <- matrix(character(0), nrow=0, ncol=length(out.cols))
100 | }
101 | colnames(out) <- out.cols
102 | write.table(out,file=outfile,sep="\t",row.names=F,quote=F)
103 | 


--------------------------------------------------------------------------------
/src/core/src/scan.sv.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | 
  3 | ### Annotate structural-variant (SV) breakpoints with nearby candidate genes.
  4 | ###
  5 | ### Usage:
  6 | ###   scan.sv.pl <sid> <ase_result> <ase_result_run> <sv_result> <output> <win> <refgene>
  7 | ###
  8 | ### An SV row is written to <output> when its left breakpoint (columns 1-2) or
  9 | ### right breakpoint (columns 4-5) lies within <win> bp of a known gene body.
 10 | ### Four columns are prepended: comma-separated gene symbols and distances for
 11 | ### the left breakpoint, then the same pair for the right breakpoint.
 12 | 
 13 | use strict;
 14 | use warnings;
 15 | 
 16 | my $sid            = $ARGV[0];   # positional placeholder; not used below
 17 | my $ase_result     = $ARGV[1];
 18 | my $ase_result_run = $ARGV[2];
 19 | my $sv_result      = $ARGV[3];
 20 | my $output         = $ARGV[4];
 21 | my $win            = $ARGV[5];
 22 | my $refgene        = $ARGV[6];
 23 | 
 24 | my (%genes,%chr2g,%candidates,%g2query);
 25 | 
 26 | ### Return (symbol => distance) for every known gene on $chrom whose body,
 27 | ### padded by $win on each side, contains $pos. Distance is $pos minus the
 28 | ### gene start on the "+" strand and $pos minus the gene end otherwise.
 29 | sub genes_near {
 30 |     my ($chrom, $pos) = @_;
 31 |     $chrom = "chr" . $chrom unless $chrom =~ /^chr/;
 32 |     my %hit;
 33 |     return %hit unless $chr2g{$chrom};
 34 |     # Sorted so the output is reproducible: when several accessions share a
 35 |     # symbol the last one wins, and plain keys() order varies between runs.
 36 |     for my $acc (sort keys %{$chr2g{$chrom}}) {
 37 |         my $gene = $genes{$acc};
 38 |         next unless $pos >= $gene->{start} - $win and $pos <= $gene->{end} + $win;
 39 |         my $anchor = $gene->{strand} eq "+" ? $gene->{start} : $gene->{end};
 40 |         $hit{$gene->{gsym}} = $pos - $anchor;
 41 |     }
 42 |     return %hit;
 43 | }
 44 | 
 45 | ### Gene-level ASE table (header skipped). The first six columns are read as
 46 | ### accession, symbol, chrom, strand, start, end.
 47 | open(my $ase_fh, '<', $ase_result) or die "$ase_result: $!";
 48 | while (my $line = <$ase_fh>) {
 49 |     chomp $line;
 50 |     next if $. == 1;
 51 |     my @F = split /\t/, $line;
 52 |     $genes{$F[0]} = {
 53 |         gsym   => $F[1],
 54 |         chrom  => $F[2],
 55 |         strand => $F[3],
 56 |         start  => $F[4],
 57 |         end    => $F[5],
 58 |     };
 59 |     $chr2g{$F[2]}{$F[0]} = 1;
 60 |     $candidates{$F[1]}   = 1;
 61 | }
 62 | close $ase_fh;
 63 | 
 64 | ### Run-level ASE table (header skipped). Column 9 holds comma-separated gene
 65 | ### symbols; symbols not seen above are queued for a refgene lookup
 66 | ### (tag 1 = not annotated yet, tag 2 = annotated).
 67 | open(my $run_fh, '<', $ase_result_run) or die "$ase_result_run: $!";
 68 | while (my $line = <$run_fh>) {
 69 |     chomp $line;
 70 |     next if $. == 1;
 71 |     my @F = split /\t/, $line;
 72 |     next unless $F[8];
 73 |     for my $g (split /,/, $F[8]) {
 74 |         $g2query{$g}{tag} = 1 unless $candidates{$g};
 75 |     }
 76 | }
 77 | close $run_fh;
 78 | 
 79 | ### refgene rows are read as chrom, start, end, symbol, accession, strand.
 80 | ### For each queued symbol the longest record (end - start) is kept.
 81 | open(my $ref_fh, '<', $refgene) or die "$refgene: $!";
 82 | while (my $line = <$ref_fh>) {
 83 |     chomp $line;
 84 |     my @F = split /\t/, $line;
 85 |     my $query = $g2query{$F[3]} or next;
 86 |     my $len = $F[2] - $F[1];
 87 |     next unless $query->{tag} == 1 or $len > $query->{len};
 88 |     @{$query}{qw(acc chrom strand start end len tag)}
 89 |         = ($F[4], $F[0], $F[5], $F[1], $F[2], $len, 2);
 90 | }
 91 | close $ref_fh;
 92 | 
 93 | ### Promote every annotated query gene into the gene tables.
 94 | for my $g (sort keys %g2query) {
 95 |     my $query = $g2query{$g};
 96 |     if ($query->{tag} == 1) {
 97 |         print "Error, $g not annotated.\n";
 98 |         next;
 99 |     }
100 |     $candidates{$g} = 1;
101 |     my $acc = $query->{acc};
102 |     $genes{$acc} = {
103 |         gsym   => $g,
104 |         chrom  => $query->{chrom},
105 |         strand => $query->{strand},
106 |         start  => $query->{start},
107 |         end    => $query->{end},
108 |     };
109 |     $chr2g{$query->{chrom}}{$acc} = 1;
110 | }
111 | 
112 | ### Scan the SV table; the header line gets the four new column names.
113 | open(my $sv_fh, '<', $sv_result) or die "$sv_result: $!";
114 | open(my $out_fh, '>', $output) or die "$output: $!";
115 | while (my $line = <$sv_fh>) {
116 |     chomp $line;
117 |     if ($. == 1) {
118 |         print {$out_fh} "gsym_left\tdist_left\tgsym_right\tdist_right\t$line\n";
119 |         next;
120 |     }
121 |     my @F = split /\t/, $line;
122 |     my %left  = genes_near($F[0], $F[1]);
123 |     my %right = genes_near($F[3], $F[4]);
124 |     next unless %left or %right;
125 |     # Symbols that are empty or "0" are left out of the lists, as before.
126 |     my @lsym = grep { $_ } sort keys %left;
127 |     my @rsym = grep { $_ } sort keys %right;
128 |     print {$out_fh} join("\t",
129 |         join(",", @lsym), join(",", map { $left{$_} } @lsym),
130 |         join(",", @rsym), join(",", map { $right{$_} } @rsym),
131 |         $line), "\n";
132 | }
133 | close $sv_fh;
134 | close $out_fh or die "$output: $!";
135 | 


--------------------------------------------------------------------------------
/src/core/src/ase.candidate.byrun.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | 
  3 | ### Nominate candidate genes for each allele-specific-expression (ASE) run.
  4 | ###
  5 | ### Usage:
  6 | ###   ase.candidate.byrun.pl <sid> <thresh_fpkm> <thresh_loo_pvalue> \
  7 | ###       <ase_result_run> <ohe_result> <outfile> <imprinting_genes> \
  8 | ###       <oncogenes> <num_markers> <ase_result_gene> \
  9 | ###       <thresh_ase_delta_di> <thresh_ase_delta_cnv>
 10 | ###
 11 | ### Rows of <ase_result_run> whose column 6 is at least <num_markers> are
 12 | ### copied to <outfile> with an extra "Candidates" column: the genes listed in
 13 | ### column 8 that pass the expression and ASE filters in is_candidate().
 14 | 
 15 | use strict;
 16 | use warnings;
 17 | 
 18 | my $sid                  = $ARGV[0];   # positional placeholder; not used below
 19 | my $thresh_fpkm          = $ARGV[1];
 20 | my $thresh_loo_pvalue    = $ARGV[2];
 21 | my $ase_result_run       = $ARGV[3];
 22 | my $ohe_result           = $ARGV[4];
 23 | my $outfile              = $ARGV[5];
 24 | my $imprinting_genes     = $ARGV[6];
 25 | my $oncogenes            = $ARGV[7];
 26 | my $num_markers          = $ARGV[8];
 27 | my $ase_result_gene      = $ARGV[9];
 28 | my $thresh_ase_delta_di  = $ARGV[10];
 29 | my $thresh_ase_delta_cnv = $ARGV[11];
 30 | 
 31 | my (%imprint,%g2loo,%oncog,%g2ase);
 32 | 
 33 | ### Decide whether gene symbol $g is reported as a candidate.
 34 | ###   1. Genes marked "Imprinted" and genes without an OHE record are dropped.
 35 | ###   2. The OHE q-value must be below <thresh_loo_pvalue>, and the FPKM must
 36 | ###      reach <thresh_fpkm> (or 1 for genes in the oncogene table).
 37 | ###   3. A gene without gene-level ASE data passes. Otherwise it needs p < 0.1
 38 | ###      and a delta of at least the diploid or cnvloh threshold, where the
 39 | ###      cnvloh threshold applies if over 30% of its tags are "cnvloh".
 40 | sub is_candidate {
 41 |     my ($g) = @_;
 42 |     return 0 if $imprint{$g} and $imprint{$g} eq "Imprinted";
 43 |     my $loo = $g2loo{$g} or return 0;
 44 |     # NOTE(review): a non-numeric value such as "na" compares as 0 in the
 45 |     # numeric tests below and therefore passes them - confirm this is intended.
 46 |     return 0 unless $loo->{tperc} < $thresh_loo_pvalue;
 47 |     return 0 unless $loo->{fpkm} >= $thresh_fpkm
 48 |                  or ($oncog{$g} and $loo->{fpkm} >= 1);
 49 |     my $ase = $g2ase{$g} or return 1;
 50 |     my @tag  = split /,/, $ase->{tag};
 51 |     my $ncnv = grep { $_ eq "cnvloh" } @tag;
 52 |     # An empty tag field used to abort the run with "Illegal division by
 53 |     # zero"; such genes are now classified as diploid.
 54 |     my $is_cnvloh = (@tag and $ncnv / @tag > 0.3);
 55 |     my $min_delta = $is_cnvloh ? $thresh_ase_delta_cnv : $thresh_ase_delta_di;
 56 |     return ($ase->{pval} < 0.1 and $ase->{delta} >= $min_delta) ? 1 : 0;
 57 | }
 58 | 
 59 | ### Imprinting table (header skipped): column 1 -> column 4.
 60 | open(my $imp_fh, '<', $imprinting_genes) or die "$imprinting_genes: $!";
 61 | while (my $line = <$imp_fh>) {
 62 |     chomp $line;
 63 |     next if $. == 1;
 64 |     my @F = split /\t/, $line;
 65 |     $imprint{$F[0]} = $F[3];
 66 | }
 67 | close $imp_fh;
 68 | 
 69 | ### Oncogene table (header skipped, double quotes stripped). Rows with an
 70 | ### empty column 15, rows labelled "TSG" and the four listed loci are ignored.
 71 | my %skip_locus = map { $_ => 1 } ("IGH", "IGK", "IGL", "HLA-A");
 72 | open(my $onc_fh, '<', $oncogenes) or die "$oncogenes: $!";
 73 | while (my $line = <$onc_fh>) {
 74 |     chomp $line;
 75 |     next if $. == 1;
 76 |     $line =~ s/\"//g;
 77 |     my @F = split /\t/, $line;
 78 |     next unless $F[14];
 79 |     next if $F[14] eq "TSG";
 80 |     next if $skip_locus{$F[0]};
 81 |     $oncog{$F[0]} = $F[14];
 82 | }
 83 | close $onc_fh;
 84 | 
 85 | ### Gene-level ASE table (header skipped), keyed by column 2.
 86 | open(my $ase_fh, '<', $ase_result_gene) or die "$ase_result_gene: $!";
 87 | while (my $line = <$ase_fh>) {
 88 |     chomp $line;
 89 |     next if $. == 1;
 90 |     my @F = split /\t/, $line;
 91 |     $g2ase{$F[1]} = {
 92 |         pval  => $F[20],   ### using raw p here.
 93 |         delta => $F[19],
 94 |         tag   => $F[16],
 95 |     };
 96 | }
 97 | close $ase_fh;
 98 | 
 99 | ### OHE table (header skipped). Three groups of columns are tried in order:
100 | ### white list (from column 13), bi cohort (from column 3), then entire
101 | ### cohort (from column 8). A group is used when its first column is not "na";
102 | ### the entire cohort is the unconditional fallback. Within a group the
103 | ### offsets are +0 size, +2 rank, +3 t value, +4 q-value.
104 | open(my $ohe_fh, '<', $ohe_result) or die "$ohe_result: $!";
105 | while (my $line = <$ohe_fh>) {
106 |     chomp $line;
107 |     next if $. == 1;
108 |     my @F = split /\t/, $line;
109 |     my ($source, $off) = $F[12] ne "na" ? ("white_list",    12)
110 |                        : $F[2]  ne "na" ? ("bi_cohort",      2)
111 |                        :                  ("entire_cohort",  7);
112 |     $g2loo{$F[0]} = {
113 |         fpkm   => $F[1],
114 |         size   => $F[$off],
115 |         rank   => $F[$off + 2],
116 |         tval   => $F[$off + 3],
117 |         tperc  => $F[$off + 4],
118 |         source => $source,
119 |     };
120 | }
121 | close $ohe_fh;
122 | 
123 | ### Annotate the run-level table.
124 | open(my $run_fh, '<', $ase_result_run) or die "$ase_result_run: $!";
125 | open(my $out_fh, '>', $outfile) or die "$outfile: $!";
126 | while (my $line = <$run_fh>) {
127 |     chomp $line;
128 |     if ($. == 1) {
129 |         print {$out_fh} "$line\tCandidates\n";
130 |         next;
131 |     }
132 |     my @F = split /\t/, $line;
133 |     next unless $F[5] >= $num_markers;
134 |     my @picked = $F[7] ? grep { is_candidate($_) } split(/,/, $F[7]) : ();
135 |     print {$out_fh} "$line\t", join(",", @picked), "\n";
136 | }
137 | close $run_fh;
138 | close $out_fh or die "$outfile: $!";
139 | 


--------------------------------------------------------------------------------
/src/core/src/check.TAD.cnv.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | 
  3 | ### Keep CNV candidates whose gene promoter shares a TAD with a CNV breakpoint.
  4 | ###
  5 | ### Usage:
  6 | ###   check.TAD.cnv.pl <sampleid> <tad_ref> <refgene> <input> <output>
  7 | ###
  8 | ### To be consistent with the previous filter, the promoter is used.
  9 | ### Promoter is defined as 2kb upstream and 200bp downstream of the TSS, as in doi:10.1038/ng.3101
 10 | ### TAD was combined hESC and IMR90 from Bing Ren's paper.
 11 | ###
 12 | ### <input> columns as read here: 1 = comma-separated gene symbols, 3 = chrom,
 13 | ### 4 and 5 = the two CNV breakpoints. Column 1 is replaced by the genes that
 14 | ### pass, column 2 is dropped, and rows where no gene passes are omitted.
 15 | 
 16 | use strict;
 17 | use warnings;
 18 | 
 19 | my $sampleid = $ARGV[0];   # positional placeholder; not used below
 20 | my $tad_ref  = $ARGV[1];
 21 | my $refgene  = $ARGV[2];
 22 | my $input    = $ARGV[3];
 23 | my $output   = $ARGV[4];
 24 | 
 25 | my (%tad,%g2pro);
 26 | 
 27 | ### Return 1 when a promoter of gene $g on $chrom overlaps the TAD $dom (a
 28 | ### hash ref with start/end), else 0. Diagnostics are printed to STDOUT.
 29 | sub promoter_overlaps {
 30 |     my ($g, $chrom, $dom) = @_;
 31 |     unless ($g2pro{$g}) {
 32 |         print "No promoter info for $g.\n";
 33 |         return 0;
 34 |     }
 35 |     my $overlap = 0;
 36 |     for my $acc (sort keys %{$g2pro{$g}}) {
 37 |         my $pro = $g2pro{$g}{$acc};
 38 |         if ($pro->{chrom} ne $chrom) {
 39 |             print "Wrong chromosome for gene $g.\n";
 40 |             next;
 41 |         }
 42 |         next if $pro->{start} > $dom->{end};
 43 |         $overlap = 1 if $pro->{start} >= $dom->{start}
 44 |                      or $pro->{end}   >= $dom->{start};
 45 |     }
 46 |     return $overlap;
 47 | }
 48 | 
 49 | ### TAD table: chrom, start, end. Every record is labelled "hESC".
 50 | open(my $tad_fh, '<', $tad_ref) or die "$tad_ref: $!";
 51 | while (my $line = <$tad_fh>) {
 52 |     chomp $line;
 53 |     my @F = split /\t/, $line;
 54 |     $tad{$F[0]}{"$F[0].$F[1].$F[2]"} = {
 55 |         start  => $F[1],
 56 |         end    => $F[2],
 57 |         source => "hESC",
 58 |     };
 59 | }
 60 | close $tad_fh;
 61 | 
 62 | ### refgene table: chrom, start, end, symbol, accession, strand.
 63 | ### Records on _random, _hap and chrUn sequences are ignored.
 64 | open(my $ref_fh, '<', $refgene) or die "$refgene: $!";
 65 | while (my $line = <$ref_fh>) {
 66 |     chomp $line;
 67 |     my @F = split /\t/, $line;
 68 |     next if $F[0] =~ /_random/ or $F[0] =~ /_hap/ or $F[0] =~ /chrUn/;
 69 |     if ($F[5] eq "+") {
 70 |         $g2pro{$F[3]}{$F[4]} = {
 71 |             start => $F[1] - 2000,
 72 |             end   => $F[1] + 200,
 73 |             chrom => $F[0],
 74 |         };
 75 |     }elsif ($F[5] eq "-") {
 76 |         $g2pro{$F[3]}{$F[4]} = {
 77 |             start => $F[2] - 200,
 78 |             end   => $F[2] + 2000,
 79 |             chrom => $F[0],
 80 |         };
 81 |     }else {
 82 |         print "Wrong strand info: $F[5] for $F[4].\n";
 83 |     }
 84 | }
 85 | close $ref_fh;
 86 | 
 87 | open(my $out_fh, '>', $output) or die "$output: $!";
 88 | open(my $in_fh, '<', $input) or die "$input: $!";
 89 | while (my $line = <$in_fh>) {
 90 |     chomp $line;
 91 |     my @F = split /\t/, $line;
 92 |     if ($. == 1) {
 93 |         print {$out_fh} join("\t", "candidate.inTAD", @F[2 .. $#F]), "\n";
 94 |         next;
 95 |     }
 96 |     my %targets;
 97 |     if ($F[0]) {
 98 |         my $chrom = $F[2];
 99 |         $chrom = "chr" . $chrom unless $chrom =~ /^chr/;
100 |         my @g = split /,/, $F[0];
101 |         my $doms = $tad{$chrom} || {};
102 |         for my $id (sort keys %$doms) {
103 |             my $dom = $doms->{$id};
104 |             # Left breakpoint first, then right, so STDOUT messages keep
105 |             # the order they had when the two checks were written out twice.
106 |             for my $pos ($F[3], $F[4]) {
107 |                 next unless $pos >= $dom->{start} and $pos <= $dom->{end};
108 |                 for my $g (@g) {
109 |                     $targets{$g} = 1 if promoter_overlaps($g, $chrom, $dom);
110 |                 }
111 |             }
112 |         }
113 |     }
114 |     my $target = join(',', sort keys %targets);
115 |     next unless $target;
116 |     print {$out_fh} join("\t", $target, @F[2 .. $#F]), "\n";
117 | }
118 | close $in_fh;
119 | close $out_fh or die "$output: $!";
120 | 
121 | 


--------------------------------------------------------------------------------
/src/core/src/scan.cnv.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | 
  3 | ### Annotate copy-number segments with nearby candidate genes.
  4 | ###
  5 | ### Usage:
  6 | ###   scan.cnv.pl <sid> <ase_result> <ase_result_run> <cnv_result> <output> \
  7 | ###       <win> <size> <refgene> <perc_overlap>
  8 | ###
  9 | ### A CNV row (chrom, start, end in columns 1-3) no longer than <size> is
 10 | ### written to <output> when it lies within <win> bp of a known gene and covers
 11 | ### less than the fraction <perc_overlap> of that gene's body. Two columns are
 12 | ### prepended: comma-separated gene symbols and distances.
 13 | 
 14 | use strict;
 15 | use warnings;
 16 | 
 17 | my $sid             = $ARGV[0];   # positional placeholder; not used below
 18 | my $ase_result      = $ARGV[1];
 19 | my $ase_result_run  = $ARGV[2];
 20 | my $cnv_result      = $ARGV[3];
 21 | my $output          = $ARGV[4];
 22 | my $win             = $ARGV[5];
 23 | my $size            = $ARGV[6];
 24 | my $refgene         = $ARGV[7];
 25 | my $perc_overlap    = $ARGV[8];
 26 | 
 27 | my (%genes,%chr2g,%candidates,%g2query);
 28 | 
 29 | ### Number of bases of the gene body [$gstart, $gend] covered by the segment
 30 | ### [$left, $right]; 0 when the two do not intersect.
 31 | sub overlap_bp {
 32 |     my ($left, $right, $gstart, $gend) = @_;
 33 |     if ($left <= $gstart and $right >= $gstart) {
 34 |         return ($right < $gend ? $right : $gend) - $gstart;
 35 |     }
 36 |     if ($left >= $gstart and $left <= $gend) {
 37 |         return ($right <= $gend ? $right : $gend) - $left;
 38 |     }
 39 |     return 0;
 40 | }
 41 | 
 42 | ### Gene-level ASE table (header skipped). The first six columns are read as
 43 | ### accession, symbol, chrom, strand, start, end.
 44 | open(my $ase_fh, '<', $ase_result) or die "$ase_result: $!";
 45 | while (my $line = <$ase_fh>) {
 46 |     chomp $line;
 47 |     next if $. == 1;
 48 |     my @F = split /\t/, $line;
 49 |     $genes{$F[0]} = {
 50 |         gsym   => $F[1],
 51 |         chrom  => $F[2],
 52 |         strand => $F[3],
 53 |         start  => $F[4],
 54 |         end    => $F[5],
 55 |     };
 56 |     $chr2g{$F[2]}{$F[0]} = 1;
 57 |     $candidates{$F[1]}   = 1;
 58 | }
 59 | close $ase_fh;
 60 | 
 61 | ### Run-level ASE table (header skipped). Column 9 holds comma-separated gene
 62 | ### symbols; symbols not seen above are queued for a refgene lookup
 63 | ### (tag 1 = not annotated yet, tag 2 = annotated).
 64 | open(my $run_fh, '<', $ase_result_run) or die "$ase_result_run: $!";
 65 | while (my $line = <$run_fh>) {
 66 |     chomp $line;
 67 |     next if $. == 1;
 68 |     my @F = split /\t/, $line;
 69 |     next unless $F[8];
 70 |     for my $g (split /,/, $F[8]) {
 71 |         $g2query{$g}{tag} = 1 unless $candidates{$g};
 72 |     }
 73 | }
 74 | close $run_fh;
 75 | 
 76 | ### refgene rows are read as chrom, start, end, symbol, accession, strand.
 77 | ### For each queued symbol the longest record (end - start) is kept.
 78 | open(my $ref_fh, '<', $refgene) or die "$refgene: $!";
 79 | while (my $line = <$ref_fh>) {
 80 |     chomp $line;
 81 |     my @F = split /\t/, $line;
 82 |     my $query = $g2query{$F[3]} or next;
 83 |     my $len = $F[2] - $F[1];
 84 |     next unless $query->{tag} == 1 or $len > $query->{len};
 85 |     @{$query}{qw(acc chrom strand start end len tag)}
 86 |         = ($F[4], $F[0], $F[5], $F[1], $F[2], $len, 2);
 87 | }
 88 | close $ref_fh;
 89 | 
 90 | ### Promote every annotated query gene into the gene tables.
 91 | for my $g (sort keys %g2query) {
 92 |     my $query = $g2query{$g};
 93 |     if ($query->{tag} == 1) {
 94 |         print "Error, $g not annotated.\n";
 95 |         next;
 96 |     }
 97 |     $candidates{$g} = 1;
 98 |     my $acc = $query->{acc};
 99 |     $genes{$acc} = {
100 |         gsym   => $g,
101 |         chrom  => $query->{chrom},
102 |         strand => $query->{strand},
103 |         start  => $query->{start},
104 |         end    => $query->{end},
105 |     };
106 |     $chr2g{$query->{chrom}}{$acc} = 1;
107 | }
108 | 
109 | ### Scan the CNV table; the header line gets the two new column names.
110 | open(my $cnv_fh, '<', $cnv_result) or die "$cnv_result: $!";
111 | open(my $out_fh, '>', $output) or die "$output: $!";
112 | while (my $line = <$cnv_fh>) {
113 |     chomp $line;
114 |     if ($. == 1) {
115 |         print {$out_fh} "gsym\tdist\t$line\n";
116 |         next;
117 |     }
118 |     my @F = split /\t/, $line;
119 |     my ($chrom, $pos_left, $pos_right) = @F[0 .. 2];
120 |     next unless $pos_right - $pos_left <= $size;
121 |     $chrom = "chr" . $chrom unless $chrom =~ /^chr/;
122 |     my %target;
123 |     if ($chr2g{$chrom}) {
124 |         # Sorted so the output is reproducible: when several accessions share
125 |         # a symbol the last one wins, and plain keys() order varies.
126 |         for my $acc (sort keys %{$chr2g{$chrom}}) {
127 |             my $gene = $genes{$acc};
128 |             my ($gstart, $gend) = ($gene->{start}, $gene->{end});
129 |             next if $pos_left > $gend + $win or $pos_right < $gstart - $win;
130 |             my $glen = $gend - $gstart;
131 |             # A zero-length gene model used to abort the run with "Illegal
132 |             # division by zero"; it cannot be covered, so its fraction is 0.
133 |             my $frac = $glen
134 |                      ? overlap_bp($pos_left, $pos_right, $gstart, $gend) / $glen
135 |                      : 0;
136 |             next unless $frac < $perc_overlap;
137 |             # Distance from the segment edge closest to the gene start ("+")
138 |             # or gene end (any other strand); the left edge wins ties.
139 |             my $anchor = $gene->{strand} eq "+" ? $gstart : $gend;
140 |             my $dist = $pos_left - $anchor;
141 |             $dist = $pos_right - $anchor
142 |                 if abs($pos_right - $anchor) < abs($pos_left - $anchor);
143 |             $target{$gene->{gsym}} = $dist;
144 |         }
145 |     }
146 |     next unless %target;
147 |     # Symbols that are empty or "0" are left out of the lists, as before.
148 |     my @sym = grep { $_ } sort keys %target;
149 |     print {$out_fh} join("\t",
150 |         join(",", @sym), join(",", map { $target{$_} } @sym), $line), "\n";
151 | }
152 | close $cnv_fh;
153 | close $out_fh or die "$output: $!";
154 | 
155 | 


--------------------------------------------------------------------------------
/src/core/src/check.TAD.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | 
  3 | ### Keep SV candidates whose gene promoter shares a TAD with an SV breakpoint.
  4 | ###
  5 | ### Usage:
  6 | ###   check.TAD.pl <sampleid> <tad_ref> <refgene> <input> <output>
  7 | ###
  8 | ### To be consistent with the previous filter, the promoter is used.
  9 | ### Promoter is defined as 2kb upstream and 200bp downstream of the TSS, as in doi:10.1038/ng.3101
 10 | ### TAD was combined hESC and IMR90 from Bing Ren's paper.
 11 | ###
 12 | ### <input> columns as read here: 1 and 3 = comma-separated gene symbols for
 13 | ### the left and right breakpoint, 5-6 = left chrom and position, 8-9 = right
 14 | ### chrom and position. The first four columns are replaced by the two
 15 | ### filtered gene lists; rows where both lists are empty are omitted.
 16 | 
 17 | use strict;
 18 | use warnings;
 19 | 
 20 | my $sampleid = $ARGV[0];   # positional placeholder; not used below
 21 | my $tad_ref  = $ARGV[1];
 22 | my $refgene  = $ARGV[2];
 23 | my $input    = $ARGV[3];
 24 | my $output   = $ARGV[4];
 25 | 
 26 | my (%tad,%g2pro);
 27 | 
 28 | ### Return 1 when a promoter of gene $g on $chrom overlaps the TAD $dom (a
 29 | ### hash ref with start/end), else 0. Diagnostics are printed to STDOUT.
 30 | sub promoter_overlaps {
 31 |     my ($g, $chrom, $dom) = @_;
 32 |     unless ($g2pro{$g}) {
 33 |         print "No promoter info for $g.\n";
 34 |         return 0;
 35 |     }
 36 |     my $overlap = 0;
 37 |     for my $acc (sort keys %{$g2pro{$g}}) {
 38 |         my $pro = $g2pro{$g}{$acc};
 39 |         if ($pro->{chrom} ne $chrom) {
 40 |             print "Wrong chromosome for gene $g.\n";
 41 |             next;
 42 |         }
 43 |         next if $pro->{start} > $dom->{end};
 44 |         $overlap = 1 if $pro->{start} >= $dom->{start}
 45 |                      or $pro->{end}   >= $dom->{start};
 46 |     }
 47 |     return $overlap;
 48 | }
 49 | 
 50 | ### Return the comma-joined subset of the symbols in $gsyms whose promoter
 51 | ### overlaps a TAD on $chrom that contains $pos. Each symbol is listed once
 52 | ### even when $pos falls inside several TADs (it used to be repeated).
 53 | sub genes_in_tad {
 54 |     my ($gsyms, $chrom, $pos) = @_;
 55 |     $chrom = "chr" . $chrom unless $chrom =~ /^chr/;
 56 |     my $doms = $tad{$chrom} or return "";
 57 |     my @g = split /,/, $gsyms;
 58 |     my (@kept, %seen);
 59 |     for my $id (sort keys %$doms) {
 60 |         my $dom = $doms->{$id};
 61 |         next unless $pos >= $dom->{start} and $pos <= $dom->{end};
 62 |         for my $g (@g) {
 63 |             # promoter_overlaps() runs first so its messages are unchanged.
 64 |             push @kept, $g if promoter_overlaps($g, $chrom, $dom) and !$seen{$g}++;
 65 |         }
 66 |     }
 67 |     return join(",", @kept);
 68 | }
 69 | 
 70 | ### TAD table: chrom, start, end. Every record is labelled "hESC".
 71 | open(my $tad_fh, '<', $tad_ref) or die "$tad_ref: $!";
 72 | while (my $line = <$tad_fh>) {
 73 |     chomp $line;
 74 |     my @F = split /\t/, $line;
 75 |     $tad{$F[0]}{"$F[0].$F[1].$F[2]"} = {
 76 |         start  => $F[1],
 77 |         end    => $F[2],
 78 |         source => "hESC",
 79 |     };
 80 | }
 81 | close $tad_fh;
 82 | 
 83 | ### refgene table: chrom, start, end, symbol, accession, strand.
 84 | ### Records on _random, _hap and chrUn sequences are ignored.
 85 | open(my $ref_fh, '<', $refgene) or die "$refgene: $!";
 86 | while (my $line = <$ref_fh>) {
 87 |     chomp $line;
 88 |     my @F = split /\t/, $line;
 89 |     next if $F[0] =~ /_random/ or $F[0] =~ /_hap/ or $F[0] =~ /chrUn/;
 90 |     if ($F[5] eq "+") {
 91 |         $g2pro{$F[3]}{$F[4]} = {
 92 |             start => $F[1] - 2000,
 93 |             end   => $F[1] + 200,
 94 |             chrom => $F[0],
 95 |         };
 96 |     }elsif ($F[5] eq "-") {
 97 |         $g2pro{$F[3]}{$F[4]} = {
 98 |             start => $F[2] - 200,
 99 |             end   => $F[2] + 2000,
100 |             chrom => $F[0],
101 |         };
102 |     }else {
103 |         print "Wrong strand info: $F[5] for $F[4].\n";
104 |     }
105 | }
106 | close $ref_fh;
107 | 
108 | open(my $out_fh, '>', $output) or die "$output: $!";
109 | open(my $in_fh, '<', $input) or die "$input: $!";
110 | while (my $line = <$in_fh>) {
111 |     chomp $line;
112 |     my @F = split /\t/, $line;
113 |     if ($. == 1) {
114 |         print {$out_fh} join("\t", "left.candidate.inTAD",
115 |             "right.candidate.inTAD", @F[4 .. $#F]), "\n";
116 |         next;
117 |     }
118 |     my $left  = $F[0] ? genes_in_tad($F[0], $F[4], $F[5]) : "";
119 |     my $right = $F[2] ? genes_in_tad($F[2], $F[7], $F[8]) : "";
120 |     next unless $left or $right;
121 |     print {$out_fh} join("\t", $left, $right, @F[4 .. $#F]), "\n";
122 | }
123 | close $in_fh;
124 | close $out_fh or die "$output: $!";
125 | 
126 | 


--------------------------------------------------------------------------------
/dnanexus/cis-x/dxapp.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "name": "cis-x",
  3 |   "title": "St. Jude cis-X",
  4 |   "summary": "Search for activating regulatory variants in the tumor genome",
  5 |   "dxapi": "1.0.0",
  6 |   "version": "1.5.0-2",
  7 |   "openSource": true,
  8 |   "details": {
  9 |     "upstreamUrl": "https://github.com/stjude/cis-x",
 10 |     "upstreamVersion": "1.5.0",
 11 |     "upstreamLicenses": ["Apache-2.0"]
 12 |   },
 13 |   "inputSpec": [
 14 |     {
 15 |       "name": "sample_id",
 16 |       "label": "Sample ID",
 17 |       "class": "string"
 18 |     },
 19 |     {
 20 |       "name": "markers",
 21 |       "label": "Single nucleotide variants",
 22 |       "class": "file",
 23 |       "help": "Tab-delimited text file containing raw sequence variants"
 24 |     },
 25 |     {
 26 |       "name": "cnv_loh",
 27 |       "label": "CNV/LOH regions",
 28 |       "class": "file",
 29 |       "help": "Tab-delimited text file containing any aneuploidy region existing in the tumor genome under analysis"
 30 |     },
 31 |     {
 32 |       "name": "bam",
 33 |       "label": "RNA-seq BAM",
 34 |       "class": "file"
 35 |     },
 36 |     {
 37 |       "name": "bai",
 38 |       "label": "RNA-seq BAM index",
 39 |       "class": "file"
 40 |     },
 41 |     {
 42 |       "name": "fpkm_matrix",
 43 |       "label": "Gene expression table",
 44 |       "class": "file",
 45 |       "help": "Tab-delimited text file containing gene level expressions for the tumor under analysis in FPKM (fragments per kilobase of transcript per million mapped reads)"
 46 |     },
 47 |     {
 48 |       "name": "snv_indel",
 49 |       "label": "Somatic SNV/indel list",
 50 |       "class": "file",
 51 |       "help": "Tab-delimited file containing somatic SNV/indels in the tumor genome"
 52 |     },
 53 |     {
 54 |       "name": "sv",
 55 |       "label": "Somatic SV",
 56 |       "class": "file",
 57 |       "help": "Tab-delimited file containing somatic acquired structural variants in the tumor genome"
 58 |     },
 59 |     {
 60 |       "name": "cna",
 61 |       "label": "Somatic CNV",
 62 |       "class": "file",
 63 |       "help": "Tab-delimited file containing copy number aberrations in the tumor genome"
 64 |     },
 65 |     {
 66 |       "name": "disease",
 67 |       "label": "Disease subtype",
 68 |       "class": "string",
 69 |       "choices": ["NBL", "TALL"]
 70 |     },
 71 |     {
 72 |       "name": "cnv_loh_action",
 73 |       "label": "CNV/LOH action",
 74 |       "class": "string",
 75 |       "choices": ["keep", "drop"],
 76 |       "default": "keep"
 77 |     },
 78 |     {
 79 |       "name": "min_coverage_wgs",
 80 |       "label": "Minimum coverage in WGS",
 81 |       "class": "int",
 82 |       "default": 10
 83 |     },
 84 |     {
 85 |       "name": "min_coverage_rna_seq",
 86 |       "label": "Minimum coverage in RNA-seq",
 87 |       "class": "int",
 88 |       "default": 10
 89 |     },
 90 |     {
 91 |       "name": "fpkm_threshold_candidate",
 92 |       "label": "Candidate FPKM threshold",
 93 |       "class": "float",
 94 |       "default": 5.0
 95 |     },
 96 |     {
 97 |       "name": "user_annotation",
 98 |       "label": "User annotations",
 99 |       "class": "file",
100 |       "optional": true
101 |     },
102 |     {
103 |       "name": "chr_string",
104 |       "label": "Reference sequence names have 'chr' prefix",
105 |       "class": "string",
106 |       "choices": ["TRUE", "FALSE"],
107 |       "default": "TRUE"
108 |     },
109 |     {
110 |       "name": "tad_info",
111 |       "label": "TAD annotations",
112 |       "class": "file",
113 |       "optional": true
114 |     }
115 |   ],
116 |   "outputSpec": [
117 |     {
118 |       "name": "cis_activated_candidates",
119 |       "label": "cis-activated candidates",
120 |       "class": "file",
121 |       "help": "cis-activated candidates in the tumor genome under analysis"
122 |     },
123 |     {
124 |       "name": "sv_candidates",
125 |       "label": "SV candidates",
126 |       "class": "file",
127 |       "help": "Structural variants (SV) candidates predicted as the causal for the cis-activated genes in the regulatory territory"
128 |     },
129 |     {
130 |       "name": "cna_candidates",
131 |       "label": "CNA candidates",
132 |       "class": "file",
133 |       "help": "Copy number aberrations (CNA) predicted as the causal for the cis-activated genes in the regulatory territory"
134 |     },
135 |     {
136 |       "name": "snv_indel_candidates",
137 |       "label": "SNV/indel candidates",
138 |       "class": "file",
139 |       "help": "SNV/indel candidates predicted as functional and predicted transcription factors"
140 |     },
141 |     {
142 |       "name": "ohe_results",
143 |       "label": "OHE results",
144 |       "class": "file",
145 |       "help": "Raw outlier high expression (OHE) results"
146 |     },
147 |     {
148 |       "name": "ase_gene_results",
149 |       "label": "Gene level ASE results",
150 |       "class": "file",
151 |       "help": "Raw gene level allelic specific expression (ASE) results"
152 |     },
153 |     {
154 |       "name": "ase_marker_results",
155 |       "label": "Single marker ASE results",
156 |       "class": "file",
157 |       "help": "Raw single marker allelic specific expression (ASE) results"
158 |     }
159 |   ],
160 |   "runSpec": {
161 |     "timeoutPolicy": {
162 |       "*": {
163 |         "hours": 48
164 |       }
165 |     },
166 |     "interpreter": "bash",
167 |     "distribution": "Ubuntu",
168 |     "release": "16.04",
169 |     "version": "1",
170 |     "file": "src/cis-x.sh"
171 |   },
172 |   "regionalOptions": {
173 |     "azure:westus": {
174 |       "systemRequirements": {
175 |         "*": {
176 |           "instanceType": "azure:mem1_ssd1_x4"
177 |         }
178 |       }
179 |     }
180 |   },
181 |   "access": {
182 |     "allProjects": "VIEW",
183 |     "project": "CONTRIBUTE"
184 |   },
185 |   "authorizedUsers": [
186 |     "org-stjude_cloud",
187 |     "org-stjude_cloud_app_runners"
188 |   ]
189 | }
190 | 


--------------------------------------------------------------------------------
/src/core/bin/cis-X-run:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # cis-X-run: run the cis-X core pipeline for a single sample.
  4 | #
  5 | # Parses the command line, validates the inputs, then runs cis-X-mark,
  6 | # cis-X-build-matrix, cis-X-ase, cis-X-test-outliers, cis-X-nominate and
  7 | # cis-X-screen in that order inside <results-dir>/<sample-id>/working_space,
  8 | # and finally copies the result tables up to <results-dir>/<sample-id>.
  9 | 
 10 | set -e
 11 | 
 12 | CIS_X_HOME=$(realpath "$(dirname "$0")/../../..")
 13 | CIS_X_CORE_HOME=$(realpath "$(dirname "$0")/..")
 14 | PATH=$CIS_X_CORE_HOME/bin:$PATH
 15 | 
 16 | # Print the command name followed by the option summary.
 17 | usage() {
 18 |     basename "$0"
 19 |     echo
 20 |     echo "USAGE:"
 21 |     echo "    cis-X run -s <sample-id> -o <results-dir> -l <markers> -g <cnv-loh> -b <bam> -e <fpkm-matrix> -m <snv-indel> -v <sv> -c <cna> -d <disease> -a <cnv-loh-action> -w <min coverage in WGS> -r <min coverage in RNA-seq> -f <FPKM threshold for nominate cis-activated candidate> -u <user-annotation> -h <chr-string> -t <tad-info>"
 22 |     echo
 23 |     echo "ARGS:"
 24 |     echo "    -s <sample-id>       Sample ID"
 25 |     echo "    -o <results-dir>     Output directory"
 26 |     echo "    -l <markers>         Path to single nucleotide markers"
 27 |     echo "    -g <cnv-loh>         Path to CNV/LOH regions"
 28 |     echo "    -b <bam>             Path to a RNA-Seq BAM (index must be in same directory)"
 29 |     echo "    -e <fpkm-matrix>     Path to gene expression table"
 30 |     echo "    -m <snv-indel>       Path to somatic SNV/indels"
 31 |     echo "    -v <sv>              Path to somatic SVs"
 32 |     echo "    -c <cna>             Path to somatic CNVs"
 33 |     echo "    -d <disease>         Disease name"
 34 |     echo "    -a <cnv-loh-action>  Action of markers in CNV/LOH regions, either keep or drop (default=keep)"
 35 |     echo "    -w <min coverage in WGS>          Minimal coverage in WGS to include a heterozygous marker (default=10)"
 36 |     echo "    -r <min coverage in RNA-seq>      Minimal coverage in RNA-seq to include a heterozygous marker (default=10)"
 37 |     echo "    -f <fpkm threshold for candidate> FPKM threshold for nominate cis-activated candidate (default=5)"
 38 |     echo "    -u <user-annotation> User applied annotation file in BED format (default=NotSpecified)"
 39 |     echo "    -h <chr-string>      if the RNA-seq BAM with 'chr' in name, TRUE|FALSE (default=TRUE)"
 40 |     echo "    -t <tad-info>         Path to the TAD annotation file in BED format in hg19 (default=hESC)"
 41 | }
 42 | 
 43 | # Report a parameter error, print the usage text and abort with status 1.
 44 | fail() {
 45 |     echo "ERROR: $1"
 46 |     usage
 47 |     exit 1
 48 | }
 49 | 
 50 | # Print $1 as an absolute path. A relative path is anchored at the current
 51 | # directory; symlinks are deliberately not resolved, so the directory that a
 52 | # (possibly symlinked) BAM was given in is still the one searched for its index.
 53 | abspath() {
 54 |     case "$1" in
 55 |         /*) printf '%s\n' "$1" ;;
 56 |         *)  printf '%s\n' "$PWD/$1" ;;
 57 |     esac
 58 | }
 59 | 
 60 | # Defaults for the optional arguments.
 61 | CNV_LOH_ACTION=keep
 62 | COVG_WGS=10
 63 | COVG_RNA=10
 64 | THRESH_CANDIDATE_FPKM=5
 65 | ANNO_USER=NotSpecified
 66 | CHR_STRING=TRUE
 67 | TAD_INFO=hESC
 68 | 
 69 | while getopts s:o:l:g:b:e:m:v:c:d:a:w:r:f:u:h:t: option
 70 | do
 71 | case "${option}"
 72 | in
 73 | s) SAMPLE_ID=${OPTARG};;
 74 | o) ROOTDIR=${OPTARG};;
 75 | l) HIGH20=${OPTARG};;
 76 | g) CNV_LOH=${OPTARG};;
 77 | b) RNABAM=${OPTARG};;
 78 | e) FPKM_MATRIX=${OPTARG};;
 79 | m) SNVINDEL_IN=${OPTARG};;
 80 | v) SV_IN=${OPTARG};;
 81 | c) CNA_IN=${OPTARG};;
 82 | d) DISEASE=${OPTARG};;
 83 | a) CNV_LOH_ACTION=${OPTARG};;
 84 | w) COVG_WGS=${OPTARG};;
 85 | r) COVG_RNA=${OPTARG};;
 86 | f) THRESH_CANDIDATE_FPKM=${OPTARG};;
 87 | u) ANNO_USER=${OPTARG};;
 88 | h) CHR_STRING=${OPTARG};;
 89 | t) TAD_INFO=${OPTARG};;
 90 | # Unknown option or missing argument: getopts has already printed a message.
 91 | *) usage; exit 1;;
 92 | esac
 93 | done
 94 | 
 95 | echo "INFO: $(date): cis-X-run: checking parameters"
 96 | [[ $SAMPLE_ID ]]      || fail "sample-id not specified."
 97 | [[ $ROOTDIR ]]        || fail "results-dir not specified."
 98 | [[ -f $HIGH20 ]]      || fail "marker file $HIGH20 not exist."
 99 | [[ -f $CNV_LOH ]]     || fail "cnv-loh file $CNV_LOH not exist."
100 | [[ -f $RNABAM ]]      || fail "RNA-seq BAM file $RNABAM not exist."
101 | [[ -f $FPKM_MATRIX ]] || fail "fpkm-matrix file $FPKM_MATRIX not exist."
102 | [[ -f $SNVINDEL_IN ]] || fail "snv-indel list file $SNVINDEL_IN not exist."
103 | [[ -f $SV_IN ]]       || fail "sv list file $SV_IN not exist."
104 | [[ -f $CNA_IN ]]      || fail "cna list file $CNA_IN not exist."
105 | [[ $DISEASE ]]        || fail "disease type not specified."
106 | if [[ $ANNO_USER != NotSpecified && ! -f $ANNO_USER ]]; then
107 |     fail "user specified annotation file $ANNO_USER not exist."
108 | fi
109 | if [[ $TAD_INFO != hESC && ! -f $TAD_INFO ]]; then
110 |     fail "user specified TAD annotation file $TAD_INFO not exist."
111 | fi
112 | 
113 | if [[ $TAD_INFO != hESC ]]; then
114 |     TAD=$TAD_INFO
115 | else
116 |     TAD=$CIS_X_HOME/refs/external/hESC.combined.domain.hg19.bed
117 | fi
118 | 
119 | echo "INFO: $(date): cis-X-run: start"
120 | 
121 | # The pipeline changes into the working space below, so relative paths given
122 | # on the command line must be made absolute first; otherwise they would be
123 | # looked up relative to the working space instead of the caller's directory.
124 | ROOTDIR=$(abspath "$ROOTDIR")
125 | HIGH20=$(abspath "$HIGH20")
126 | CNV_LOH=$(abspath "$CNV_LOH")
127 | RNABAM=$(abspath "$RNABAM")
128 | FPKM_MATRIX=$(abspath "$FPKM_MATRIX")
129 | SNVINDEL_IN=$(abspath "$SNVINDEL_IN")
130 | SV_IN=$(abspath "$SV_IN")
131 | CNA_IN=$(abspath "$CNA_IN")
132 | TAD=$(abspath "$TAD")
133 | if [[ $ANNO_USER != NotSpecified ]]; then
134 |     ANNO_USER=$(abspath "$ANNO_USER")
135 | fi
136 | 
137 | WORKDIR=$ROOTDIR/$SAMPLE_ID/working_space
138 | mkdir -p "$WORKDIR"
139 | cd "$WORKDIR"
140 | 
141 | SNV4_OUT="$WORKDIR/$SAMPLE_ID.snv4.txt"
142 | HET_OUT="$WORKDIR/$SAMPLE_ID.heterozygous.markers.txt"
143 | 
144 | echo "INFO: $(date): cis-X-run: getting markers"
145 | cis-X-mark "$SAMPLE_ID" "$HIGH20" "$CNV_LOH" "$SNV4_OUT" "$HET_OUT" "$COVG_WGS"
146 | 
147 | echo "INFO: $(date): cis-X-run: building matrices"
148 | MATRIX_OUT="$WORKDIR/matrix_combined_matrix_simple.tab"
149 | cis-X-build-matrix "$RNABAM" "$SNV4_OUT" "$WORKDIR" "$MATRIX_OUT" "$CHR_STRING"
150 | 
151 | echo "INFO: $(date): cis-X-run: running allelic specific expression tests"
152 | ASE_RESULT_MARKER="$WORKDIR/$SAMPLE_ID.ase.combine.WGS.RNAseq.goodmarkers.binom.txt"
153 | ASE_RESULT_GENE="$WORKDIR/$SAMPLE_ID.ase.gene.model.fdr.txt"
154 | ASE_RESULT_RUN="$WORKDIR/$SAMPLE_ID.ase.candidates.runs.txt"
155 | cis-X-ase "$SAMPLE_ID" "$WORKDIR" "$HET_OUT" "$MATRIX_OUT" "$ASE_RESULT_MARKER" "$ASE_RESULT_GENE" "$ASE_RESULT_RUN" "$CNV_LOH_ACTION" "$COVG_RNA"
156 | 
157 | echo "INFO: $(date): cis-X-run: running outlier high expression tests"
158 | OHE_RESULT="$WORKDIR/$SAMPLE_ID.OHE.results.txt"
159 | cis-X-test-outliers "$SAMPLE_ID" "$DISEASE" "$FPKM_MATRIX" "$OHE_RESULT"
160 | 
161 | echo "INFO: $(date): cis-X-run: nominating candidates"
162 | CANDIDATES_RESULT="$WORKDIR/$SAMPLE_ID.cisActivated.candidates.txt"
163 | CANDIDATES_RESULT_RUN="$WORKDIR/$SAMPLE_ID.cisActivated.candidates.byRuns.txt"
164 | cis-X-nominate "$SAMPLE_ID" "$ASE_RESULT_GENE" "$OHE_RESULT" "$CANDIDATES_RESULT" "$THRESH_CANDIDATE_FPKM" "$ASE_RESULT_RUN" "$CANDIDATES_RESULT_RUN"
165 | 
166 | echo "INFO: $(date): cis-X-run: screening candidates"
167 | # These three are bare file names; the copies below resolve them against the
168 | # current directory, which is $WORKDIR at this point.
169 | SV_CAN="$SAMPLE_ID.sv.candidates.txt"
170 | CNA_CAN="$SAMPLE_ID.cna.candidates.txt"
171 | SNVINDEL_CAN="$SAMPLE_ID.snvindel.candidates.txt"
172 | cis-X-screen "$SAMPLE_ID" "$CANDIDATES_RESULT" "$CANDIDATES_RESULT_RUN" "$SV_IN" "$CNA_IN" "$SNVINDEL_IN" "$FPKM_MATRIX" "$WORKDIR" "$SV_CAN" "$CNA_CAN" "$SNVINDEL_CAN" "$ANNO_USER" "$TAD"
173 | 
174 | RESDIR="$ROOTDIR/$SAMPLE_ID"
175 | 
176 | # Copy the result tables from the working space to the sample directory.
177 | cp "$ASE_RESULT_GENE" "$RESDIR"
178 | cp "$ASE_RESULT_MARKER" "$RESDIR"
179 | cp "$OHE_RESULT" "$RESDIR"
180 | cp "$CANDIDATES_RESULT" "$RESDIR"
181 | cp "$CANDIDATES_RESULT_RUN" "$RESDIR"
182 | cp "$SNVINDEL_CAN" "$RESDIR"
183 | cp "$CNA_CAN" "$RESDIR"
184 | cp "$SV_CAN" "$RESDIR"
185 | 
186 | echo "INFO: $(date): cis-X-run: done"
187 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # cis-X
  2 | 
  3 | **cis-X** searches for activating regulatory variants in the tumor genome.
  4 | 
  5 | Activating regulatory variants usually cause the cis-activation of target genes.
  6 | To find cis-activated genes, allelic specific/imbalance expression (ASE) and
  7 | outlier high expression (OHE) signals are used. Variants in the same
  8 | topologically associating domain (TAD) as the candidates can then be searched,
  9 | including structural variants (SV), copy number aberrations (CNA), single
 10 | nucleotide variations (SNV), and insertions/deletions (indels).
 11 | 
 12 | A transcription factor binding analysis is also done, using motifs from
 13 | [HOCOMOCO] v10 models.
 14 | 
 15 | cis-X currently only works with hg19 (GRCh37).
 16 | 
 17 | More details and examples on running cis-X can be found in the [user guide].
 18 | 
 19 | [HOCOMOCO]: http://hocomoco11.autosome.ru/
 20 | [user guide]: https://sjr-redesign.stjude.org/content/dam/research-redesign/labs/zhang-lab/cis-x-instructions.pdf
 21 | 
 22 | ## Installation
 23 | 
 24 | Installation is simply unpacking the source to a working directory and adding
 25 | `$CIS_X_HOME/bin` to `PATH`.
 26 | 
 27 | ### Prerequisites
 28 | 
 29 | See [cis-X run][run] and [cis-X seed][seed] for the required tools and
 30 | references.
 31 | 
 32 | ## Usage
 33 | 
 34 | ```
 35 | cis-X
 36 | 
 37 | USAGE:
 38 |     cis-X <SUBCOMMAND> [args...]
 39 | 
 40 | SUBCOMMANDS:
 41 |     ref-exp  Generate reference expression matrices
 42 |     run      Search for activating regulatory variants in the tumor genome
 43 |     seed     Download and generate a set of common references
 44 | ```
 45 | 
 46 | For more details on how to run each command, see its respective README:
 47 | [ref-exp], [run], and [seed].
 48 | 
 49 | ### Docker
 50 | 
 51 | cis-X has a `Dockerfile` to create a [Docker] image, which sets up and installs
 52 | all the required dependencies (sans references). To use this image, [install
 53 | Docker] for your platform.
 54 | 
 55 | For typical inputs, cis-X requires at least 4 GiB of RAM. This resource can
 56 | be increased for the desktop version of Docker by going to Docker preferences
 57 | \> Advanced \> Memory.
 58 | 
 59 | [Docker]: https://www.docker.com/
 60 | [install Docker]: https://docs.docker.com/get-started/get-docker/
 61 | 
 62 | #### Build
 63 | 
 64 | In the cis-X project directory, build the Docker image.
 65 | 
 66 | ```
 67 | $ docker image build --tag cis-x .
 68 | ```
 69 | 
 70 | #### Run
 71 | 
 72 | The Docker image uses `bin/cis-X` as its entrypoint, giving access to all of its
 73 | commands.
 74 | 
 75 | The image assumes two working directories: `/data` for inputs and `/results`
 76 | for outputs. `/data` can be read-only, whereas `/results` needs write access.
 77 | External references (see [cis-X seed][seed]) also need to be mounted to
 78 | `/opt/cis-x/refs/external`. For example, mounting to these directories requires
 79 | three flags:
 80 | 
 81 | ```
 82 | --mount type=bind,source=$HOME/research/data,target=/data,readonly \
 83 | --mount type=bind,source=/tmp/references,target=/opt/cis-x/refs/external,readonly \
 84 | --mount type=bind,source=$(pwd)/cis-x-out,target=/results \
 85 | ```
 86 | 
 87 | The source directives can point to any absolute path that can be accessed
 88 | locally. They do not need to match their target directory. Also note that the
 89 | results directory must exist before running the command.
 90 | 
 91 | ##### Examples
 92 | 
 93 | ###### cis-X seed
 94 | 
 95 | > [!NOTE]
 96 | > **cis-X seed** will likely not work due to [link rot](https://en.wikipedia.org/wiki/Link_rot). An alternative is to use the references included in the [demo data](#demo) instead.
 97 | 
 98 | A basic example is running [cis-X seed][seed], which downloads and preprocesses
 99 | required reference files to a directory. To run this locally, the `seed`
100 | subcommand is used, passing the destination directory of the resulting files.
101 | 
102 | ```
103 | $ cis-X seed /tmp/refs/external
104 | ```
105 | 
106 | To run this in a container using the Docker image, pass the subcommand and arguments
107 | as the command the container runs.
108 | 
109 | ```
110 | $ docker container run cis-x seed /tmp/refs/external
111 | ```
112 | 
113 | This, however, writes files to the container, rather than the host. To write
114 | files to the host from the container, mount the host destination directory to
115 | the container, e.g., `$(pwd)/refs/external` to `/opt/cis-x/refs/external`. The
116 | argument passed to the command must match the target directory.
117 | 
118 | ```
119 | $ docker container run \
120 |     --mount type=bind,source=$(pwd)/refs/external,target=/opt/cis-x/refs/external \
121 |     cis-x \
122 |     seed \
123 |     /opt/cis-x/refs/external
124 | ```
125 | 
126 | ###### cis-X run
127 | 
128 | The following template shows the minimum set of arguments to execute the `run`
129 | command, with variables showing what needs to be set.
130 | 
131 | ```
132 | $ docker container run \
133 |     --mount type=bind,source=$DATA_DIR,target=/data,readonly \
134 |     --mount type=bind,source=$REFS_DIR,target=/opt/cis-x/refs/external,readonly \
135 |     --mount type=bind,source=$RESULT_DIR,target=/results \
136 |     cis-x \
137 |     run \
138 |     -s $SAMPLE_ID \
139 |     -o /results \
140 |     -l /data/$MARKERS \
141 |     -g /data/$CNV_LOH_REGIONS \
142 |     -b /data/$BAM \
143 |     -e /data/$GENE_EXPRESSION_TABLE \
144 |     -m /data/$SOMATIC_SNV_INDEL \
145 |     -v /data/$SOMATIC_SV \
146 |     -c /data/$SOMATIC_CNV \
147 |     -d $DISEASE
148 | ```
149 | 
150 | Note that pathname arguments are relative to the container's target. For
151 | example, when `$HOME/research` is mounted to `/data` and an input is located at
152 | `$HOME/research/sample-001/markers.txt`, the corresponding argument is
153 | `/data/sample-001/markers.txt`.
154 | 
155 | See the [Docker reference for `run`][docker-run] for more container run
156 | options. See [cis-X run][run] for more runtime options.
157 | 
158 | [docker-run]: https://docs.docker.com/engine/containers/run/
159 | 
160 | ## Demo
161 | 
162 | The next example runs cis-X with [demo data] (`cis-X-demo.tar.gz`).
163 | 
164 | Set up the project home directory with the demo data. The following commands
165 | assume the demo is extracted to a `tmp` directory in the root of the project.
166 | 
167 | ```
168 | $ git clone https://github.com/stjude/cis-x.git
169 | $ cd cis-x
170 | $ docker image build --tag cis-x .
171 | $ mkdir tmp
172 | $ wget --directory-prefix tmp http://ftp.stjude.org/pub/software/cis-x/cis-X-demo.tar.gz
173 | $ tar xf tmp/cis-X-demo.tar.gz --directory tmp
174 | ```
175 | 
176 | Then run cis-X.
177 | 
178 | ```
179 | $ docker container run \
180 |     --mount type=bind,source=$(pwd)/tmp/demo/data,target=/data,readonly \
181 |     --mount type=bind,source=$(pwd)/tmp/demo/ref,target=/opt/cis-x/refs/external,readonly \
182 |     --mount type=bind,source=$(pwd)/tmp,target=/results \
183 |     cis-x \
184 |     run \
185 |     -s SJALL018373_D1 \
186 |     -o /results \
187 |     -l /data/SJALL018373_D1.test.wgs.markers.txt \
188 |     -g /data/SJALL018373_D1.test.wgs.cnvloh.txt \
189 |     -b /data/SJALL018373_D1.test.RNAseq.bam \
190 |     -e /data/SJALL018373_D1.test.RNASEQ_all_fpkm.txt \
191 |     -m /data/SJALL018373_D1.test.mut.txt \
192 |     -v /data/SJALL018373_D1.test.sv.txt \
193 |     -c /data/SJALL018373_D1.test.cna.txt \
194 |     -d TALL \
195 |     -a drop \
196 |     -w 10 \
197 |     -r 10 \
198 |     -f 5
199 | ```
200 | 
201 | [demo data]: http://ftp.stjude.org/pub/software/cis-x/cis-X-demo.tar.gz
202 | 
203 | [ref-exp]: https://github.com/stjude/cis-x/tree/master/src/ref-exp
204 | [run]: https://github.com/stjude/cis-x/tree/master/src/core
205 | [seed]: https://github.com/stjude/cis-x/tree/master/src/seed
206 | 


--------------------------------------------------------------------------------
/src/core/src/ase.candidate.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | 
  3 | # Nominate cis-activated candidate genes for one sample by joining the
  4 | # gene-level ASE table with the outlier-high-expression (OHE) table.
  5 | # Genes flagged "Imprinted" are dropped, genes present in the oncogene table
  6 | # get relaxed FPKM and ASE p-value criteria, and only genes classified as
  7 | # "ase_outlier" are written to the output file.
  8 | 
  9 | use strict;
 10 | use Scalar::Util qw(looks_like_number);
 11 | 
 12 | my $thresh_ase_pvalue    = $ARGV[0];
 13 | my $thresh_ase_delta_di  = $ARGV[1];
 14 | my $thresh_ase_delta_cnv = $ARGV[2];
 15 | my $thresh_fpkm          = $ARGV[3];
 16 | my $thresh_loo_pvalue    = $ARGV[4];
 17 | my $sid                  = $ARGV[5];   # kept for the positional interface; not used below
 18 | my $outfile              = $ARGV[6];
 19 | my $ase_result_gene      = $ARGV[7];
 20 | my $ohe_result           = $ARGV[8];
 21 | my $thresh_loo_hi_perc   = $ARGV[9];
 22 | my $imprinting_genes     = $ARGV[10];
 23 | my $oncogenes            = $ARGV[11];
 24 | 
 25 | my (%imprint,%g2loo,%oncog);
 26 | 
 27 | # Return the fraction of the comma-separated values in $list that are
 28 | # > 0.9 or < 0.1 (mono-allelic in RNA); 0 when the list is empty or missing,
 29 | # which avoids the division by zero the inline version ran into.
 30 | sub mono_allelic_fraction {
 31 |     my ($list) = @_;
 32 |     my @maf = split /,/, (defined $list ? $list : "");
 33 |     return 0 unless @maf;
 34 |     my $hits = grep { $_ > 0.9 or $_ < 0.1 } @maf;
 35 |     return $hits / scalar(@maf);
 36 | }
 37 | 
 38 | # Classify one gene from its ASE delta and its OHE record ($loo, a hash ref
 39 | # with the keys tperc, rank and size):
 40 | #   delta >= $delta_min and tperc < $loo_max            -> "ase_outlier"
 41 | #   delta >= $delta_min and rank/size <= $hi_perc_max   -> "ase_high"
 42 | #   delta <  $delta_min and tperc < $loo_max            -> "uncertain_outlier"
 43 | #   anything else                                       -> ""
 44 | sub classify_candidate {
 45 |     my ($delta, $delta_min, $loo, $loo_max, $hi_perc_max) = @_;
 46 |     # NOTE(review): a non-numeric tperc such as "na" numifies to 0 and would
 47 |     # pass this test -- confirm the OHE table never carries one in this field.
 48 |     my $is_outlier = $loo->{tperc} < $loo_max;
 49 |     if ($delta >= $delta_min) {
 50 |         return "ase_outlier" if $is_outlier;
 51 |         my $size = $loo->{size};
 52 |         # A missing, non-numeric or zero cohort size cannot yield a rank ratio.
 53 |         if (looks_like_number($size) and $size != 0
 54 |                 and $loo->{rank} / $size <= $hi_perc_max) {
 55 |             return "ase_high";
 56 |         }
 57 |         return "";
 58 |     }
 59 |     return $is_outlier ? "uncertain_outlier" : "";
 60 | }
 61 | 
 62 | # Imprinting table (header skipped): gene (field 0) -> status (field 3).
 63 | open my $imprint_fh, "<", $imprinting_genes or die "$imprinting_genes: $!";
 64 | while (<$imprint_fh>) {
 65 |     chomp;
 66 |     next if $. == 1;
 67 |     my @F = split /\t/;
 68 |     $imprint{$F[0]} = $F[3];
 69 | }
 70 | close $imprint_fh;
 71 | 
 72 | # Oncogene table (header skipped, double quotes stripped): gene (field 0) ->
 73 | # annotation in field 14. Rows with an empty field 14 or the value "TSG",
 74 | # and the genes IGH, IGK, IGL and HLA-A, are ignored.
 75 | open my $oncog_fh, "<", $oncogenes or die "$oncogenes: $!";
 76 | while (<$oncog_fh>) {
 77 |     chomp;
 78 |     next if $. == 1;
 79 |     s/\"//g;
 80 |     my @F = split /\t/;
 81 |     next unless $F[14];
 82 |     next if $F[14] eq "TSG";
 83 |     next if $F[0] eq "IGH" or $F[0] eq "IGK" or $F[0] eq "IGL" or $F[0] eq "HLA-A";
 84 |     $oncog{$F[0]} = $F[14];
 85 | }
 86 | close $oncog_fh;
 87 | 
 88 | # OHE table (header skipped): per gene keep the FPKM (field 1) and one set of
 89 | # loo statistics. The white-list block (fields 12-16) is preferred, then the
 90 | # bi-cohort block (fields 2-6), then the entire-cohort block (fields 7-11);
 91 | # a block is skipped when its first field is "na".
 92 | open my $ohe_fh, "<", $ohe_result or die "$ohe_result: $!";
 93 | while (<$ohe_fh>) {
 94 |     chomp;
 95 |     next if $. == 1;
 96 |     my @F = split /\t/;
 97 |     my ($source, $i_size, $i_rank, $i_tval, $i_tperc) =
 98 |           $F[12] ne "na" ? ("white_list",    12, 14, 15, 16)
 99 |         : $F[2]  ne "na" ? ("bi_cohort",      2,  4,  5,  6)
100 |         :                  ("entire_cohort",  7,  9, 10, 11);
101 |     $g2loo{$F[0]} = {
102 |         fpkm   => $F[1],
103 |         tval   => $F[$i_tval],
104 |         tperc  => $F[$i_tperc],
105 |         rank   => $F[$i_rank],
106 |         size   => $F[$i_size],
107 |         source => $source,
108 |     };
109 | }
110 | close $ohe_fh;
111 | 
112 | # Walk the gene-level ASE table and emit the rows that qualify.
113 | open my $out_fh, ">", $outfile or die "$outfile: $!";
114 | open my $ase_fh, "<", $ase_result_gene or die "$ase_result_gene: $!";
115 | while (<$ase_fh>) {
116 |     chomp;
117 |     if ($. == 1) {
118 |         print $out_fh "$_\tFPKM\tloo.source\tloo.cohort.size\tloo.tstatistic\tloo.qval\tloo.rank\timprinting.status\tcandidate.group\tdescription\n";
119 |         next;
120 |     }
121 |     my @F    = split /\t/;
122 |     my $gene = $F[1];
123 |     my $imprint = $imprint{$gene} ? $imprint{$gene} : "";
124 |     next if $imprint eq "Imprinted";
125 |     my $loo = $g2loo{$gene};
126 |     next unless $loo;
127 | 
128 |     ### 2018/12/25, for the known oncogenes in cosmic, keep for next step if p-value pass the threshold and fpkm >= 1.
129 |     my $description = "";
130 |     unless ($loo->{fpkm} >= $thresh_fpkm) {
131 |         next unless $oncog{$gene} and $loo->{fpkm} >= 1;
132 |         $description = "rescued-ohe";
133 |     }
134 | 
135 |     ### use $thresh_ase_delta_cnv if > 30% of the markers sits in cnvloh region.
136 |     # An empty tag list counts as diploid instead of dividing by zero.
137 |     my @tag    = split /,/, (defined $F[16] ? $F[16] : "");
138 |     my $tagcnv = grep { $_ eq "cnvloh" } @tag;
139 |     my $class  = (@tag and $tagcnv / scalar(@tag) > 0.3) ? "cnvloh" : "diploid";
140 |     my $delta_min = $class eq "cnvloh" ? $thresh_ase_delta_cnv : $thresh_ase_delta_di;
141 | 
142 |     ### for the known oncogenes in cosmic, rescue if raw p-value < 0.05 && over 90% of markers show mono-allelic transcription (maf-rna < 0.1 || maf-rna > 0.9).
143 |     my $rescued_ase = 0;
144 |     unless ($F[22] < $thresh_ase_pvalue) {
145 |         next unless $oncog{$gene} and $F[20] < 0.05
146 |                 and mono_allelic_fraction($F[17]) >= 0.9;
147 |         $rescued_ase = 1;
148 |     }
149 | 
150 |     my $candidate_group = classify_candidate($F[19], $delta_min, $loo, $thresh_loo_pvalue, $thresh_loo_hi_perc);
151 |     next unless $candidate_group eq "ase_outlier";
152 | 
153 |     # description = oncogene annotation, then the rescue flags that applied.
154 |     my @notes;
155 |     push @notes, $oncog{$gene} if $oncog{$gene};
156 |     push @notes, "rescued-ase" if $rescued_ase;
157 |     push @notes, $description  if $description;
158 |     $description = join(", ", @notes);
159 | 
160 |     print $out_fh "$_\t$loo->{fpkm}\t$loo->{source}\t$loo->{size}\t$loo->{tval}\t$loo->{tperc}\t$loo->{rank}\t$imprint\t$candidate_group\t$description\n";
161 | }
162 | close $ase_fh;
163 | # Closing explicitly surfaces buffered write errors (e.g. a full disk).
164 | close $out_fh or die "$outfile: $!";
165 | 


--------------------------------------------------------------------------------
/src/core/src/snvindel.prep.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | # Keep the SNV/indels that fall in a TAD overlapping a candidate gene's promoter and lie within $snvindel_win of that gene's TSS.
  3 | my $sid           = $ARGV[0];
  4 | my $snvindel_in   = $ARGV[1];
  5 | my $ase_result    = $ARGV[2];
  6 | my $ase_result_run  = $ARGV[3];
  7 | my $sv_result     = $ARGV[4];
  8 | my $cna_result    = $ARGV[5];
  9 | my $tad_ref       = $ARGV[6];
 10 | my $snvindel_list = $ARGV[7];  # output path: opened for writing as OUT further down
 11 | my $seqlist       = $ARGV[8];  # output path: opened for writing as SEQLST further down
 12 | my $snvindel_win  = $ARGV[9];  # maximum allowed distance between a variant and the gene TSS
 13 | my $refgene       = $ARGV[10];  # tab-separated: chrom, start, end, gene, accession, strand
 14 | 
 15 | my (%genes,%solved,%chr2g,%g2pro,%snvindel,%tad,%g2query);
 16 | # TAD intervals: chrom/start/end are the first three tab-separated fields; stored per chromosome under the id "chrom.start.end".
 17 | my $infile = $tad_ref;
 18 | open IN, "< $infile" or die "$infile: $!";
 19 | while(<IN>) {
 20 |     chomp;
 21 |     my @F = split/\t/;
 22 |     my $id = "$F[0].$F[1].$F[2]";
 23 |     $tad{$F[0]}{$id}{start} = $F[1];
 24 |     $tad{$F[0]}{$id}{end}   = $F[2];
 25 |     $tad{$F[0]}{$id}{source} = "hESC";  # NOTE(review): label is hard-coded, whichever TAD file was passed in
 26 | }
 27 | close IN;
 28 | 
 29 | $infile = $ase_result;  # candidate genes: symbol, chrom, strand, start, end come from fields 1-5
 30 | open IN, "< $infile" or die "$infile: $!";
 31 | while(<IN>) {
 32 |     chomp;
 33 |     next if $. == 1;  # skip the header line
 34 |     my @F = split/\t/;
 35 |     $genes{$F[1]}{gsym}   = $F[1];
 36 |     $genes{$F[1]}{chrom}  = $F[2];
 37 |     $genes{$F[1]}{strand} = $F[3];
 38 |     $genes{$F[1]}{start}  = $F[4];
 39 |     $genes{$F[1]}{end}    = $F[5];
 40 |     $chr2g{$F[2]}{$F[1]}  = 1;
 41 |     if ($F[3] eq "+") {  # promoter window: 2000 bp before to 200 bp after the gene start
 42 |         $g2pro{$F[1]}{start} = $F[4] - 2000;
 43 |         $g2pro{$F[1]}{end}   = $F[4] + 200;
 44 |     }elsif ($F[3] eq "-") {  # minus strand: the window is mirrored around the gene end
 45 |         $g2pro{$F[1]}{start} = $F[5] - 200;
 46 |         $g2pro{$F[1]}{end}   = $F[5] + 2000;
 47 |     }
 48 | }
 49 | close IN;
 50 | # Genes listed (comma-separated) in field 8 of the run-level result that are not in %genes yet are queued in %g2query with tag 1.
 51 | $infile = $ase_result_run;
 52 | open IN, "< $infile" or die "$infile: $!";
 53 | while(<IN>) {
 54 |     chomp;
 55 |     next if $. == 1;
 56 |     my @F = split/\t/;
 57 |     if ($F[8]) {
 58 |         my @G = split(/,/,$F[8]);
 59 |         for my $g (@G) {
 60 |             next if $genes{$g};
 61 |             $g2query{$g}{tag} = 1;
 62 |         }
 63 |     }
 64 | }
 65 | close IN;
 66 | # Look the queued genes up in the refGene table, keeping the longest record per gene (tag 1 = not found yet, 2 = found).
 67 | $infile = $refgene;
 68 | open IN, "< $infile" or die "$infile: $!";
 69 | while(<IN>) {
 70 |     chomp;
 71 |     my @F = split/\t/;
 72 |     if ($g2query{$F[3]}) {
 73 |         my $len = $F[2] - $F[1];
 74 |         if ($g2query{$F[3]}{tag} == 1 or ($len > $g2query{$F[3]}{len})) {  # first hit, or strictly longer than the stored one
 75 |             $g2query{$F[3]}{acc}    = $F[4];
 76 |             $g2query{$F[3]}{chrom}  = $F[0];
 77 |             $g2query{$F[3]}{strand} = $F[5];
 78 |             $g2query{$F[3]}{start}  = $F[1];
 79 |             $g2query{$F[3]}{end}    = $F[2];
 80 |             $g2query{$F[3]}{len}    = $len;
 81 |             $g2query{$F[3]}{tag}    = 2;
 82 |         }else {
 83 |             1;
 84 |         }
 85 |     }
 86 | }
 87 | close IN;
 88 | # Merge the genes found in refGene into %genes/%chr2g/%g2pro; genes still at tag 1 are only reported on stdout.
 89 | for my $g (sort keys %g2query) {
 90 |     if ($g2query{$g}{tag} == 1) {
 91 |         print "Error, $g not annotated.\n";
 92 |     }else {
 93 |         $genes{$g}{gsym}   = $g;
 94 |         $genes{$g}{chrom}  = $g2query{$g}{chrom};
 95 |         $genes{$g}{strand} = $g2query{$g}{strand};
 96 |         $genes{$g}{start}  = $g2query{$g}{start};
 97 |         $genes{$g}{end}    = $g2query{$g}{end};
 98 |         $chr2g{$g2query{$g}{chrom}}{$g} = 1;
 99 |         if ($g2query{$g}{strand} eq "+") {  # same promoter window as for the ASE genes above
100 |             $g2pro{$g}{start} = $g2query{$g}{start} - 2000;
101 |             $g2pro{$g}{end}   = $g2query{$g}{start} + 200;
102 |         }elsif ($g2query{$g}{strand} eq "-") {
103 |             $g2pro{$g}{start} = $g2query{$g}{end} - 200;
104 |             $g2pro{$g}{end}   = $g2query{$g}{end} + 2000;
105 |         }
106 |     }
107 | }
108 | 
109 | $infile = $sv_result;  # genes named in fields 0 and 1 of the SV result are marked in %solved
110 | open IN, "< $infile" or die "$infile: $!";
111 | while(<IN>) {
112 |     chomp;
113 |     next if $. == 1;  # skip the header line
114 |     my @F = split/\t/;
115 |     if ($F[0]) {
116 |         my @g = split(/,/,$F[0]);
117 |         for my $g (@g) {
118 |             $solved{$g} = 1;
119 |         }
120 |     }
121 |     if ($F[1]) {
122 |         my @g = split(/,/,$F[1]);
123 |         for my $g (@g) {
124 |             $solved{$g} = 1;
125 |         }
126 |     }
127 | }
128 | close IN;
129 | # Likewise for the comma-separated genes in field 0 of the CNA result.
130 | $infile = $cna_result;
131 | open IN, "< $infile" or die "$infile: $!";
132 | while(<IN>) {
133 |     chomp;
134 |     next if $. == 1;
135 |     my @F = split/\t/;
136 |     if ($F[0]) {
137 |         my @g = split(/,/,$F[0]);
138 |         for my $g (@g) {
139 |             $solved{$g} = 1;
140 |         }
141 |     }
142 | }
143 | close IN;
144 | 
145 | $infile = $snvindel_in;  # assign each SNV/indel to the unsolved genes it may act on
146 | open IN, "< $infile" or die "$infile: $!";
147 | while(<IN>) {
148 |     chomp;
149 |     next if $. == 1;  # skip the header line
150 |     my @F = split/\t/;
151 |     my $chrom = $F[0];
152 |     unless ($chrom =~ /^chr/) {  # presumably to match the chromosome names used as keys of %tad and %chr2g
153 |         $chrom = "chr" . $chrom;
154 |     }
155 |     my $pos = $F[1];
156 |     my $target = "";
157 |     my $dist_o = "";
158 |     my @g = sort keys %{$chr2g{$chrom}};  # all candidate genes on this chromosome
159 |     ### filter TAD.
160 |     for my $tad (sort keys %{$tad{$chrom}}) {
161 |         if ($pos <= $tad{$chrom}{$tad}{end} and $pos >= $tad{$chrom}{$tad}{start}) {  # variant lies inside this TAD
162 |             for my $g (@g) {
163 |                 next if $solved{$g};  # gene already listed in the SV or CNA result
164 |                 my $overlap = 0;
165 |                 if ($g2pro{$g}) {
166 |                     if ($g2pro{$g}{start} > $tad{$chrom}{$tad}{end}) {  # promoter starts after the TAD: no overlap
167 |                         1;
168 |                     }elsif ($g2pro{$g}{start} >= $tad{$chrom}{$tad}{start}) {  # promoter starts inside the TAD
169 |                         $overlap = 1;
170 |                     }elsif ($g2pro{$g}{end} >= $tad{$chrom}{$tad}{start}) {  # promoter starts before the TAD but reaches into it
171 |                         $overlap = 1;
172 |                     }else {  # promoter ends before the TAD: no overlap
173 |                         1;
174 |                     }
175 |                 }else {
176 |                     print "No promoter info for $g.\n";
177 |                 }
178 |                 if ($overlap == 1) {
179 |                     ### require distance between target gene tss less than $snvindel_win.
180 |                     my $dist = abs($pos - $genes{$g}{start});  # TSS taken as the gene start ...
181 |                     if ($genes{$g}{strand} eq "-") {  # ... or as the gene end on the minus strand
182 |                         $dist = abs($pos - $genes{$g}{end});
183 |                     }
184 |                     if ($dist <= $snvindel_win) {
185 |                         $target .= "$g,";
186 |                         $dist_o .= "$dist,";
187 |                     }
188 |                 }
189 |             }
190 |         }
191 |     }
192 |     if ($target) {
193 |         $target =~ s/\,$//;  # drop the trailing comma left by the appends above
194 |         $dist_o =~ s/\,$//;
195 |         my $snv4 = "$chrom.$F[1].$F[2].$F[3]";  # key is split on "." later into chrom, pos, ref, mut
196 |         $snvindel{$snv4}{target} = $target;
197 |         $snvindel{$snv4}{type}   = $F[4];
198 |         $snvindel{$snv4}{dist}   = $dist_o;
199 |     }
200 | }
201 | close IN;
202 | 
203 | my $outfile = $snvindel_list;
204 | my $seq_out = $seqlist;
205 | open OUT, "> $outfile" or die "$outfile: $!";
206 | open SEQLST, "> $seq_out" or die "$seq_out: $!";
207 | print OUT "snv4\ttype\tref\tmut\tleft_name\tright_name\ttarget\tdist\tstart\tlength\n";
208 | for my $snv4 (sort keys %snvindel) {
209 |     my @s = split(/\./,$snv4);
210 |     my $chrom  = $s[0];
211 |     my $pos    = $s[1];
212 |     my $ref    = $s[2];
213 |     my $mut    = $s[3];
214 |     my $target = $snvindel{$snv4}{target};
215 |     my $type   = $snvindel{$snv4}{type};
216 |     my $dist   = $snvindel{$snv4}{dist};
217 |     my $left   = "";
218 |     my $right  = "";
219 |     my $start  = 21;
220 |     my $length = 1;
221 |     if ($type eq "snv") {
222 |         $start_l = $pos - 1 - 20;
223 |         $end_l   = $pos - 1;
224 |         $start_r = $pos - 1 + 1;
225 |         $end_r   = $pos - 1 + 1 + 20;
226 |         $left    = $chrom . ":" . $start_l . "-" . $end_l;
227 |         $right   = $chrom . ":" . $start_r . "-" . $end_r;
228 |     }else {
229 |         if ($ref eq "-") {
230 |             $type = "ins";
231 |             $start_l = $pos - 1 - 20;
232 |             $end_l   = $pos - 1;
233 |             $start_r = $pos - 1;
234 |             $end_r   = $pos - 1 + 20;
235 |             $left    = $chrom . ":" . $start_l . "-" . $end_l;
236 |             $right   = $chrom . ":" . $start_r . "-" . $end_r;
237 |             my @seq = split(//,$mut);
238 |             $length = scalar(@seq);
239 |         }elsif ($mut eq "-") {
240 |             $type = "del";
241 |             my @seq = split(//,$ref);
242 |             my $len = scalar(@seq);
243 |             $length = $len;
244 |             $start_l = $pos - 1 - 20;
245 |             $end_l   = $pos - 1;
246 |             $start_r = $pos - 1 + $len;
247 |             $end_r   = $pos - 1 + $len + 20;
248 |             $left    = $chrom . ":" . $start_l . "-" . $end_l;
249 |             $right   = $chrom . ":" . $start_r . "-" . $end_r;
250 |         }else {
251 |             $type = "complex_indel";
252 |             my @seq = split(//,$ref);
253 |             my $len = scalar(@seq);
254 |             $start_l = $pos - 1 - 20;
255 |             $end_l   = $pos - 1;
256 |             $start_r = $pos - 1 + $len;
257 |             $end_r   = $pos - 1 + $len + 20;
258 |             $left    = $chrom . ":" . $start_l . "-" . $end_l;
259 |             $right   = $chrom . ":" . $start_r . "-" . $end_r;
260 |         }
261 |     }
262 |     $left =~ s/^chr//;
263 |     $right =~ s/^chr//;
264 |     print OUT "$snv4\t$type\t$ref\t$mut\t$left\t$right\t$target\t$dist\t$start\t$length\n";
265 |     print SEQLST "$left\n$right\n";
266 | }
267 | close OUT;
268 | close SEQLST;
269 | 


--------------------------------------------------------------------------------
/src/core/src/snvindel.process.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | ### Report candidate SNV/indels predicted to create a TF binding motif.
  3 | ###
  4 | ### A TF is reported for a variant when a motif hit listed in <fimo_pred>
  5 | ### covers the variant on the mutant sequence, no hit of that TF covers it
  6 | ### on the reference sequence, and the TF value taken from <fpkm_res> is
  7 | ### above <tf_fpkm_thresh>. Reported variants are annotated with the
  8 | ### overlapping entries of the three roadmap files and of the optional
  9 | ### user annotation file.
 10 | ###
 11 | ### Usage:
 12 | ###   snvindel.process.pl <sid> <fimo_pred> <fimo_acc2gsym> <snvindel_input>
 13 | ###       <fpkm_res> <tf_fpkm_thresh> <output> <roadmap_enh> <roadmap_pro>
 14 | ###       <roadmap_dya> <anno_user | NotSpecified>
 15 | ###
 16 | ### <sid> is accepted to keep the argument positions but is not used.
 17 | ### Diagnostics about unexpected input go to STDOUT/STDERR; the report
 18 | ### itself is written to <output>.
 19 | 
 20 | use strict;
 21 | use warnings;
 22 | 
 23 | die "Usage: $0 sid fimo_pred fimo_acc2gsym snvindel_input fpkm_res "
 24 |   . "tf_fpkm_thresh output roadmap_enh roadmap_pro roadmap_dya anno_user\n"
 25 |     unless @ARGV >= 11;
 26 | 
 27 | my ($sid, $fimo_pred, $fimo_acc2gsym, $snvindel_input, $fpkm_res,
 28 |     $tf_fpkm_thresh, $output, $roadmap_enh, $roadmap_pro, $roadmap_dya,
 29 |     $anno_user) = @ARGV;
 30 | 
 31 | ### Single header definition, so the empty report and the populated
 32 | ### report always carry the same columns (including "type").
 33 | my @header = qw(chrom pos ref mut type target dist tf
 34 |                 EpiRoadmap_enhancer EpiRoadmap_promoter EpiRoadmap_dyadic
 35 |                 User_Annot);
 36 | 
 37 | ### %tf2gsym : motif accession -> gene symbol
 38 | ### %tflst   : gene symbols that own at least one motif
 39 | ### %g2fpkm  : gene symbol -> highest expression value seen
 40 | ### %var     : variant id (chrom.pos.ref.mut) -> record
 41 | ### %chr2var : chrom -> { variant id => 1 }
 42 | my (%tf2gsym, %tflst, %g2fpkm, %var, %chr2var);
 43 | 
 44 | ### Motif accession (column 1) -> gene symbol (column 2). The first line
 45 | ### is a header; accessions flagged RETRACTED are ignored.
 46 | my $in = open_in($fimo_acc2gsym);
 47 | while (<$in>) {
 48 |     chomp;
 49 |     next if $. == 1;
 50 |     my @F = split /\t/;
 51 |     next if $F[0] =~ /RETRACTED/;
 52 |     $tf2gsym{$F[0]} = $F[1];
 53 |     $tflst{$F[1]}   = 1;
 54 | }
 55 | close $in;
 56 | 
 57 | ### Keep, per TF gene symbol (column 2), the highest value of column 8.
 58 | ### Coded with current format, may need improve.
 59 | $in = open_in($fpkm_res);
 60 | while (<$in>) {
 61 |     chomp;
 62 |     next if $. == 1;
 63 |     my @F = split /\t/;
 64 |     my ($gene, $fpkm) = @F[1, 7];
 65 |     next unless $tflst{$gene};
 66 |     $g2fpkm{$gene} = $fpkm if !$g2fpkm{$gene} or $g2fpkm{$gene} < $fpkm;
 67 | }
 68 | close $in;
 69 | 
 70 | ### Candidate variants. Column 1 is the id "chrom.pos.ref.mut"; columns
 71 | ### 9 and 10 give the variant start and length used for motif overlap.
 72 | $in = open_in($snvindel_input);
 73 | while (<$in>) {
 74 |     chomp;
 75 |     next if $. == 1;
 76 |     my @F = split /\t/;
 77 |     my $id = $F[0];
 78 |     my ($chrom, $pos) = split /\./, $id;
 79 |     $var{$id} = {
 80 |         type   => $F[1],
 81 |         ref    => $F[2],
 82 |         mut    => $F[3],
 83 |         target => $F[6],
 84 |         dist   => $F[7],
 85 |         start  => $F[8],
 86 |         len    => $F[9],
 87 |         chrom  => $chrom,
 88 |         pos    => $pos,
 89 |     };
 90 |     $chr2var{$chrom}{$id} = 1;
 91 | }
 92 | close $in;
 93 | 
 94 | mark_overlaps($roadmap_enh, "enh");
 95 | mark_overlaps($roadmap_pro, "pro");
 96 | mark_overlaps($roadmap_dya, "dya");
 97 | 
 98 | if ($anno_user ne "NotSpecified") {
 99 |     $in = open_in($anno_user);
100 |     while (<$in>) {
101 |         chomp;
102 |         my @F = split /\t/;
103 |         if ($. == 1 and @F < 3) {
104 |             print "Error: please provide the annotation file with BED format.\n";
105 |             ### No report has been written at this point, so the failure
106 |             ### must be visible to the caller through the exit status.
107 |             exit 1;
108 |         }
109 |         my $chrom = $F[0] =~ /^chr/ ? $F[0] : "chr$F[0]";
110 |         my $on_chrom = $chr2var{$chrom} or next;
111 |         for my $id (keys %$on_chrom) {
112 |             my $pos = $var{$id}{pos};
113 |             ### The region name keeps the chromosome as written by the user.
114 |             $var{$id}{usr}{"$F[0].$F[1].$F[2]"} = 1
115 |                 if $pos >= $F[1] and $pos <= $F[2];
116 |         }
117 |     }
118 |     close $in;
119 | }
120 | 
121 | ### The motif predictions are only read when there is something to
122 | ### annotate; without variants the report is just the header line.
123 | load_motif_hits($fimo_pred) if %var;
124 | 
125 | open(my $out, '>', $output) or die "$output: $!";
126 | print {$out} join("\t", @header), "\n";
127 | for my $id (sort keys %var) {
128 |     my $tfs = $var{$id}{tf} or next;
129 |     ### Gained motif: hit on the mutant allele, none on the reference.
130 |     ### Highest expression first; the name breaks ties so that the
131 |     ### column is reproducible from run to run.
132 |     my @gained = grep { $tfs->{$_}{mut} and !$tfs->{$_}{ref} }
133 |                  sort { $tfs->{$b}{fpkm} <=> $tfs->{$a}{fpkm} or $a cmp $b }
134 |                  keys %$tfs;
135 |     next unless @gained;
136 |     my ($chrom, $pos, $ref, $mut) = split /\./, $id;
137 |     my ($enh, $pro, $dya, $usr) =
138 |         map { join(",", sort keys %{ $var{$id}{$_} || {} }) } qw(enh pro dya usr);
139 |     print {$out} join("\t", $chrom, $pos, $ref, $mut,
140 |                       @{ $var{$id} }{qw(type target dist)},
141 |                       join(",", @gained), $enh, $pro, $dya, $usr), "\n";
142 | }
143 | close $out or die "$output: $!";
144 | 
145 | ### Open $path for reading and return the handle; die with the same
146 | ### "<path>: <reason>" message on failure.
147 | sub open_in {
148 |     my ($path) = @_;
149 |     open(my $fh, '<', $path) or die "$path: $!";
150 |     return $fh;
151 | }
152 | 
153 | ### Return 1 when the span of $len positions starting at $start touches
154 | ### the motif interval [$lo, $hi], else 0. The test is the one applied by
155 | ### every variant type; a span of length 1 is a plain point-in-interval
156 | ### check.
157 | sub span_hits_motif {
158 |     my ($start, $len, $lo, $hi) = @_;
159 |     my $end = $start + $len - 1;
160 |     return ($start <= $hi and ($start >= $lo or $end >= $lo)) ? 1 : 0;
161 | }
162 | 
163 | ### Read a headerless tab-delimited region file (column 1 chrom, columns
164 | ### 2-3 start/end, column 8 label) and record under $var{id}{$key} the
165 | ### label of every region that contains a variant position.
166 | sub mark_overlaps {
167 |     my ($path, $key) = @_;
168 |     my $fh = open_in($path);
169 |     while (my $line = <$fh>) {
170 |         chomp $line;
171 |         my @F = split /\t/, $line;
172 |         next unless @F;
173 |         my $on_chrom = $chr2var{$F[0]} or next;
174 |         for my $id (keys %$on_chrom) {
175 |             my $pos = $var{$id}{pos};
176 |             $var{$id}{$key}{$F[7]} = 1 if $pos >= $F[1] and $pos <= $F[2];
177 |         }
178 |     }
179 |     close $fh;
180 | }
181 | 
182 | ### Read the motif predictions (column 1 motif accession, column 2
183 | ### sequence name "chrom.pos.ref.mut.<mut|ref>", columns 3-4 hit
184 | ### start/stop) and record, per variant and TF, whether any hit covers
185 | ### the variant on the mutant and on the reference sequence.
186 | sub load_motif_hits {
187 |     my ($path) = @_;
188 |     my $fh = open_in($path);
189 |     while (my $line = <$fh>) {
190 |         chomp $line;
191 |         next if $. == 1;
192 |         my @F = split /\t/, $line;
193 |         next unless @F >= 4;
194 | 
195 |         my $tf = $tf2gsym{$F[0]};
196 |         next unless $tf;
197 |         ### 2019-03-11, The motif for MYB, MYBL1/2 are very similar in
198 |         ### this version of db. May update in later version.
199 |         $tf = "MYB" if $tf eq "MYBL1" or $tf eq "MYBL2";
200 |         my $fpkm = $g2fpkm{$tf} || 0;
201 |         next unless $fpkm > $tf_fpkm_thresh;
202 | 
203 |         my @f = split /\./, $F[1];
204 |         my $id = @f >= 4 ? join(".", @f[0 .. 3]) : "";
205 |         my $allele = $f[4];
206 |         ### Never create a %var entry from a prediction row: a sequence
207 |         ### that matches no loaded variant cannot be reported anyway.
208 |         unless (exists $var{$id}) {
209 |             warn "Warning: no variant record for $F[1], skipped.\n";
210 |             next;
211 |         }
212 |         unless (defined $allele and ($allele eq "mut" or $allele eq "ref")) {
213 |             print "Wrong type for $F[1].\n";
214 |             next;
215 |         }
216 | 
217 |         ### Number of positions the variant occupies on the scanned
218 |         ### sequence: inserted bases exist only on the mutant sequence,
219 |         ### deleted bases only on the reference sequence.
220 |         my $type = $var{$id}{type};
221 |         my $span;
222 |         if ($type eq "snv") {
223 |             $span = 1;
224 |         }elsif ($type eq "ins") {
225 |             $span = $allele eq "mut" ? $var{$id}{len} : 1;
226 |         }elsif ($type eq "del") {
227 |             $span = $allele eq "ref" ? $var{$id}{len} : 1;
228 |         }elsif ($type eq "complex_indel") {
229 |             $span = length $var{$id}{$allele};
230 |         }else {
231 |             print "Wrong type for $type.\n";
232 |             next;
233 |         }
234 | 
235 |         ### A single covering hit is enough, so the flag is only raised.
236 |         $var{$id}{tf}{$tf}{$allele} ||=
237 |             span_hits_motif($var{$id}{start}, $span, $F[2], $F[3]);
238 |         $var{$id}{tf}{$tf}{fpkm} = $fpkm;
239 |     }
240 |     close $fh;
241 | }
242 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/src/core/src/ase_runs.pl:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/perl -w
  2 | ### updated 2019-04-08.
  3 | 
  4 | my $infile    = $ARGV[0];
  5 | my $delta_di  = $ARGV[1];
  6 | my $delta_cnv = $ARGV[2];
  7 | my $outfile   = $ARGV[3];
  8 | my $bedfile   = $ARGV[4];
  9 | my $num_markers = $ARGV[5];
 10 | my $frac_markers = $ARGV[6];
 11 | my $dist_markers = $ARGV[7];
 12 | 
 13 | my $cluster = 0;
 14 | 
 15 | print "$frac_markers\n";
 16 | 
 17 | #my $num_markers  = 15;      ### at least 4 markers to call a run.
 18 | #my $frac_markers = 0.75;    ### minimal 75% of markers to be "s" or "e" to call a run.
 19 | #my $dist_markers = 200000; ### markers separated over 200kb will not be joined into a run.
 20 | 
 21 | my (%runs,%buff,$last1,$last2);
 22 | my $buff = 0;
 23 | 
 24 | open IN, "< $infile" or die "$infile: $!";
 25 | while(<IN>) {
 26 |     chomp;
 27 |     next if $. == 1;
 28 |     my @F = split/\t/;
 29 |     my $tag = "f";
 30 | 
 31 |     if ($F[10] == 0 or $F[9] == 0) {
 32 |         $tag = "e";  ### extreme
 33 |     }elsif (($F[10] == 1 and $F[9] >= 4) or ($F[9] == 1 and $F[10] >= 4)) {
 34 |         if ($F[11] <= 0.05) {
 35 |             $tag = "e";  ### extreme with error but significant
 36 |         }else {
 37 |             $tag = "E";  ### extreme with error
 38 |         }
 39 |     }elsif ($F[11] <= 0.05) {
 40 |         if ($F[8] eq "diploid") {
 41 |             if ($F[12] >= $delta_di) {
 42 |                 $tag = "s";  ### significant
 43 |             }
 44 |         }elsif ($F[8] eq "cnvloh") {
 45 |             if ($F[12] >= $delta_cnv) {
 46 |                 $tag = "s";  ### significant
 47 |             }
 48 |         }else {
 49 |             print "Error: wrong tag for $F[8] at $F[0] $F[1].\n";
 50 |         }
 51 |     }else {
 52 |         1;
 53 |     }
 54 | 
 55 |     if ($tag eq "s" or $tag eq "e" or $tag eq "E") {
 56 |         if ($buff == 0) {
 57 |             $buff = 1;
 58 |             $buff{chrom} = $F[0];
 59 |             $buff{pos}   = $F[1];
 60 |             $buff{tags}  = $tag;
 61 |         }elsif ($F[0] ne $buff{chrom}) {
 62 |             my @t = split(//,$buff{tags});
 63 |             my $num = 0;
 64 |             my $perc = 0;
 65 |             my $numE = 0;
 66 |             my $percE = 0;
 67 |             my $totm = scalar(@t);
 68 |             for my $m (@t) {
 69 |                 $num++ if ($m eq "s" or $m eq "e");
 70 |                 $numE++ if $m eq "E";
 71 |             }
 72 |             $perc = $num / $totm;
 73 |             $percE = $numE / $totm;
 74 |             if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) {
 75 |                 $cluster++;
 76 |                 if ($t[$#t] ne "f" and $t[$#t-1] ne "f") {
 77 |                     my @pos = split(/,/,$buff{pos});
 78 |                     $runs{$cluster}{chrom} = $buff{chrom};
 79 |                     $runs{$cluster}{start} = $pos[0];
 80 |                     $runs{$cluster}{end}   = $pos[$#pos];
 81 |                     $runs{$cluster}{tags}  = $buff{tags};
 82 |                 }elsif ($t[$#t] eq "f") {
 83 |                     my @pos = split(/,/,$buff{pos});
 84 |                     $runs{$cluster}{chrom} = $buff{chrom};
 85 |                     $runs{$cluster}{start} = $pos[0];
 86 |                     $runs{$cluster}{end}   = $pos[$#pos-1];
 87 |                     for my $i (0 .. $#t-1) {
 88 |                         $runs{$cluster}{tags}  .= "$t[$i]";
 89 |                     }
 90 |                 }elsif ($t[$#t-1] eq "f") {
 91 |                     my @pos = split(/,/,$buff{pos});
 92 |                     $runs{$cluster}{chrom} = $buff{chrom};
 93 |                     $runs{$cluster}{start} = $pos[0];
 94 |                     $runs{$cluster}{end}   = $pos[$#pos-2];
 95 |                     for my $i (0 .. $#t-2) {
 96 |                         $runs{$cluster}{tags}  .= "$t[$i]";
 97 |                     }
 98 |                 }else {
 99 |                     print "Warning: $buff{tags}\n";
100 |                 }
101 |             }
102 |             %buff = ();
103 |             $buff = 1;
104 |             $buff{chrom} = $F[0];
105 |             $buff{pos}   = $F[1];
106 |             $buff{tags}  = $tag;
107 |         }else {
108 |             my @pos = split(/,/,$buff{pos});
109 |             if ($F[1] - $pos[$#pos] <= $dist_markers) {
110 |                 $buff{pos}  .= ",$F[1]";
111 |                 $buff{tags} .= $tag;
112 |             }else {
113 |                 my @t = split(//,$buff{tags});
114 |                 my $num = 0;
115 |                 my $perc = 0;
116 |                 my $numE = 0;
117 |                 my $percE = 0;
118 |                 my $totm = scalar(@t);
119 |                 for my $m (@t) {
120 |                     $num++ if ($m eq "s" or $m eq "e");
121 |                     $numE++ if $m eq "E";
122 |                 }
123 |                 $perc = $num / $totm;
124 |                 $percE = $numE / $totm;
125 |                 if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) {
126 |                     $cluster++;
127 |                     if ($t[$#t] ne "f" and $t[$#t-1] ne "f") {
128 |                         my @pos = split(/,/,$buff{pos});
129 |                         $runs{$cluster}{chrom} = $buff{chrom};
130 |                         $runs{$cluster}{start} = $pos[0];
131 |                         $runs{$cluster}{end}   = $pos[$#pos];
132 |                         $runs{$cluster}{tags}  = $buff{tags};
133 |                     }elsif ($t[$#t] eq "f") {
134 |                         my @pos = split(/,/,$buff{pos});
135 |                         $runs{$cluster}{chrom} = $buff{chrom};
136 |                         $runs{$cluster}{start} = $pos[0];
137 |                         $runs{$cluster}{end}   = $pos[$#pos-1];
138 |                         for my $i (0 .. $#t-1) {
139 |                             $runs{$cluster}{tags}  .= "$t[$i]";
140 |                         }
141 |                     }elsif ($t[$#t-1] eq "f") {
142 |                         my @pos = split(/,/,$buff{pos});
143 |                         $runs{$cluster}{chrom} = $buff{chrom};
144 |                         $runs{$cluster}{start} = $pos[0];
145 |                         $runs{$cluster}{end}   = $pos[$#pos-2];
146 |                         for my $i (0 .. $#t-2) {
147 |                             $runs{$cluster}{tags}  .= "$t[$i]";
148 |                         }
149 |                     }else {
150 |                         print "Warning: $buff{tags}\n";
151 |                     }
152 |                 }
153 |                 %buff = ();
154 |                 $buff = 1;
155 |                 $buff{chrom} = $F[0];
156 |                 $buff{pos}   = $F[1];
157 |                 $buff{tags}  = $tag;
158 |             }
159 |         }
160 |     }else {
161 |         if ($buff == 1) {
162 |             if ($F[0] ne $buff{chrom}) {
163 |                 my @t = split(//,$buff{tags});
164 |                 my $num = 0;
165 |                 my $perc = 0;
166 |                 my $numE = 0;
167 |                 my $percE = 0;
168 |                 my $totm = scalar(@t);
169 |                 for my $m (@t) {
170 |                     $num++ if ($m eq "s" or $m eq "e");
171 |                     $numE++ if $m eq "E";
172 |                 }
173 |                 $perc = $num / $totm;
174 |                 $percE = $numE/ $totm;
175 |                 if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) {
176 |                     $cluster++;
177 |                     if ($t[$#t] ne "f" and $t[$#t] ne "f") {
178 |                         my @pos = split(/,/,$buff{pos});
179 |                         $runs{$cluster}{chrom} = $buff{chrom};
180 |                         $runs{$cluster}{start} = $pos[0];
181 |                         $runs{$cluster}{end}   = $pos[$#pos];
182 |                         $runs{$cluster}{tags}  = $buff{tags};
183 |                     }elsif ($t[$#t] eq "f") {
184 |                         my @pos = split(/,/,$buff{pos});
185 |                         $runs{$cluster}{chrom} = $buff{chrom};
186 |                         $runs{$cluster}{start} = $pos[0];
187 |                         $runs{$cluster}{end}   = $pos[$#pos-1];
188 |                         for my $i (0 .. $#t-1) {
189 |                             $runs{$cluster}{tags}  .= "$t[$i]";
190 |                         }
191 |                     }elsif ($t[$#t-1] eq "f") {
192 |                         my @pos = split(/,/,$buff{pos});
193 |                         $runs{$cluster}{chrom} = $buff{chrom};
194 |                         $runs{$cluster}{start} = $pos[0];
195 |                         $runs{$cluster}{end}   = $pos[$#pos-2];
196 |                         for my $i (0 .. $#t-2) {
197 |                             $runs{$cluster}{tags}  .= "$t[$i]";
198 |                         }
199 |                     }else {
200 |                         print "Warning: $buff{tags}\n";
201 |                     }
202 |                 }else {
203 |                     1;
204 |                 }
205 |                 $buff = 0;
206 |                 %buff = ();
207 |             }elsif ($last1 ne "f" and $last2 ne "f") {
208 |                 my @pos = split(/,/,$buff{pos});
209 |                 if ($F[1] - $pos[$#pos] <= $dist_markers) {
210 |                     $buff{pos}  .= ",$F[1]";
211 |                     $buff{tags} .= $tag;
212 |                 }else {
213 |                     my @t = split(//,$buff{tags});
214 |                     my $num = 0;
215 |                     my $perc = 0;
216 |                     my $numE = 0;
217 |                     my $percE = 0;
218 |                     my $totm = scalar(@t);
219 |                     for my $m (@t) {
220 |                         $num++ if ($m eq "s" or $m eq "e");
221 |                         $numE++ if $m eq "E";
222 |                     }
223 |                     $perc = $num / $totm;
224 |                     $percE = $numE/ $totm;
225 |                     if ($totm >= $num_markers and $perc >= $frac_markers and $percE < 1) {
226 |                         $cluster++;
227 |                         if ($t[$#t] ne "f" and $t[$#t] ne "f") {
228 |                             my @pos = split(/,/,$buff{pos});
229 |                             $runs{$cluster}{chrom} = $buff{chrom};
230 |                             $runs{$cluster}{start} = $pos[0];
231 |                             $runs{$cluster}{end}   = $pos[$#pos];
232 |                             $runs{$cluster}{tags}  = $buff{tags};
233 |                         }elsif ($t[$#t] eq "f") {
234 |                             my @pos = split(/,/,$buff{pos});
235 |                             $runs{$cluster}{chrom} = $buff{chrom};
236 |                             $runs{$cluster}{start} = $pos[0];
237 |                             $runs{$cluster}{end}   = $pos[$#pos-1];
238 |                             for my $i (0 .. $#t-1) {
239 |                                 $runs{$cluster}{tags}  .= "$t[$i]";
240 |                             }
241 |                         }elsif ($t[$#t-1] eq "f") {
242 |                             my @pos = split(/,/,$buff{pos});
243 |                             $runs{$cluster}{chrom} = $buff{chrom};
244 |                             $runs{$cluster}{start} = $pos[0];
245 |                             $runs{$cluster}{end}   = $pos[$#pos-2];
246 |                             for my $i (0 .. $#t-2) {
247 |                                 $runs{$cluster}{tags}  .= "$t[$i]";
248 |                             }
249 |                         }else {
250 |                             print "Warning: $buff{tags}\n";
251 |                         }
252 |                     }else {
253 |                         1;
254 |                     }
255 |                     $buff = 0;
256 |                     %buff = ();
257 |                 }
258 |             }else {
259 |                 my @t = split(//,$buff{tags});
260 |                 my $num = 0;
261 |                 my $perc = 0;
262 |                 my $numE = 0;
263 |                 my $percE = 0;
264 |                 my $totm = scalar(@t);
265 |                 for my $m (@t) {
266 |                     $num++ if ($m eq "s" or $m eq "e");
267 |                     $numE++ if $m eq "E";
268 |                 }
269 |                 $perc = $num / $totm;
270 |                 $percE = $numE/ $totm;
271 |                 if ($totm >= ($num_markers+1) and $perc >= $frac_markers and $percE < 1) {
272 |                     $cluster++;
273 |                     if ($t[$#t] eq "f") {
274 |                         my @pos = split(/,/,$buff{pos});
275 |                         $runs{$cluster}{chrom} = $buff{chrom};
276 |                         $runs{$cluster}{start} = $pos[0];
277 |                         $runs{$cluster}{end}   = $pos[$#pos-1];
278 |                         for my $i (0 .. $#t-1) {
279 |                             $runs{$cluster}{tags}  .= "$t[$i]";
280 |                         }
281 |                     }elsif ($t[$#t-1] eq "f") {
282 |                         my @pos = split(/,/,$buff{pos});
283 |                         $runs{$cluster}{chrom} = $buff{chrom};
284 |                         $runs{$cluster}{start} = $pos[0];
285 |                         $runs{$cluster}{end}   = $pos[$#pos-2];
286 |                         for my $i (0 .. $#t-2) {
287 |                             $runs{$cluster}{tags}  .= "$t[$i]";
288 |                         }
289 |                     }else {
290 |                         print "Warning: $buff{tags}\n";
291 |                     }
292 |                 }else {
293 |                     1;
294 |                 }
295 |                 $buff = 0;
296 |                 %buff = ();
297 |             }
298 |         }else {
299 |             1;
300 |         }
301 |     }
302 | 
303 |     if ($. == 2) {
304 |         $last1 = $tag;
305 |         $last2 = $last1;
306 |     }else {
307 |         $last2 = $last1;
308 |         $last1 = $tag;
309 |     }
310 | }
311 | close IN;
312 | # Write one row per accepted run to the run table ($outfile) and a 4-column BED file ($bedfile).
313 | open(my $out_fh, '>', $outfile) or die "$outfile: $!";  # 3-arg open: filename is never parsed for a mode
314 | open(my $bed_fh, '>', $bedfile) or die "$bedfile: $!";
315 | print $out_fh "Run_ID\tChrom\tStart\tEnd\tLength\tNum_Markers\tTag_Markers\n";
316 | for my $c (sort {$a <=> $b} keys %runs) {  # numeric order of run IDs
317 |     my $len = $runs{$c}{end} - $runs{$c}{start};
318 |     my $num = length($runs{$c}{tags} // '');  # tags hold one character per marker; avoids scalar-context split (implicit split to @_ before Perl 5.12)
319 |     print $out_fh "$c\t$runs{$c}{chrom}\t$runs{$c}{start}\t$runs{$c}{end}\t$len\t$num\t$runs{$c}{tags}\n";
320 |     print $bed_fh "$runs{$c}{chrom}\t$runs{$c}{start}\t$runs{$c}{end}\t$c\n";
321 | }
322 | close($out_fh) or die "$outfile: $!";  # surface buffered write errors instead of silently truncating output
323 | close($bed_fh) or die "$bedfile: $!";
324 | 


--------------------------------------------------------------------------------
/dnanexus/cis-x/README.md:
--------------------------------------------------------------------------------
  1 | <!-- dx-header -->
  2 | # St. Jude cis-X
  3 | 
  4 | Search for activating regulatory variants in the tumor genome
  5 | <!-- /dx-header -->
  6 | 
  7 | Activating regulatory variants usually cause the cis-activation of target genes.
  8 | To find cis-activated genes, allelic specific/imbalance expressions (ASE) and
  9 | outlier high expression (OHE) signals are used. Variants in the same
 10 | topologically associated domains with the candidates can then be searched,
 11 | including structural variants (SV), copy number aberrations (CNA), and single
 12 | nucleotide variations (SNV) and insertion/deletions (indel).
 13 | 
 14 | A transcription factor binding analysis is also done, using motifs from
 15 | [HOCOMOCO] v10 models.
 16 | 
 17 | cis-X currently only works with hg19 (GRCh37).
 18 | 
 19 | [HOCOMOCO]: http://hocomoco11.autosome.ru/
 20 | 
 21 | ## Inputs
 22 | 
 23 |   * `sample-id`: The sample ID. This is primarily used as the prefix for the
 24 |     filenames of the results.
 25 | 
 26 |   * `results-dir`: The output directory. See "[Outputs](#outputs)" for the
 27 |     resulting files.
 28 | 
 29 |   * `markers`: A list of single nucleotide markers. This is a tab-delimited
 30 |     file with the following columns:
 31 | 
 32 |       * `Chr`: chromosome name for the marker
 33 |       * `Pos`: genomic start location for the marker
 34 |       * `Chr_Allele`: reference allele
 35 |       * `Alternative_Allele`: alternative allele
 36 |       * `reference_tumor_count`: reference allele count in the tumor genome
 37 |       * `alternative_tumor_count`: alternative allele count in the tumor genome
 38 |       * `reference_normal_count`: reference allele count in the matched normal genome
 39 |       * `alternative_normal_count`: alternative count in the matched normal genome
 40 | 
 41 |     This file can be generated with Bambino.
 42 | 
 43 |   * `cnv-loh`: CNV/LOH regions. It contains all the genomic regions carrying
 44 |     copy number variations (CNV) or loss of heterozygosity (LOH), which will be
 45 |     filtered out during analysis.
 46 | 
 47 |     This is a tab-delimited file in the bed format. It must have at least the
 48 |     following three columns:
 49 | 
 50 |       * `chrom`: chromosome name
 51 |       * `loc.start`: genomic start location
 52 |       * `loc.end`: genomic end location
 53 | 
 54 |     If no CNV/LOH are in the genome under analysis, a file with no rows (but
 55 |     including headers) can be provided.
 56 | 
 57 |     This file can be generated with CONSERTING.
 58 | 
 59 |   * `bam`: The RNA-Seq BAM file aligned to hg19 (GRCh37). The index file is
 60 |     expected to be in the same directory with the same name and extension
 61 |     `.bai`, e.g., `/path/to/SJ001_D1.bam` and `/path/to/SJ001_D1.bam.bai`.
 62 | 
 63 |     StrongArm or STAR can be used for RNA-Seq alignment.
 64 | 
 65 |   * `fpkm-matrix`: A gene expression table. This is a tab-delimited file
 66 |     containing gene level expressions for the tumor under analysis. The
 67 |     expressions are in FPKM (fragments per kilobase of transcript per million
 68 |     mapped reads).
 69 | 
 70 |       * `GeneID`: gene [Ensembl] ID
 71 |       * `GeneName`: gene symbol
 72 |       * `Type`: [transcript type](https://www.gencodegenes.org/gencode_biotypes.html)
 73 |       * `Status`: transcript status (must be `KNOWN`, `NOVEL`, or `PUTATIVE`)
 74 |       * `Chr`: chromosome name
 75 |       * `Start`: genomic start location
 76 |       * `End`: genomic end location
 77 |       * [SampleID...]: FPKM for the given sample
 78 | 
 79 |     This file can be generated with the output of HTseq-count
 80 |     preprocessed through `src/other/mergeData_geneName.pl`. The data must be
 81 |     able to match values in the given gene specific reference expression
 82 |     matrices (see [cis-X ref-exp]) generated from a larger cohort.
 83 | 
 84 |   * `snv-indel`: Somatic SNV/indels. This is a tab-delimited file containing
 85 |     somatic sequence mutations present in the genome under analysis. It includes
 86 |     both single nucleotide variants (SNV) and small insertion/deletions (indel).
 87 |     The file must have the following columns:
 88 | 
 89 |       * `chr`: chromosome name
 90 |       * `pos`: genomic start location
 91 |       * `ref`: reference allele genotype
 92 |       * `mutant`: mutant allele genotype
 93 |       * `type`: mutation type (either `snv` or `indel`)
 94 | 
 95 |     Note that the coordinate used for an indel is after the inserted sequence.
 96 | 
 97 |     If no SNV/indels are in the sample under analysis, a file with no rows
 98 |     (but including headers) can be provided.
 99 | 
100 |     This file can be created with Bambino and then preprocessed using the
101 |     steps taken in "[The genetic basis of early T-cell precursor acute lymphoblastic leukaemia][22237106]".
102 | 
103 |   * `sv`: Somatic SVs. This is a tab-delimited file containing somatic-acquired
104 |     structural variants (SV) in the cancer genome. The file must have the
105 |     following columns:
106 | 
107 |       * `chrA`: chromosome name of the left breakpoint
108 |       * `posA`: genomic location of the left breakpoint
109 |       * `ortA`: strand orientation of the left breakpoint
110 |       * `chrB`: chromosome name of the right breakpoint
111 |       * `posB`: genomic location of the right breakpoint
112 |       * `ortB`: strand orientation of the right breakpoint
113 | 
114 |     Strand orientations are denoted with a `+` for a sense or coding strand
115 |     and `-` for an antisense or non-coding strand.
116 | 
117 |     If no somatic SVs are in the sample under analysis, a file with no rows (but
118 |     including headers) can be provided.
119 | 
120 |     This file can be generated by CREST.
121 | 
122 |   * `cna`: Somatic CNV. This is a tab-delimited file containing the genomic
123 |     regions with somatic-acquired copy number aberrations (CNA) in the cancer
124 |     genome.
125 | 
126 |       * `chr`: chromosome name
127 |       * `start`: genomic start location
128 |       * `end`: genomic end location
129 |       * `logR`: log2 ratio
130 | 
131 |     If no somatic CNVs are in the sample under analysis, a file with no rows
132 |     (but including headers) can be provided.
133 | 
134 |     This file can be generated by CONSERTING.
135 | 
136 |   * `disease`: The disease name.
137 | 
138 |   * `cnv_loh_action`: The behavior when handling markers in CNV/LOH regions. Can
139 |     be either `keep` or `drop`.
140 | 
141 |   * `min_coverage_wgs`: The minimum coverage in WGS to be included in the
142 |     analysis.
143 | 
144 |   * `min_coverage_rna_seq`: The minimum coverage in RNA-seq to be included in
145 |     the analysis.
146 | 
147 |   * `fpkm_threshold_candidate`: The FPKM threshold for the nomination of a
148 |     cis-activated candidate.
149 | 
150 |   * `user_annotation`: Annotations for the candidate SNV/indels in BED format.
151 | 
152 |   * `chr_string`: Whether the names in the reference sequence dictionary are
153 |     prefixed with "chr".
154 | 
155 |   * `tad_info`: TAD information defining the regulatory territory used in
156 |     noncoding variant analysis.
157 | 
158 | [cis-X ref-exp]: https://github.com/stjude/cis-x/tree/master/src/ref-exp
159 | [22237106]: https://www.ncbi.nlm.nih.gov/pubmed/22237106
160 | 
161 | ## Outputs
162 | 
163 |   * `*.cisActivated.candidates.txt`: cis-activated candidates in the tumor
164 |     genome under analysis.
165 | 
166 |       * `gene`: gene accession number ([RefSeq] ID)
167 |       * `gsym`: gene symbol
168 |       * `chrom`: chromosome name
169 |       * `strand`: strand orientation
170 |       * `start`: genomic start location
171 |       * `end`: genomic end location
172 |       * `cdsStartStat`: coding sequence (CDS) start status
173 |       * `cdsEndStat`: coding sequence (CDS) end status
174 |       * `markers`: number of heterozygous markers in this gene
175 |       * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE)
176 |       * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers
177 |       * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers
178 |       * `pval_all_markers`: p-value for each marker in the ASE test
179 |       * `pval_ase_markers`: p-value for ASE markers in the ASE test
180 |       * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygous markers
181 |       * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers
182 |       * `comb.pval`: combined p-value for the ASE test
183 |       * `mean.delta`: average BAF difference between RNA and DNA for all markers
184 |       * `rawp`: raw p-value for the ASE test
185 |       * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni)
186 |       * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg)
187 |       * `FPKM`: FPKM value
188 |       * `loo.source`: which reference expression matrix was used in the outlier high expression (OHE) test
189 |       * `loo.cohort.size`: number of cases in the reference expression matrix for this gene
190 |       * `loo.pval`: p-value of the OHE test
191 |       * `loo.rank`: rank of the case under analysis among the reference cases
192 |       * `imprinting.status`: imprinting status of the gene
193 |       * `candidate.group`: status of the gene, combining both ASE and outlier tests
194 |       * `description`: status of the gene in COSMIC database
195 | 
196 |     Strand orientations are denoted with a `+` for a sense or coding strand
197 |     and `-` for an antisense or non-coding strand.
198 | 
199 |     Coding sequence status is typically one of "none" (not specified), "unk"
200 |     (unknown), "incmpl" (incomplete), or "cmpl" (complete).
201 | 
202 |   * `*.sv.candidates.txt`: Structural variant candidates predicted to be
203 |     causal for the cis-activated genes in the regulatory territory.
204 | 
205 |       * `left.candidate.inTAD`: cis-activated candidate near the left breakpoint
206 |       * `right.candidate.inTAD`: cis-activated candidate near the right breakpoint
207 |       * `chrA`: chromosome name of the left breakpoint
208 |       * `posA`: genomic location of the left breakpoint
209 |       * `ortA`: strand orientation of the left breakpoint
210 |       * `chrB`: chromosome name of the right breakpoint
211 |       * `posB`: genomic location of the right breakpoint
212 |       * `ortB`: strand orientation of the right breakpoint
213 |       * `type`: type of translocation
214 | 
215 |   * `*.cna.candidates.txt`: Copy number aberrations predicted to be causal
216 |     for the cis-activated genes in the regulatory territory.
217 | 
218 |       * `candidate.inTAD`: cis-activated candidate by the CNA
219 |       * `chr`: chromosome name
220 |       * `start`: genomic start position
221 |       * `end`: genomic end location
222 |       * `logR`: log ratio of the CNA
223 | 
224 |   * `*.snvindel.candidates.txt`: SNV/indel candidates predicted as functional
225 |     and predicted transcription factors. The mutations are also annotated for
226 |     known regulatory elements reported by the [NIH Roadmap Epigenomics Project]
227 |     by collecting 111 cell lines.
228 | 
229 |       * `chrom`: chromosome name
230 |       * `pos`: genomic start position
231 |       * `ref`: reference allele genotype
232 |       * `mut`: mutant allele genotype
233 |       * `type`: mutation type (either `snv` or `indel`)
234 |       * `target`: cis-activated candidate
235 |       * `dist`: distance between the mutation and transcription start sites of the target gene
236 |       * `tf`: transcription factors predicted to have the binding motif introduced by the mutation
237 |       * `EpiRoadmap_enhancer`: enhancer regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project])
238 |       * `EpiRoadmap_promoter`: promoter regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project])
239 |       * `EpiRoadmap_dyadic`: dyadic regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project])
240 |       * `User_Annot`: annotation from the user-provided BED file
241 | 
242 |   * `*.OHE.results.txt`: Raw results for outlier high expression test.
243 | 
244 |       * `Gene`: gene symbol
245 |       * `fpkm.raw`: FPKM value
246 |       * `size.bi`: number of cases in the bi-allelic reference cohort
247 |       * `p.bi`: p-value in the outlier test using the bi-allelic reference cohort
248 |       * `rank.bi`: rank of the expression level in the case under analysis compared to the bi-allelic reference cohort
249 |       * `size.cohort`: number of cases in the entire reference cohort
250 |       * `p.cohort`: p-value in the outlier test using the entire reference cohort
251 |       * `rank.cohort`: rank of the expression level in the case under analysis compared to the entire reference cohort
252 |       * `size.white`: number of cases in the whitelist reference cohort
253 |       * `p.white`: p-value in the outlier test using the whitelist reference cohort
254 |       * `rank.white`: rank of the expression level in the case under analysis compared to the whitelist reference cohort
255 |       * `tscore.white`: t-score representing if the gene showed outlier expression using the whitelist reference cohort
256 |       * `tscore.perc.white`: percentage of the t-score compared to the null distribution
257 | 
258 |   * `*.ase.gene.model.fdr.txt`: Raw results for gene level allelic specific
259 |     expression test.
260 | 
261 |       * `gene`: gene accession number ([RefSeq] ID)
262 |       * `gsym`: gene symbol
263 |       * `chrom`: chromosome name
264 |       * `strand`: strand orientation
265 |       * `start`: genomic start location
266 |       * `end`: genomic end location
267 |       * `cdsStartStat`: coding sequence (CDS) start status
268 |       * `cdsEndStat`: coding sequence (CDS) end status
269 |       * `markers`: number of heterozygous markers in this gene
270 |       * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE)
271 |       * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers
272 |       * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers
273 |       * `pval_all_markers`: p-value for each marker in the ASE test
274 |       * `pval_ase_markers`: p-value for ASE markers in the ASE test
275 |       * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygous markers
276 |       * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers
277 |       * `comb.pval`: combined p-value for the ASE test
278 |       * `mean.delta`: average BAF difference between RNA and DNA for all markers
279 |       * `rawp`: raw p-value for the ASE test
280 |       * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni)
281 |       * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg)
282 | 
283 |     Strand orientations are denoted with a `+` for a sense or coding strand
284 |     and `-` for an antisense or non-coding strand.
285 | 
286 |     Coding sequence status is typically one of "none" (not specified), "unk"
287 |     (unknown), "incmpl" (incomplete), or "cmpl" (complete).
288 | 
289 |   * `*.ase.combine.WGS.RNAseq.goodmarkers.binom.txt`: Raw results for single
290 |     marker based allelic specific expression test.
291 | 
292 |       * `chrom`: chromosome name
293 |       * `pos`: genomic start position
294 |       * `ref`: reference allele genotype
295 |       * `mut`: non-reference allele genotype
296 |       * `cvg_wgs`: coverage of the marker from the whole genome sequence (WGS)
297 |       * `mut_freq_wgs`: non-reference allele fraction in the WGS
298 |       * `cvg_rna`: coverage of the marker from the RNA-seq
299 |       * `mut_freq_rna`: non-reference allele fraction in the RNA-seq
300 |       * `ref.1`: read count of the reference allele in the RNA-seq
301 |       * `var`: read count of the non-reference allele in the RNA-seq
302 |       * `pvalue`: p-value from the binomial test
303 |       * `delta.abs`: absolute difference of the non-reference allele fraction between the WGS and RNA-seq
304 | 
305 | [Ensembl]: http://www.ensembl.org/
306 | [NIH Roadmap Epigenomics Project]: https://egg2.wustl.edu/roadmap/web_portal/index.html
307 | [RefSeq]: https://www.ncbi.nlm.nih.gov/refseq/
308 | 


--------------------------------------------------------------------------------
/src/core/README.md:
--------------------------------------------------------------------------------
  1 | # cis-X run
  2 | 
  3 | **cis-X run** searches for activating regulatory variants in the tumor genome.
  4 | 
  5 | ## Prerequisites
  6 | 
  7 |   * [Perl] ^5.10.1
  8 |     * [Data::Compare] ~1.25
  9 |   * [R] ^3.1.0
 10 |     * [multtest] ~2.36.0
 11 |   * [Java SE Runtime Environment] ~1.8.0_66
 12 |   * [MEME Suite] =4.9.0
 13 |   * [twoBitToFa]\*
 14 |   * [variants2matrix] (See below.)
 15 | 
 16 | \* UCSC Genome Browser binaries are not versioned. The latest versions
 17 | _should_ work.
 18 | 
 19 | [Perl]: https://www.perl.org/
 20 | [Data::Compare]: https://metacpan.org/pod/Data::Compare
 21 | [R]: https://www.r-project.org/
 22 | [multtest]: https://www.bioconductor.org/packages/release/bioc/html/multtest.html
 23 | [Java SE Runtime Environment]: http://www.oracle.com/technetwork/java/javase/overview/index.html
 24 | [MEME Suite]: http://meme-suite.org/
 25 | [twoBitToFa]: https://genome.ucsc.edu/goldenpath/help/twoBit.html
 26 | [variants2matrix]: #variants2matrix
 27 | 
 28 | ### variants2matrix
 29 | 
 30 | variants2matrix is a St. Jude tool that is available from [St. Jude Research]
 31 | (`variants2matrix.tar.gz`). It is expected to be in `PATH`, along with its
 32 | Perl library and Java class paths, e.g.,
 33 | 
 34 | ```
 35 | $ V2M_HOME=$CIS_X_HOME/vendor/variants2matrix
 36 | $ wget http://ftp.stjude.org/pub/software/cis-x/variants2matrix.tar.gz
 37 | $ tar xf variants2matrix.tar.gz --directory $CIS_X_HOME/vendor
 38 | $ export PATH=$V2M_HOME/bin:$PATH
 39 | $ export PERL5LIB=$V2M_HOME/lib/perl:$PERL5LIB
 40 | $ export CLASSPATH=$V2M_HOME/lib/java/bambino-1.0.jar:$V2M_HOME/lib/java/indelxref-1.0.jar:$V2M_HOME/lib/java/picard.jar:$V2M_HOME/lib/java/samplenamelib-1.0.jar:$CLASSPATH
 41 | ```
 42 | 
 43 | ### References
 44 | 
 45 | Reference files are not included with the source due to their large sizes.
 46 | 
 47 | Internal references are placed in `$CIS_X_HOME/refs`. These files can be
 48 | downloaded from [St. Jude Research] (`cis-x-refs-*.tar.gz`). It includes a
 49 | blacklist of problematic polymorphism markers and two reference expression
 50 | matrices for T-ALL and NBL.
 51 | 
 52 | External references are expected to be in `$CIS_X_HOME/refs/external`. These
 53 | are not distributed with cis-X, but the `cis-X seed` command can download and
 54 | generate them. See [cis-X seed] for more details and a list of required
 55 | reference files.
 56 | 
 57 | [cis-X seed]: https://github.com/stjude/cis-x/tree/master/src/seed
 58 | 
 59 | ## Usage
 60 | 
 61 | ```
 62 | cis-X-run
 63 | 
 64 | USAGE:
 65 |     cis-X run -s <sample-id> -o <results-dir> -l <markers> -g <cnv-loh> -b <bam> -e <fpkm-matrix> -m <snv-indel> -v <sv> -c <cna> -d <disease> -a <cnv-loh-action> -w <min coverage in WGS> -r <min coverage in RNA-seq> -f <FPKM threshold for nominate cis-activated candidate> -u <user-annotation> -h <chr-string> -t <tad-info>
 66 | 
 67 | ARGS:
 68 |     -s <sample-id>       Sample ID
 69 |     -o <results-dir>     Output directory
 70 |     -l <markers>         Path to single nucleotide markers
 71 |     -g <cnv-loh>         Path to CNV/LOH regions
 72 |     -b <bam>             Path to a RNA-Seq BAM (index must be in same directory)
 73 |     -e <fpkm-matrix>     Path to gene expression table
 74 |     -m <snv-indel>       Path to somatic SNV/indels
 75 |     -v <sv>              Path to somatic SVs
 76 |     -c <cna>             Path to somatic CNVs
 77 |     -d <disease>         Disease name
 78 |     -a <cnv-loh-action>  Action of markers in CNV/LOH regions, either keep or drop (default=keep)
 79 |     -w <min coverage in WGS>          Minimal coverage in WGS to include a heterozygous marker (default=10)
 80 |     -r <min coverage in RNA-seq>      Minimal coverage in RNA-seq to include a heterozygous marker (default=10)
 81 |     -f <fpkm threshold for candidate> FPKM threshold for nominate cis-activated candidate (default=5)
 82 |     -u <user-annotation> User applied annotation file in BED format (default=NotSpecified)
 83 |     -h <chr-string>      if the RNA-seq BAM with 'chr' in name, TRUE|FALSE (default=TRUE)
 84 |     -t <tad-info>         Path to the TAD annotation file in BED format in hg19 (default=hESC)
 85 | ```
 86 | 
 87 | ## Inputs
 88 | 
 89 | Running cis-X requires quite a few inputs.
 90 | 
 91 |   * `sample-id`: The sample ID. This is primarily used as the prefix for the
 92 |     filenames of the results.
 93 | 
 94 |   * `results-dir`: The output directory. See "[Outputs](#outputs)" for the
 95 |     resulting files.
 96 | 
 97 |   * `markers`: A list of single nucleotide markers. This is a tab-delimited
 98 |     file with the following columns:
 99 | 
100 |       * `Chr`: chromosome name for the marker
101 |       * `Pos`: genomic start location for the marker
102 |       * `Chr_Allele`: reference allele
103 |       * `Alternative_Allele`: alternative allele
104 |       * `reference_tumor_count`: reference allele count in the tumor genome
105 |       * `alternative_tumor_count`: alternative allele count in the tumor genome
106 |       * `reference_normal_count`: reference allele count in the matched normal genome
107 |       * `alternative_normal_count`: alternative count in the matched normal genome
108 | 
109 |     This file can be generated with Bambino.
110 | 
111 |   * `cnv-loh`: CNV/LOH regions. It contains all the genomic regions carrying
112 |     copy number variations (CNV) or loss of heterozygosity (LOH), which will be
113 |     filtered out during analysis.
114 | 
115 |     This is a tab-delimited file in the bed format. It must have at least the
116 |     following three columns:
117 | 
118 |       * `chrom`: chromosome name
119 |       * `loc.start`: genomic start location
120 |       * `loc.end`: genomic end location
121 | 
122 |     If no CNV/LOH are in the genome under analysis, a file with no rows (but
123 |     including headers) can be provided.
124 | 
125 |     This file can be generated with CONSERTING.
126 | 
127 |   * `bam`: The RNA-Seq BAM file aligned to hg19 (GRCh37). The index file is
128 |     expected to be in the same directory with the same name and extension
129 |     `.bai`, e.g., `/path/to/SJ001_D1.bam` and `/path/to/SJ001_D1.bam.bai`.
130 | 
131 |     StrongArm or STAR can be used for RNA-Seq alignment.
132 | 
133 |   * `fpkm-matrix`: A gene expression table. This is a tab-delimited file
134 |     containing gene level expressions for the tumor under analysis. The
135 |     expressions are in FPKM (fragments per kilobase of transcript per million
136 |     mapped reads).
137 | 
138 |       * `GeneID`: gene [Ensembl] ID
139 |       * `GeneName`: gene symbol
140 |       * `Type`: [transcript type](https://www.gencodegenes.org/gencode_biotypes.html)
141 |       * `Status`: transcript status (must be `KNOWN`, `NOVEL`, or `PUTATIVE`)
142 |       * `Chr`: chromosome name
143 |       * `Start` genomic start location
144 |       * `End`: genomic end location
145 |       * [SampleID...]: FPKM for the given sample
146 | 
147 |     This file can can be generated with the output of HTseq-count
148 |     preprocessed through `src/other/mergeData_geneName.pl`. The data must be
149 |     able to match values in the given gene specific reference expression
150 |     matrices (see [cis-X ref-exp]) generated from a larger cohort.
151 | 
152 |   * `snv-indel`: Somatic SNV/indels. This is a tab-delimited file containing
153 |     somatic sequence mutations present in the genome under analysis. It includes
154 |     both single nucleotide variants (SNV) and small insertion/deletions (indel).
155 |     The file must have the following columns:
156 | 
157 |       * `chr`: chromosome name
158 |       * `pos`: genomic start location
159 |       * `ref`: reference allele genotype
160 |       * `mutant`: mutant allele genotype
161 |       * `type`: mutation type (either `snv` or `indel`)
162 | 
163 |     Note that the coordinate used for an indel is after the inserted sequence.
164 | 
165 |     If no SNV/indels are in the sample under analysis, a file with no rows
166 |     (but including headers) can be provided.
167 | 
168 |     This file can be created with Bambino and then preprocessed using the
169 |     steps taken in "[The genetic basis of early T-cell precursor acute lymphoblastic leukaemia][22237106]".
170 | 
171 |   * `sv`: Somatic SVs. This is a tab-delimited file containing somatic-acquired
172 |     structural variants (SV) in the cancer genome. The file must have the
173 |     following columns:
174 | 
175 |       * `chrA`: chromosome name of the left breakpoint
176 |       * `posA`: genomic location of the left breakpoint
177 |       * `ortA`: strand orientation of the left breakpoint
178 |       * `chrB`: chromosome name of the right breakpoint
179 |       * `posB`: genomic location of the right breakpoint
180 |       * `ortB`: strand orientation of the right breakpoint
181 | 
182 |     Strand orientations are denoted with a `+` for a sense or coding strand
183 |     and `-` for an antisense or non-coding strand.
184 | 
185 |     If no somatic SVs are in the sample under analysis, a file with no rows (but
186 |     including headers) can be provided.
187 | 
188 |     This file can be generated by CREST.
189 | 
190 |   * `cna`: Somatic CNV. This is a tab-delimited file containing the genomic
191 |     regions with somatic-acquired copy number aberrations (CNA) in the cancer
192 |     genome.
193 | 
194 |       * `chr`: chromosome name
195 |       * `start`: genomic start location
196 |       * `end`: genomic end location
197 |       * `logR`: log2 ratio
198 | 
199 |     If no somatic CNVs are in the sample under analysis, a file with no rows
200 |     (but including headers) can be provided.
201 | 
202 |     This file can be generated by CONSERTING.
203 | 
204 |   * `disease`: The disease name.
205 | 
206 |   * `cnv_loh_action`: The behavior when handling markers in CNV/LOH regions. Can
207 |     be either `keep` or `drop`.
208 | 
209 |   * `min_coverage_wgs`: The minimum coverage in WGS to be included in the
210 |     analysis.
211 | 
212 |   * `min_coverage_rna_seq`: The minimum coverage in RNA-seq to be included in
213 |     the analysis.
214 | 
215 |   * `fpkm_threshold_candidate`: The FPKM threshold for the nomination of a
216 |     cis-activated candidate.
217 | 
218 |   * `user-annotation`: Annotations for the candidate SNV/indels in BED format.
219 | 
220 |   * `chr-string`: Whether the names in the reference sequence dictionary are
221 |     prefixed with "chr".
222 | 
223 |   * `tad-info`: TAD information defining the regulatory territory used in
224 |     noncoding variant analysis.
225 | 
226 | [cis-X ref-exp]: https://github.com/stjude/cis-x/tree/master/src/ref-exp
227 | [22237106]: https://www.ncbi.nlm.nih.gov/pubmed/22237106
228 | 
229 | ## Outputs
230 | 
231 | Results are saved as tab-delimited files to `$RESULTS_DIR`.
232 | 
233 |   * `*.cisActivated.candidates.txt`: cis-activated candidates in the tumor
234 |     genome under analysis.
235 | 
236 |       * `gene`: gene accession number ([RefSeq] ID)
237 |       * `gsym`: gene symbol
238 |       * `chrom`: chromosome name
239 |       * `strand`: strand orientation
240 |       * `start`: genomic start location
241 |       * `end`: genomic end location
242 |       * `cdsStartStat`: coding sequence (CDS) start status
243 |       * `cdsEndStat`: coding sequence (CDS) end status
244 |       * `markers`: number of heterozygous markers in this gene
245 |       * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE)
246 |       * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers
247 |       * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers
248 |       * `pval_all_markers`: p-value for each marker in the ASE test
249 |       * `pval_ase_markers`: p-value for ASE markers in the ASE test
250 |       * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygous markers
251 |       * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers
252 |       * `comb.pval`: combined p-value for the ASE test
253 |       * `mean.delta`: average BAF difference between RNA and DNA for all markers
254 |       * `rawp`: raw p-value for the ASE test
255 |       * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni)
256 |       * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg)
257 |       * `FPKM`: FPKM value
258 |       * `loo.source`: which reference expression matrix was used in the outlier high expression (OHE) test
259 |       * `loo.cohort.size`: number of cases in the reference expression matrix for this gene
260 |       * `loo.pval`: p-value of the OHE test
261 |       * `loo.rank`: rank of the case under analysis among the reference cases
262 |       * `imprinting.status`: imprinting status of the gene
263 |       * `candidate.group`: status of the gene, combining both ASE and outlier tests
264 |       * `description`: status of the gene in COSMIC database
265 | 
266 |     Strand orientations are denoted with a `+` for a sense or coding strand
267 |     and `-` for an antisense or non-coding strand.
268 | 
269 |     Coding sequence status is typically one of "none" (not specified), "unk"
270 |     (unknown), "incmpl" (incomplete), or "cmpl" (complete).
271 | 
272 |   * `*.sv.candidates.txt`: Structural variant candidates predicted to be
273 |     causal for the cis-activated genes in the regulatory territory.
274 | 
275 |       * `left.candidate.inTAD`: cis-activated candidate near the left breakpoint
276 |       * `right.candidate.inTAD`: cis-activated candidate near the right breakpoint
277 |       * `chrA`: chromosome name of the left breakpoint
278 |       * `posA`: genomic location of the left breakpoint
279 |       * `ortA`: strand orientation of the left breakpoint
280 |       * `chrB`: chromosome name of the right breakpoint
281 |       * `posB`: genomic location of the right breakpoint
282 |       * `ortB`: strand orientation of the right breakpoint
283 |       * `type`: type of translocation
284 | 
285 |   * `*.cna.candidates.txt`: Copy number aberrations predicted to be causal
286 |     for the cis-activated genes in the regulatory territory.
287 | 
288 |       * `candidate.inTAD`: cis-activated candidate by the CNA
289 |       * `chr`: chromosome name
290 |       * `start`: genomic start position
291 |       * `end`: genomic end location
292 |       * `logR`: log ratio of the CNA
293 | 
294 |   * `*.snvindel.candidates.txt`: SNV/indel candidates predicted to be functional,
295 |     along with their predicted transcription factors. The mutations are also annotated for
296 |     known regulatory elements reported by the [NIH Roadmap Epigenomics Project]
297 |     by collecting 111 cell lines.
298 | 
299 |       * `chrom`: chromosome name
300 |       * `pos`: genomic start position
301 |       * `ref`: reference allele genotype
302 |       * `mut`: mutant allele genotype
303 |       * `type`: mutation type (either `snv` or `indel`)
304 |       * `target`: cis-activated candidate
305 |       * `dist`: distance between the mutation and transcription start sites of the target gene
306 |       * `tf`: transcription factors predicted to have the binding motif introduced by the mutation
307 |       * `EpiRoadmap_enhancer`: enhancer regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project])
308 |       * `EpiRoadmap_promoter`: promoter regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project])
309 |       * `EpiRoadmap_dyadic`: dyadic regions that overlap with the mutation (from the [NIH Roadmap Epigenomics Project])
310 |       * `User_Annot`: annotation from the user-provided BED file
311 | 
312 |   * `*.OHE.results.txt`: Raw results for outlier high expression test.
313 | 
314 |       * `Gene`: gene symbol
315 |       * `fpkm.raw`: FPKM value
316 |       * `size.bi`: number of cases in the bi-allelic reference cohort
317 |       * `p.bi`: p-value in the outlier test using the bi-allelic reference cohort
318 |       * `rank.bi`: rank of the expression level in the case under analysis compared to the bi-allelic reference cohort
319 |       * `size.cohort`: number of cases in the entire reference cohort
320 |       * `p.cohort`: p-value in the outlier test using the entire reference cohort
321 |       * `rank.cohort`: rank of the expression level in the case under analysis compared to the entire reference cohort
322 |       * `size.white`: number of cases in the whitelist reference cohort
323 |       * `p.white`: p-value in the outlier test using the whitelist reference cohort
324 |       * `rank.white`: rank of the expression level in the case under analysis compared to the whitelist reference cohort
325 |       * `tscore.white`: t-score representing if the gene showed outlier expression using the whitelist reference cohort
326 |       * `tscore.perc.white`: percentage of the t-score compared to the null distribution
327 | 
328 |   * `*.ase.gene.model.fdr.txt`: Raw results for gene level allelic specific
329 |     expression test.
330 | 
331 |       * `gene`: gene accession number ([RefSeq] ID)
332 |       * `gsym`: gene symbol
333 |       * `chrom`: chromosome name
334 |       * `strand`: strand orientation
335 |       * `start`: genomic start location
336 |       * `end`: genomic end location
337 |       * `cdsStartStat`: coding sequence (CDS) start status
338 |       * `cdsEndStat`: coding sequence (CDS) end status
339 |       * `markers`: number of heterozygous markers in this gene
340 |       * `ase_markers`: number of heterozygous markers showing allelic specific expressions (ASE)
341 |       * `average_ai_all`: average B-allele frequency (BAF) difference between RNA and DNA for all heterozygous markers
342 |       * `average_ai_ase`: average BAF difference between RNA and DNA for ASE markers
343 |       * `pval_all_markers`: p-value for each marker in the ASE test
344 |       * `pval_ase_markers`: p-value for ASE markers in the ASE test
345 |       * `ai_all_markers`: BAF difference between RNA and DNA for all heterozygous markers
346 |       * `ai_ase_markers`: BAF difference between RNA and DNA for ASE markers
347 |       * `comb.pval`: combined p-value for the ASE test
348 |       * `mean.delta`: average BAF difference between RNA and DNA for all markers
349 |       * `rawp`: raw p-value for the ASE test
350 |       * `Bonferroni`: adjusted p-value for the ASE test (single-step Bonferroni)
351 |       * `ABH`: adjusted p-value for the ASE test (Benjamini-Hochberg)
352 | 
353 |     Strand orientations are denoted with a `+` for a sense or coding strand
354 |     and `-` for an antisense or non-coding strand.
355 | 
356 |     Coding sequence status is typically one of "none" (not specified), "unk"
357 |     (unknown), "incmpl" (incomplete), or "cmpl" (complete).
358 | 
359 |   * `*.ase.combine.WGS.RNAseq.goodmarkers.binom.txt`: Raw results for single
360 |     marker based allelic specific expression test.
361 | 
362 |       * `chrom`: chromosome name
363 |       * `pos`: genomic start position
364 |       * `ref`: reference allele genotype
365 |       * `mut`: non-reference allele genotype
366 |       * `cvg_wgs`: coverage of the marker from the whole genome sequence (WGS)
367 |       * `mut_freq_wgs`: non-reference allele fraction in the WGS
368 |       * `cvg_rna`: coverage of the marker from the RNA-seq
369 |       * `mut_freq_rna`: non-reference allele fraction in the RNA-seq
370 |       * `ref.1`: read count of the reference allele in the RNA-seq
371 |       * `var`: read count of the non-reference allele in the RNA-seq
372 |       * `pvalue`: p-value from the binomial test
373 |       * `delta.abs`: absolute difference of the non-reference allele fraction between the WGS and RNA-seq
374 | 
375 | [Ensembl]: http://www.ensembl.org/
376 | [NIH Roadmap Epigenomics Project]: https://egg2.wustl.edu/roadmap/web_portal/index.html
377 | [RefSeq]: https://www.ncbi.nlm.nih.gov/refseq/
378 | [St. Jude Research]: https://www.stjuderesearch.org/site/lab/zhang/cis-x
379 | 


--------------------------------------------------------------------------------