├── DNAcopy_1.48.0.tar.gz
├── Dockerfile
├── Dockstore.cwl
├── README.md
├── basicDNAcopy.R
├── meanLogRatioByChromosome.py
├── run_varscan
├── separateArms.py
└── src
    └── cbsToBed.py


/DNAcopy_1.48.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Jeltje/varscan2/c57bd2b0cba5168fc8b7a64f30cbbc9fd4608e22/DNAcopy_1.48.0.tar.gz


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:14.04
 2 | # MAINTAINER is deprecated; the maintainer is recorded as a label instead
 3 | LABEL maintainer="Jeltje van Baren, jeltje.van.baren@gmail.com"
 4 | 
 5 | # create a working directory and work from there
 6 | RUN mkdir /tmp/install
 7 | WORKDIR /tmp/install
 8 | 
 9 | RUN apt-get update && apt-get install -y \
10 | 	gcc \
11 | 	make \
12 | 	zlib1g-dev \
13 | 	git \
14 | 	wget \
15 | 	python-numpy \
16 | 	default-jre \
17 | 	r-base \
18 | 	bc
19 | 
20 | # DNAcopy version keeps changing so deprecated this:
21 | # R and DNAcopy package (move to R library location)
22 | #RUN apt-get install -y r-base
23 | #RUN wget http://www.bioconductor.org/packages/release/bioc/src/contrib/DNAcopy_1.40.0.tar.gz
24 | # instead:
25 | COPY ./DNAcopy_1.48.0.tar.gz ./
26 | RUN R CMD INSTALL DNAcopy_1.48.0.tar.gz
27 | 
28 | # Samtools 0.1.18 - note: 0.1.19 and 1.1 do NOT work, VarScan copynumber dies on the mpileup
29 | RUN wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.18/samtools-0.1.18.tar.bz2
30 | RUN tar -xvf samtools-0.1.18.tar.bz2
31 | # the make command generates a lot of warnings, none of them relevant to the final samtools code, hence 2>/dev/null
32 | RUN (cd samtools-0.1.18/ && make DFLAGS='-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=0' LIBCURSES='' 2>/dev/null && mv samtools /usr/local/bin)
33 | 
34 | # get varscan
35 | RUN wget  -O /usr/local/bin/VarScan.jar https://github.com/dkoboldt/varscan/releases/download/2.4.2/VarScan.v2.4.2.jar
36 | 
37 | # Move wrapper and helper scripts to same location (COPY: plain local files, none of ADD's archive/URL handling needed)
38 | COPY ./run_varscan /usr/local/bin/
39 | COPY ./separateArms.py /usr/local/bin/
40 | COPY ./basicDNAcopy.R /usr/local/bin/
41 | COPY ./meanLogRatioByChromosome.py /usr/local/bin/
42 | 
43 | # Set WORKDIR to /data -- predefined mount location.
44 | RUN mkdir /data
45 | WORKDIR /data
46 | 
47 | # And clean up
48 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/install
49 | 
50 | ENTRYPOINT ["bash", "/usr/local/bin/run_varscan"]
51 | 
52 | 
53 | 


--------------------------------------------------------------------------------
/Dockstore.cwl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env cwl-runner
 2 | 
 3 | class: CommandLineTool
 4 | id: "Varscan2"
 5 | label: "Varscan2 workflow"
 6 | cwlVersion: v1.0
 7 | doc: |  # "doc" is the CWL v1.0 field name; "description" was its draft-3 spelling
 8 |     A Docker container for a Varscan2 workflow. See the [github repo](https://github.com/Jeltje/varscan2) for more information.
 9 |     ```
10 |     Usage:
11 |     # fetch CWL
12 |     $> dockstore cwl --entry quay.io/jeltje/varscan2:v1.0.2 > Dockstore.cwl
13 |     # make a runtime JSON template and edit it (or use the content of sample_configs.json in this git repo)
14 |     $> dockstore convert cwl2json --cwl Dockstore.cwl > Dockstore.json
15 |     # run it locally with the Dockstore CLI
16 |     $> dockstore launch --entry quay.io/jeltje/varscan2:v1.0.2 \
17 |         --json Dockstore.json
18 |     ```
19 | 
20 | dct:creator:
21 |   "@id": "jeltje"
22 |   foaf:name: Jeltje van Baren
23 |   foaf:mbox: "mailto:jeltje.van.baren@gmail.com"
24 | 
25 | requirements:
26 |   - class: DockerRequirement
27 |     dockerPull: "quay.io/jeltje/varscan2:v1.0.2"
28 | 
29 | hints:
30 |   - class: ResourceRequirement
31 |     coresMin: 1
32 |     ramMin: 4096  # in mebibytes: the 4G stated in the doc line below (was 4092)
33 |     outdirMin: 512000
34 |     doc: "the process requires at least 4G of RAM"
35 | inputs:  # each prefix below is a command-line option of the run_varscan wrapper
36 |   - id: "#genome"
37 |     type: File
38 |     doc: "Genome fasta"
39 |     format: "http://edamontology.org/format_1929"
40 |     inputBinding:
41 |       prefix: -i
42 |     secondaryFiles:
43 |     - .fai  # samtools faidx index, staged next to the fasta
44 | 
45 |   - id: "#centromeres"
46 |     type: File
47 |     doc: "Centromere bed file"
48 |     format: "http://edamontology.org/format_3003"
49 |     inputBinding:
50 |       prefix: -b
51 | 
52 |   - id: "#targets"
53 |     type: File
54 |     doc: "Exome Targets bed file"
55 |     format: "http://edamontology.org/format_3003"
56 |     inputBinding:
57 |       prefix: -w
58 | 
59 |   - id: "#control_bam_input"
60 |     type: File
61 |     doc: "The control exome BAM file used as input, it must be sorted."
62 |     format: "http://edamontology.org/format_2572"
63 |     inputBinding:
64 |       prefix: -c 
65 | 
66 |   - id: "#tumor_bam_input"
67 |     type: File
68 |     doc: "The tumor exome BAM file used as input, it must be sorted."
69 |     format: "http://edamontology.org/format_2572"
70 |     inputBinding:
71 |       prefix: -t 
72 | 
73 |   - id: "#sample_id"
74 |     type: string  # required here, although run_varscan itself falls back to "sample" without -q
75 |     doc: "sample ID to use in output"
76 |     inputBinding:
77 |       prefix: -q 
78 | 
79 | 
80 | stdout: output.cnv
81 | # run_varscan prints the final segment table on stdout; it is captured in output.cnv
82 | outputs: 
83 |   - id: output
84 |     type: stdout
85 | # No executable is named: presumably the image ENTRYPOINT (run_varscan) receives these as its -s <tmp dir> option
86 | baseCommand: ["-s", "/var/spool/cwl"]
87 | 
88 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Varscan2
  2 | 
  3 | **This repository contains code to create a docker implementation of the Varscan2.4.2 copynumber variation (CNV) caller.**
  4 | 
  5 | Varscan2 was developed by Dan Koboldt (see References below). It can be used to detect copy number variation (CNV) in sample pairs, usually exomes from a tumor and control from one patient.
  6 | 
  7 | The Varscan2 executable (https://github.com/dkoboldt/varscan.git) combines several tools. It is meant to be run in a pipeline, during which different tools are called in sequence. For details on Varscan, see http://dkoboldt.github.io/varscan/
  8 | 
  9 | 
 10 | This repository ONLY contains a pipeline for Varscan2 **copynumber variation**. If you want to run other Varscan tools, please use Varscan2 directly. This docker container contains a wrapper script that uses Varscan tools and other programs *with specific parameters*. These may not be the perfect parameters for your particular samples. See below for the full list of pipeline steps.
 11 | 
 12 | **Inputs** to the program are a tumor/control pair of BAM files and several [bed format](https://genome.ucsc.edu/FAQ/FAQformat#format1) helper files (see below). Your input bam files must be sorted by coordinate (try `samtools sort`)
 13 | **Output** is a file with chromosome segments that are scored for amplification or deletion.
 14 | 
 15 | To get per-gene output, these scores must be mapped to an annotation, for example using [this program](https://github.com/Jeltje/cnvtogenes).
 16 | 
 17 | ## The code
 18 | 
 19 | The Varscan wrapper script runs the following:
 20 | 
 21 | 1. samtools flagstat on each bam file
 22 | 2. samtools mpileup on both bam files
 23 | 3. Determine unique mapped read ratio
 24 | 4. Varscan copynumber
 25 | 5. Remove low coverage regions
 26 | 6. Varscan copyCaller
 27 | 7. Calculate median for recentering
 28 | 8. Varscan copyCaller recenter
 29 | 9. Separate chromosome arms
 30 | 10. DNAcopy (CBS)
 31 | 11. Merge chromosome arms
 32 | 
 33 | The chromosome arms are separated before the Circular Binary Segmentation (CBS) step to avoid making calls across centromeres.
 34 | 
 35 | ## Getting the docker container
 36 | 
 37 | The latest Varscan docker image can be downloaded directly from quay.io using
 38 | `docker pull quay.io/jeltje/varscan2`
 39 | 
 40 | Alternatively, you can build from the github repo:
 41 | ```
 42 | git clone https://github.com/jeltje/varscan2.git
 43 | cd varscan2
 44 | docker build -t jeltje/varscan2 .
 45 | ```
 46 | 
 47 | ## Running the docker container
 48 | 
 49 | For details on running docker containers in general, see the excellent tutorial at https://docs.docker.com/
 50 | 
 51 | To see a usage statement, run
 52 | 
 53 | ```
 54 | docker run jeltje/varscan2 -h
 55 | ```
 56 | 
 57 | ### Example input:
 58 | 
 59 | ```
 60 | docker run  -v /path/to/input/files:/data jeltje/varscan2 -c normal.bam -t  tumor.bam -q sampleid -i genome.fa -b centromeres.bed -w targets.bed -s tmpdir > varscan.cnv
 61 | 
 62 | ```
 63 | 
 64 | where
 65 | 
 66 | `normal.bam` and `tumor.bam`    are BAM format files of exome reads aligned to the genome. 
 67 | 
 68 | `sampleid` is an identifier for the patient. This will be used in the output.
 69 | 
 70 | `genome.fa` is a fasta file containing the genome that was used to create the BAM files. A samtools indexed `.fai` file must be present in the same directory as this file (for details see Other Considerations, below)
 71 | 
 72 | `centromeres.bed` is a [bed format file](https://genome.ucsc.edu/FAQ/FAQformat#format1) containing centromere locations. This list is used to remove centromeres from the CBS calls.
 73 | 
 74 | `targets.bed` is a list of exome targets in bed format. This is used as a 'whitelist' of genome regions so that off target alignments will not be used for analysis
 75 | 
 76 | `tmpdir` is a directory for temporary output files. If you set option -d, these files will be kept
 77 | 
 78 | Keep in mind that all these file locations must be with respect to your `/path/to/input/files`.
 79 | 
 80 | Centromeres for hg19 are provided in the `/data` directory.
 81 | 
 82 | >       You can find centromere locations for genomes via
 83 | >       http://genome.ucsc.edu/cgi-bin/hgTables
 84 | >       Using the following selections:
 85 | >       - group: Mapping and Sequencing
 86 | >       - track:gap
 87 | >       -       filter - goes to new page, look for 'type does match' and type centromere, submit
 88 | >       -       output format: bed
 89 | >       Submit, on the next page just press Get Bed
 90 | 
 91 | 
 92 | ## Output
 93 | 
 94 | Output is written to `STDOUT` and uses the following format:
 95 | ```
 96 | sampleID    chrom    loc.start    loc.end    num.mark    seg.mean
 97 | 
 98 | ```
 99 | 
100 | To get amplified or deleted segments from this file, a threshold must be applied to `seg.mean`. A common choice is `0.25` for
101 | amplifications and `-0.25` for deletions, combined with a minimum of 10 markers (`num.mark`) per segment.
102 | 
103 | 
104 | ## Other considerations
105 | 
106 | Tumor and control really must be from the same patient and processed in the same experiment. Batch effects are strong in exome experiments and using the wrong control renders Varscan output meaningless.
107 | 
108 | To index a genome, run
109 | ```
110 | 	samtools faidx <genome.fa>
111 | ```
112 | This creates an index named genome.fa.fai
113 | The genome and the index must be in the same directory, and the genome file (not the index) is the input to run_varscan
114 | 
115 | The whitelist is a bed format file with the exome targets used in the experiment. It ensures that Varscan only uses target regions for its analysis and not any off target read matches. It is important to use the real list of exome targets. For meaningful results do not use a generic list.
116 | 
117 | 
118 | ## References
119 | 
120 | Koboldt DC, Zhang Q, Larson DE, Shen D, McLellan MD, Lin L, Miller CA, Mardis ER, Ding L, Wilson RK. 
121 | VarScan 2: somatic mutation and copy number alteration discovery in cancer by exome sequencing. 
122 | Genome Res. 2012 Mar;22(3):568-76. doi: 10.1101/gr.129684.111.
123 | 


--------------------------------------------------------------------------------
/basicDNAcopy.R:
--------------------------------------------------------------------------------
 1 | library(DNAcopy)
 2 | 
 3 | args <- commandArgs(TRUE)
 4 | 
 5 | # arguments are input (table with header; columns chrom, chr_stop, adjusted_log_ratio are used) and SD
 6 | if (length(args) < 2) stop("usage: basicDNAcopy.R <input> <SD>")	# fail early instead of inside read.table/segment
 7 | # Always use the same random seed for reproducible results
 8 | set.seed(0xcafe)	# cafe is a hex number
 9 | 
10 | cn <- read.table(args[1], header=TRUE)
11 | sd <- as.double(args[2])	# passed to segment() as undo.SD
12 | CNA.object <- CNA(genomdat = cn$adjusted_log_ratio, chrom = cn$chrom, maploc = cn$chr_stop,
13 | 	data.type = 'logratio', sampleid = "sample")
14 | 
15 | smoothed.CNA.object <- smooth.CNA(CNA.object)
16 | 
17 | segment.smoothed.CNA.object <- segment(smoothed.CNA.object, undo.splits="sdundo", undo.SD=sd, verbose=1)
18 | p.segment.smoothed.CNA.object <- segments.p(segment.smoothed.CNA.object)
19 | 
20 | outfile <- paste(args[1], "SD", sd, "dnacopy.out", sep=".")	# e.g. <input>.SD.2.5.dnacopy.out
21 | # only the first six columns are written; FALSE is spelled out because F is an ordinary, reassignable variable
22 | write.table(p.segment.smoothed.CNA.object[,1:6], file=outfile, quote=FALSE, row.names=FALSE, sep="\t")
23 | 
24 | detach(package:DNAcopy)
25 | 


--------------------------------------------------------------------------------
/meanLogRatioByChromosome.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys, os, re, argparse
 4 | from numpy import *
 5 | 
 6 | parser = argparse.ArgumentParser(description="Calculates log average per chromosome")
 7 | parser.add_argument('cpcalled', type=str,help="copyCalled output")
 8 | 
 9 | class Chrom(object):
10 |     """Holds fragment scores by chromosomes """
11 |     def __init__(self, chrom, score, endpos):
12 |         self.frags = []
13 | 	self.chrom = chrom
14 | 	self.add(chrom, score, endpos)
15 |     def add(self, chrom, score, endpos):
16 | 	if self.chrom == chrom:
17 | 	    self.frags.append(score)
18 | 	    self.lastfrag = endpos
19 | 	    return True
20 | 	else:
21 | 	    return False
22 |     def stats(self):
23 | 	self.mean = mean(self.frags)
24 | 	self.std = std(self.frags)
25 | 
26 | 
27 | if len(sys.argv)==1:
28 |     parser.print_help()
29 |     sys.exit(1)
30 | args = parser.parse_args()
31 | 
32 | # Main
33 | chromTable = []	# holds chrom objects
34 | curChrom = Chrom('empty', 0, 0)
35 | f = open(args.cpcalled,'r')
36 | for line in f:
37 |     line = line.strip()
38 |     fields = line.split("\t")
39 |     chr = fields[0]
40 |     if chr == "chrom":
41 | 	continue
42 |     score = float(fields[6])
43 |     endpos = int(fields[2])
44 |     if not (curChrom.add(chr, score, endpos)):
45 | 	curChrom = Chrom(chr, score, endpos)
46 | 	chromTable.append(curChrom)
47 | f.close()
48 | 
49 | if len(chromTable) < 3:
50 |     print >>sys.stderr, "ERROR: Please enter whole genome file"
51 |     sys.exit(1)
52 | 
53 | means = []
54 | for chr in chromTable:
55 |     if chr.lastfrag < 60000000:	# skip MT, GL, Y
56 | #    if len(chr.frags) < 3000:   # skip MT, GL, Y
57 | 	continue
58 |     chr.stats()
59 |     means.append(chr.mean)
60 | 
61 | med = len(means)/2   # this rounds, which is perfect
62 | means.sort()
63 | print "%.2f" % mean([means[med-1], means[med], means[med+1]])
64 | 
65 | 


--------------------------------------------------------------------------------
/run_varscan:
--------------------------------------------------------------------------------
  1 | #! /bin/bash
  2 | 
  3 | print_usage(){	# writes the help text below to stderr; $0 in the heredoc expands to the script name
  4 | >&2 cat <<EOF
  5 | $0 -c <control bam file> -t <tumor bam file> -q <sample ID> -i <genome fasta> -b <centromere bed file> -w <exome whitelist> -s <tmp output>
  6 | 	Wrapper script for Varscan2
  7 | 	Runs the following steps:
  8 | 	1. samtools flagstat on each bam file
  9 | 	2. samtools mpileup on both bam files
 10 | 	3. determine unique mapped read ratio
 11 | 	4. VarScan copynumber
 12 | 	5. Remove low coverage regions
 13 | 	6. VarScan copyCaller
 14 | 	7. calculate median for recentering
 15 | 	8. VarScan copyCaller recenter
 16 | 	9. Separate chromosome arms
 17 | 	10. DNAcopy
 18 | 	11. Merge chromosome arms
 19 | 
 20 | OPTIONS:
 21 |    -h      Show this message
 22 |    -t      tumor bam file
 23 |    -c      control bam file
 24 |    -q      optional sample ID, eg TCGA-001-4-2018
 25 |    -i      path to indexed genome (index using samtools faidx)
 26 |    -b      centromere locations (bed format)
 27 |    -w      exome whitelist (bed format)
 28 |    -s      directory for temporary files. The script creates a directory varscan.N inside.
 29 |    -d      do not delete temporary output
 30 |    -n      <varscan.N> instead of creating a new temporary directory, use this one
 31 | 
 32 | EOF
 33 | }
 34 | 
 35 | cBam='False'	# 'False' marks a required file option as not given (see file_check)
 36 | tBam='False'
 37 | sampleid="sample"	# default ID; replaced by -q
 38 | idx='False'
 39 | cent='False'
 40 | white='False'
 41 | scratch=	# -s: parent directory for a new temporary directory
 42 | prevDir=	# -n: existing temporary directory to reuse
 43 | cleanup=true	# run as a command by the final clean-up test; -d sets it to false
 44 | 
 45 | while getopts "ht:c:q:i:b:w:s:n:d" OPTION	# the former "a:" had no handler and no documentation
 46 | do
 47 |      case $OPTION in
 48 |          h)
 49 |              print_usage
 50 |              exit
 51 |              ;;
 52 |          t)
 53 |              tBam=$OPTARG
 54 |              ;;
 55 |          c)
 56 |              cBam=$OPTARG
 57 |              ;;
 58 |          q)
 59 |              sampleid=$OPTARG
 60 |              ;;
 61 |          i)
 62 |              idx=$OPTARG
 63 |              ;;
 64 |          b)
 65 |              cent=$OPTARG
 66 |              ;;
 67 |          w)
 68 |              white=$OPTARG
 69 |              ;;
 70 |          s)
 71 |              scratch=$OPTARG
 72 |              ;;
 73 |          d)
 74 |              cleanup=false
 75 |              ;;
 76 |          n)
 77 |              prevDir=$OPTARG
 78 |              ;;
 79 |          *)	# getopts reports an unknown option or a missing argument as '?'
 80 |              print_usage
 81 |              exit 1	# a bad command line is an error; plain `exit` returned 0 here
 82 |              ;;
 83 |      esac
 84 | done
 85 | 
 86 | graceful_death() {	# $1: reason; prints it on stderr and ends the script with status 1
 87 | 	printf '%s\n' "ERROR: Cannot finish $0 because $1" 1>&2
 88 | 	exit 1
 89 | }
 90 | 
 91 | # Check if all file arguments have been given and are valid
 92 | file_check() {	# $1: value of a file option; 'False' is the "not given" default
 93 |     if [[ "$1" == 'False' ]]; then
 94 |         print_usage
 95 |         graceful_death "some input arguments are missing"
 96 |     fi
 97 |     if [[ ! -e "$1" ]]; then
 98 |         print_usage
 99 |         graceful_death "can't find $1"
100 |     fi
101 | }
102 | # quoted: an empty value is now checked (and rejected) instead of silently dropped from the list
103 | for i in "$cBam" "$tBam" "$idx" "$cent" "$white"; do
104 |         file_check "$i"
105 | done
106 | 
107 | # Sanity check
108 | tmpdir=
109 | if [[ -z "$prevDir" ]] && [[ -z "$scratch" ]]; then
110 | 	graceful_death "Please give either the -n OR -s option"
111 | fi
112 | 
113 | # select correct temp dir (a new one must really be new: runOrDie skips every step whose output file already exists)
114 | if  [[ -z "$prevDir" ]]; then
115 | 	if [ ! -d "$scratch" ]; then
116 | 		graceful_death "cannot find scratch output dir $scratch"
117 | 	fi
118 | 	tmpExt=$RANDOM
119 | 	tmpdir="$scratch/varscan.$tmpExt"
120 | 	mkdir "$tmpdir" || graceful_death "cannot create $tmpdir (it may already exist; use -n to resume a previous run)"	# no -p: never adopt an existing varscan.N
121 | else
122 | 	if [ ! -d "$prevDir" ]; then
123 | 		graceful_death "cannot find previous run directory $prevDir"
124 | 	fi
125 | 	tmpdir=$prevDir
126 | fi
127 | >&2 echo "Output files will be stored in $tmpdir"
128 | 
129 | # Make sure all inputs are for the same genome
130 | >&2 echo "Checking inputs..."
131 | 
132 | # Check that the bam files are sorted: the @HD header line must carry the tag SO:coordinate
133 | issort(){	# $1: bam file; SO: is found by name, so other tags placed before it on @HD no longer hide it
134 |   didsort=$(samtools view -H "$1" | grep '^@HD' | tr '\t' '\n' | grep -m 1 '^SO:')
135 |   if [[ "$didsort" != 'SO:coordinate' ]]; then
136 |     graceful_death "it looks like $1 is not sorted by coordinate, please run samtools sort"
137 |   fi
138 | }
139 | issort "$tBam"
140 | issort "$cBam"
141 | 
142 | contain(){
143 |   if [[ $(comm -23 $1 $2) ]]; then
144 |     graceful_death "some or all of the chromosomes in the input $3 cannot be found in the genome fasta; please make sure all your inputs are for the same genome version"
145 |   fi
146 | }
147 | 
148 | samtools view -H $tBam | grep ^@SQ | cut -f2 | cut -f2 -d':' | sort > $tmpdir/bam.chrs
149 | cut -f1 $cent | sort > $tmpdir/cent.chrs
150 | cut -f1 $white | sort -u > $tmpdir/target.chrs
151 | grep '>' $idx | sed 's/\s.*//' | sed 's/>//' | sort > $tmpdir/geno.chrs
152 | 
153 | # all bam chrs, and all bed chromosomes should be in the genome (but not vice versa)
154 | contain $tmpdir/bam.chrs $tmpdir/geno.chrs $tBam
155 | contain $tmpdir/cent.chrs $tmpdir/geno.chrs $cent
156 | contain $tmpdir/target.chrs $tmpdir/geno.chrs $white
157 | 
158 | # checks if a file exists and has more than one line in it
159 | # several programs in this wrapper will output a single line if they fail
160 | exists(){
161 |   if [ -e "$1" ]
162 |   then
163 |     ct=$(head -n 2 $1 | wc -l | cut -f1 -d' ')
164 |     if [ "$ct" -eq "2" ]; then
165 |         return 0
166 |     else
167 |         return 1
168 |     fi
169 |   else
170 |     return 1
171 |   fi
172 | }
173 | 
174 | # runOrDie gets its variables directly from MAIN
175 | runOrDie(){	# reads the globals $infile, $outfile and $cmd; $DEBUG is optional
176 | 	if exists "$outfile" ; then	# output is already there with at least two lines
177 | 	    return 0	# nothing to be done
178 | 	fi
179 | 	for file in $infile; do	# $infile may hold several space-separated names
180 | 		ext=$(echo $file | sed "s/.*\.//");	# text after the last dot
181 | 		[ "$ext" == "bam" ] && continue	# do not check bam files again
182 | 		if ! exists "$file" && [ -z $DEBUG ]; then
183 | 			graceful_death "cannot run $cmd: missing or corrupted $infile"
184 | 		fi
185 | 	done
186 | 	>&2 echo $cmd	# log the command on stderr
187 | 	if [[ -z $DEBUG ]]; then	# with DEBUG set the command is echoed above but not run
188 | 		date >&2
189 | 		eval $cmd	# eval so that redirections written inside $cmd take effect
190 | 		if ! exists "$outfile" ; then
191 | 			graceful_death "failed to find $outfile"
192 | 		fi
193 | 	fi
194 | }
195 | 
196 | 
197 | 
198 | 
199 | # correct version of samtools? ("$sVersion" is quoted so an empty result, e.g. samtools not found, fails the check too)
200 | cmd="samtools 2>&1 | grep Version | cut -f2 -d' '"
201 | sVersion=$(eval $cmd)
202 | if [ "$sVersion" != "0.1.18" ]; then
203 | 	graceful_death "wrong samtools version: expected 0.1.18, got $sVersion"
204 | fi
205 | 
206 | # find location of run script so we can get the other necessary scripts
207 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
208 | DNAcopy=$DIR/basicDNAcopy.R
209 | findDelta=$DIR/meanLogRatioByChromosome.py
210 | separateArms=$DIR/separateArms.py
211 | varScan="nice java -Xmx2048m -jar $DIR/VarScan.jar"
212 | 
213 | ########## MAIN ################
214 | 
215 | # Samtools flagstat
216 | infile="$cBam"
217 | outfile="$tmpdir/control.flagstat"
218 | cmd="samtools flagstat $infile > $outfile"
219 | runOrDie 
220 | 
221 | infile=$tBam
222 | outfile="$tmpdir/tumor.flagstat"
223 | cmd="samtools flagstat $infile > $outfile"
224 | runOrDie
225 | 
226 | # Samtools mpileup
227 | infile="$idx $cBam $tBam"
228 | outfile="$tmpdir/mpileup"
229 | cmd="samtools mpileup -q 1 -B -l $white -f $infile > $outfile"
230 | runOrDie
231 | 
232 | 
233 | ntest=$(head -n 100000 $tmpdir/mpileup | cut -f3 | grep -c N)
234 | if  [ "$ntest" -eq "100000" ]; then
235 | 	graceful_death "it looks like the chromosome names in your bam files don't match the ones in the input genome"
236 | fi
237 | 
238 | # Varscan copynumber
239 | # must calculate data ratio from flagstat output
240 | # also must move to output dir to run this because varscan doesn't parse the output name
241 | dratio=
242 | if exists $tmpdir/control.flagstat && exists $tmpdir/tumor.flagstat ; then
243 | 	cnum=$(grep -m 1 mapped $tmpdir/control.flagstat | cut -f1 -d' ')
244 | 	tnum=$(grep -m 1 mapped $tmpdir/tumor.flagstat | cut -f1 -d' ')
245 | 	dratio=$(echo "scale=2;$cnum/$tnum" | bc)
246 | fi
247 | if [[ -z $dratio ]] && [ -z $DEBUG ]; then
248 | 	graceful_death "could not determine data ratio from $tmpdir/control.flagstat and $tmpdir/tumor.flagstat"
249 | fi 
250 | 
251 | pushd "$tmpdir" > /dev/null	# run inside the temp dir; infile/outfile below are relative to it
252 | vOptions='--min-segment-size 100 --mpileup 1'
253 | dr="--data-ratio $dratio"	# .88 works instead of 0.88
254 | infile="mpileup"
255 | outfile="output.copynumber"
256 | cmd="$varScan copynumber $infile output $vOptions $dr"	# output is base name, copynumber gets added as extension
257 | runOrDie
258 | popd > /dev/null	# back to the starting directory; removes the stack entry instead of swapping it
259 | 
260 | # From the output, drop any segment for which the tumor coverage (column 6) is less than 10
261 | # or the control coverage (column 5) is less than 20
262 | awk -v x=10 '$6 >= x' $tmpdir/output.copynumber | \
263 | awk -v x=20 '$5 >= x' > $tmpdir/output.copynumber.cov
264 | 
265 | 
266 | # Varscan copycaller
267 | infile="$tmpdir/output.copynumber.cov"
268 | outfile="$tmpdir/copyCalled"
269 | ccOptions="--output-file $outfile --output-homdel-file $outfile.homdel"
270 | cmd="$varScan copyCaller $infile $ccOptions"
271 | runOrDie
272 | 
273 | # Calculate recenter amount
274 | infile="$tmpdir/copyCalled"
275 | delta=$($findDelta $infile)
276 | if [ -z "$delta" ]; then
277 |     graceful_death "Could not find chr average, please make sure your bamfiles cover all chromosomes (samtools idxstats file.bam)"
278 | fi
279 | 
280 | # Rerun copycaller
281 | infile="$tmpdir/output.copynumber.cov"
282 | outfile="$tmpdir/copyCalled.recenter"
283 | ccOptions="--output-file $outfile --output-homdel-file $outfile.homdel"
284 | # recenter only when delta is below -0.2 or above 0.2; otherwise copyCalled.recenter is a link to copyCalled
285 | cmp=$(awk -v delta=$delta 'END{if (delta < -0.2) {print "lt"} else {if (delta > 0.2) {print "gt"} else {print "eq"}}}' < /dev/null)
286 | if [[ "$cmp" == "lt" ]]; then
287 |     rd=$(echo $delta | sed 's/-//')	# strip the minus sign before passing the value to --recenter-down
288 |     cmd="$varScan copyCaller $infile $ccOptions --recenter-down $rd"
289 |     runOrDie
290 | elif [[ "$cmp" == "gt" ]]; then
291 |     cmd="$varScan copyCaller $infile $ccOptions --recenter-up $delta"
292 |     runOrDie
293 | else
294 |     ln -sf copyCalled $tmpdir/copyCalled.recenter	# -f: replace a link left in a reused (-n) directory instead of failing
295 | fi
296 | 
297 | # add p and q to chromosome arms
298 | infile="$tmpdir/copyCalled.recenter"
299 | outfile="$tmpdir/copyCalled.recenter.sep"
300 | cmd="$separateArms $infile $cent > $outfile"
301 | runOrDie
302 | 
303 | # Circular binary segmentation
304 | infile="$tmpdir/copyCalled.recenter.sep"
305 | outfile="$tmpdir/copyCalled.recenter.sep.SD.2.5.dnacopy.out"
306 | cmd="Rscript $DNAcopy $infile 2.5 >/dev/null"
307 | runOrDie
308 | 
309 | # remove the arms and print to stdout
310 | sed 's/\.[pq]	/	/' $tmpdir/copyCalled.recenter.sep.SD.2.5.dnacopy.out | \
311 | 	sed "s/^sample/$sampleid/"
312 | 
313 | # clean up (this also empties and removes a directory given with -n, unless -d is set)
314 | if $cleanup; then
315 |     rm -- "$tmpdir"/*	# quoted so a temp path with spaces is not split into several arguments
316 |     rmdir -- "$tmpdir"
317 | fi
318 | 
319 | 


--------------------------------------------------------------------------------
/separateArms.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys, os, re, argparse
 4 | 
 5 | parser = argparse.ArgumentParser(description="Divide input copycaller file into p and q segments using an input bed file with centromere coordinates")
 6 | parser.add_argument('input', type=str,help="Tab separated copycaller file")
 7 | parser.add_argument('centro', type=str,help="Centromere bed file")
 8 | # optional argument
 9 | #parser.add_argument('-t', dest='targetfile', type=str, default=False,
10 | #        help="Input file with target genes and scores")
11 | # optional flag
12 | #parser.add_argument('-d', '--debug', help="Optional debugging output", action='store_true')
13 | 
14 | def myfunction(line):
15 |     """Template leftover, not called in this script: rebinds the local `line` without its last character and returns None."""
16 |     line = line[:-1]
17 | 
18 | class cent(object):
19 |     """centromere objects by chromosome"""
20 |     def __init__(self, line):
21 |         [id, start, end] = line.split("\t")
22 | 	self.name = id
23 |         self.p = int(start)
24 |         self.q = int(end)
25 | 
26 | def getChrom(id, centList, cur):
27 |     if cur and id == cur.name:
28 | 	return cur
29 |     for i in centList:
30 | 	if id == i.name:
31 | 	    return i
32 |     return False
33 | 
34 | if len(sys.argv)==1:
35 |     parser.print_help()
36 |     sys.exit(1)
37 | args = parser.parse_args()
38 | 
39 | # Main
40 | 
41 | centros=[]
42 | f = open(args.centro,'r')
43 | for line in f:
44 |     line = line.strip()
45 |     centobj = cent(line)
46 |     centros.append(centobj)
47 | f.close
48 | 
49 | curchrom = False
50 | f = open(args.input,'r')
51 | for line in f:
52 |     line = line.strip()
53 |     fields = line.split("\t")
54 |     if fields[0] == 'chrom':
55 | 	print line
56 | 	continue
57 |     id = fields[0]
58 |     start = int(fields[1])
59 |     end = int(fields[2])
60 |     curchrom = getChrom(id, centros, curchrom)
61 | #    print >>sys.stderr, id, curchrom
62 | #    if (not curchrom) or (not curchrom.name == fields[0]):
63 | #	curchrom = False
64 | #	for c in centros:
65 | #	    if c.name == id:
66 | #		curchrom = c
67 | #		print >>sys.stderr, "setting", curchrom
68 | #		break
69 |     # centromere free chromosomes
70 |     if not curchrom:
71 | 	print line
72 |     # print only if segment does not overlap centromere
73 |     elif end < curchrom.p:
74 | 	fields[0] = ('.').join([fields[0], 'p'])
75 | 	print ("\t").join(fields)
76 |     elif start > curchrom.q:
77 | 	fields[0] = ('.').join([fields[0], 'q'])
78 | 	print ("\t").join(fields)
79 | f.close
80 | 


--------------------------------------------------------------------------------
/src/cbsToBed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import sys, os, re, getopt
 4 | usage = sys.argv[0]+""" <file>
 5 | 
 6 | Create bed file based on Varscan CBS output with
 7 | segment scores converted to colors (red for amplified, blue for deleted)
 8 | 
 9 | Option: 
10 | 	-n Don't create bed header
11 | 	-c <float> cutoff for amplification or deletion (default 0.25)
12 | 
13 | """
14 | 
15 | 
16 | # Main
17 | # read in command line and options
18 | try:
19 |     opts, args = getopt.getopt(sys.argv[1:], "fdc:hbn")  # -f, -d and -b are accepted but have no effect below
20 | except getopt.GetoptError:
21 |     
22 |         # print help information and exit:
23 |     print usage
24 |     print "ERROR did not recognize input\n"
25 |     sys.exit(2)
26 | 
27 | makeBed = True  # not changed by any option
28 | head = True  # print the track header line unless -n is given
29 | val = 0.25  # cutoff for amplification/deletion; -c overrides it
30 | for o, a  in opts:
31 | #    if o == "-d":
32 | #        doNotDelete = True
33 |     if o == "-n":
34 |         head = False
35 |     if o == "-c":
36 |         val = float(a)
37 |     if o == "-h":
38 |         print usage
39 |         sys.exit()
40 | 
41 | 
42 | if len(args) != 1:  # exactly one input file is expected
43 |     sys.exit(usage)
44 | 
45 | # Run program
46 | 
47 | f = open(args[0],'r')
48 | counter = 0
49 | 
50 | if makeBed and head:
51 |     print 'track name=%s description="%s" itemRgb="On"'% (args[0], args[0])
52 | for line in f:
53 |     line = line.strip()
54 |     fields = line.split("\t")
55 |     if fields[0] == 'ID':
56 | 	continue
57 |     counter+=1
58 |     id=('.').join(['mrg', str(counter)])
59 |     qualifier = "neutral"
60 |     rgb='0,0,0'
61 |     if(float(fields[5]) < -val):
62 |         rgb='0,0,255'	# blue
63 |     elif(float(fields[5]) > val):
64 |         rgb='255,0,0'	# red
65 |     else:
66 |         rgb='0,0,0'	# black
67 |     chr = ("").join(["chr", fields[1]])
68 |     outstring = ("\t").join([chr, fields[2], fields[3], id, '0', '.', fields[2], fields[2], rgb ])
69 |     print outstring
70 | f.close()
71 | 
72 | 
73 | 


--------------------------------------------------------------------------------