├── DNAcopy_1.48.0.tar.gz ├── Dockerfile ├── Dockstore.cwl ├── README.md ├── basicDNAcopy.R ├── meanLogRatioByChromosome.py ├── run_varscan ├── separateArms.py └── src └── cbsToBed.py /DNAcopy_1.48.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Jeltje/varscan2/c57bd2b0cba5168fc8b7a64f30cbbc9fd4608e22/DNAcopy_1.48.0.tar.gz -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:14.04 2 | 3 | MAINTAINER Jeltje van Baren, jeltje.van.baren@gmail.com 4 | 5 | # create a working directory and work from there 6 | RUN mkdir /tmp/install 7 | WORKDIR /tmp/install 8 | 9 | RUN apt-get update && apt-get install -y \ 10 | gcc \ 11 | make \ 12 | zlib1g-dev \ 13 | git \ 14 | wget \ 15 | python-numpy \ 16 | default-jre \ 17 | r-base \ 18 | bc 19 | 20 | # DNAcopy version keeps changing so deprecated this: 21 | # R and DNAcopy package (move to R library location) 22 | #RUN apt-get install -y r-base 23 | #RUN wget http://www.bioconductor.org/packages/release/bioc/src/contrib/DNAcopy_1.40.0.tar.gz 24 | # instead: 25 | COPY ./DNAcopy_1.48.0.tar.gz ./ 26 | RUN R CMD INSTALL DNAcopy_1.48.0.tar.gz 27 | 28 | # Samtools 0.1.18 - note: 0.1.19 and 1.1 do NOT work, VarScan copynumber dies on the mpileup 29 | RUN wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.18/samtools-0.1.18.tar.bz2 30 | RUN tar -xvf samtools-0.1.18.tar.bz2 31 | # the make command generates a lot of warnings, none of them relevant to the final samtools code, hence 2>/dev/null 32 | RUN (cd samtools-0.1.18/ && make DFLAGS='-D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE -D_USE_KNETFILE -D_CURSES_LIB=0' LIBCURSES='' 2>/dev/null && mv samtools /usr/local/bin) 33 | 34 | # get varscan 35 | RUN wget -O /usr/local/bin/VarScan.jar https://github.com/dkoboldt/varscan/releases/download/2.4.2/VarScan.v2.4.2.jar 36 | 37 | # Move wrapper and helper scripts to same location 38 | ADD ./run_varscan /usr/local/bin/ 39 | ADD ./separateArms.py /usr/local/bin/ 40 | ADD ./basicDNAcopy.R /usr/local/bin/ 41 | ADD ./meanLogRatioByChromosome.py /usr/local/bin/ 42 | 43 | # Set WORKDIR to /data -- predefined mount location. 44 | RUN mkdir /data 45 | WORKDIR /data 46 | 47 | # And clean up 48 | RUN apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/install 49 | 50 | ENTRYPOINT ["bash", "/usr/local/bin/run_varscan"] 51 | 52 | 53 | -------------------------------------------------------------------------------- /Dockstore.cwl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env cwl-runner 2 | 3 | class: CommandLineTool 4 | id: "Varscan2" 5 | label: "Varscan2 workflow" 6 | cwlVersion: v1.0 7 | description: | 8 | A Docker container for a Varscan2 workflow. See the [github repo](https://github.com/Jeltje/varscan2) for more information. 9 | ``` 10 | Usage: 11 | # fetch CWL 12 | $> dockstore cwl --entry quay.io/jeltje/varscan2:v1.0.2 > Dockstore.cwl 13 | # make a runtime JSON template and edit it (or use the content of sample_configs.json in this git repo) 14 | $> dockstore convert cwl2json --cwl Dockstore.cwl > Dockstore.json 15 | # run it locally with the Dockstore CLI 16 | $> dockstore launch --entry quay.io/jeltje/varscan2:v1.0.2 \ 17 | --json Dockstore.json 18 | ``` 19 | 20 | dct:creator: 21 | "@id": "jeltje" 22 | foaf:name: Jeltje van Baren 23 | foaf:mbox: "mailto:jeltje.van.baren@gmail.com" 24 | 25 | requirements: 26 | - class: DockerRequirement 27 | dockerPull: "quay.io/jeltje/varscan2:v1.0.2" 28 | 29 | hints: 30 | - class: ResourceRequirement 31 | coresMin: 1 32 | ramMin: 4092 33 | outdirMin: 512000 34 | doc: "the process requires at least 4G of RAM" 35 | inputs: 36 | - id: "#genome" 37 | type: File 38 | doc: "Genome fasta" 39 | format: "http://edamontology.org/format_1929" 40 | inputBinding: 41 | prefix: -i 42 | secondaryFiles: 43 | - .fai 44 | 45 | - id: "#centromeres" 46 | type: File 47 | doc: "Centromere bed file" 48 | format: "http://edamontology.org/format_3003" 49 | inputBinding: 50 | prefix: -b 51 | 52 | - id: "#targets" 53 | type: File 54 | doc: "Exome Targets bed file" 55 | format: "http://edamontology.org/format_3003" 56 | inputBinding: 57 | prefix: -w 58 | 59 | - id: "#control_bam_input" 60 | type: File 61 | doc: "The control exome BAM file used as input, it must be sorted." 62 | format: "http://edamontology.org/format_2572" 63 | inputBinding: 64 | prefix: -c 65 | 66 | - id: "#tumor_bam_input" 67 | type: File 68 | doc: "The tumor exome BAM file used as input, it must be sorted." 69 | format: "http://edamontology.org/format_2572" 70 | inputBinding: 71 | prefix: -t 72 | 73 | - id: "#sample_id" 74 | type: string 75 | doc: "sample ID to use in output" 76 | inputBinding: 77 | prefix: -q 78 | 79 | 80 | stdout: output.cnv 81 | 82 | outputs: 83 | - id: output 84 | type: stdout 85 | 86 | baseCommand: ["-s", "/var/spool/cwl"] 87 | 88 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Varscan2 2 | 3 | **This repository contains code to create a docker implementation of the Varscan2.4.2 copynumber variation (CNV) caller.** 4 | 5 | Varscan2 was developed by Dan Koboldt (see References below). It can be used to detect copy number variation (CNV) in sample pairs, usually exomes from a tumor and control from one patient. 6 | 7 | The Varscan2 executable (https://github.com/dkoboldt/varscan.git) combines several tools. It is meant to be run in a pipeline, during which different tools are called in sequence. For details on Varscan, see http://dkoboldt.github.io/varscan/ 8 | 9 | 10 | This repository ONLY contains a pipeline for Varscan2 **copynumber variation**. If you want to run other Varscan tools, please use Varscan2 directly. This docker container contains a wrapper script that uses Varscan tools and other programs *with specific parameters*. These may not be the perfect parameters for your particular samples. See below for the full list of pipeline steps. 11 | 12 | **Inputs** to the program are a tumor/control pair of BAM files and several [bed format](https://genome.ucsc.edu/FAQ/FAQformat#format1) helper files (see below). Your input bam files must be sorted by coordinate (try `samtools sort`) 13 | **Output** is a file with chromosome segments that are scored for amplification or deletion. 14 | 15 | To get per-gene output, these scores must be mapped to an annotation, for example using [this program] (https://github.com/Jeltje/cnvtogenes) 16 | 17 | ## The code 18 | 19 | The Varscan wrapper script runs the following: 20 | 21 | 1. samtools flagstat on each bam file 22 | 2. samtools mpileup on both bam files 23 | 3. Determine unique mapped read ratio 24 | 4. Varscan copynumber 25 | 5. Remove low coverage regions 26 | 6. Varscan copyCaller 27 | 7. Calculate median for recentering 28 | 8. Varscan copyCaller recenter 29 | 9. Separate chromosome arms 30 | 10. DNAcopy (CBS) 31 | 11. Merge chromosome arms 32 | 33 | The chromosome arms are separated before the Circular Binary Segmentation (CBS) step to avoid making calls across centromeres. 34 | 35 | ## Getting the docker container 36 | 37 | The latest Varscan docker image can be downloaded directly from quay.io using 38 | `docker pull quay.io/jeltje/varscan2` 39 | 40 | Alternatively, you can build from the github repo: 41 | ``` 42 | git clone https://github.com/jeltje/varscan2.git 43 | cd varscan2 44 | docker build -t jeltje/varscan2 . 45 | ``` 46 | 47 | ## Running the docker container 48 | 49 | For details on running docker containers in general, see the excellent tutorial at https://docs.docker.com/ 50 | 51 | To see a usage statement, run 52 | 53 | ``` 54 | docker run jeltje/varscan2 -h 55 | ``` 56 | 57 | ### Example input: 58 | 59 | ``` 60 | docker run -v /path/to/input/files:/data jeltje/varscan2 -c normal.bam -t tumor.bam -q sampleid -i genome.fa -b centromeres.bed -w targets.bed -s tmpdir > varscan.cnv 61 | 62 | ``` 63 | 64 | where 65 | 66 | `normal.bam` and `tumor.bam` are BAM format files of exome reads aligned to the genome. 67 | 68 | `sampleid` is an identifier for the patient. This will be used in the output. 69 | 70 | `genome.fa` is a fasta file containing the genome that was used to create the BAM files. A samtools indexed `.fai` file must be present in the same directory as this file (for details see Other Considerations, below) 71 | 72 | `centromeres.bed` is a [bed format file](https://genome.ucsc.edu/FAQ/FAQformat#format1) containing centromere locations. This list is used to remove centromeres from the CBS calls. 73 | 74 | `targets.bed` is a list of exome targets in bed format. This is used as a 'whitelist' of genome regions so that off target alignments will not be used for analysis 75 | 76 | `tmpdir` is a directory for temporary output files. If you set option -d, these files will be kept 77 | 78 | Keep in mind that all these file locations must be with respect to your `/path/to/input/files`. 79 | 80 | Centromeres for hg19 are provided ind the `/data` directory 81 | 82 | > You can find centromere locations for genomes via 83 | > http://genome.ucsc.edu/cgi-bin/hgTables 84 | > Using the following selections: 85 | > - group: Mapping and Sequencing 86 | > - track:gap 87 | > - filter - goes to new page, look for 'type does match' and type centromere, submit 88 | > - output format: bed 89 | > Submit, on the next page just press Get Bed 90 | 91 | 92 | ## Output 93 | 94 | Output is written to `STDOUT` and uses the following format: 95 | ``` 96 | sampleID chrom loc.start loc.end num.mark seg.mean 97 | 98 | ``` 99 | 100 | To get amplified or deleted segments from this file, a threshold must be applied. This is often set to `0.25/-0.25`, 101 | and with a minimum number of 10 markers per segment. 102 | 103 | 104 | ## Other considerations 105 | 106 | Tumor and control really must be from the same patient and processed in the same experiment. Batch effects are strong in exome experiments and using the wrong control renders Varscan output meaningless. 107 | 108 | To index a genome, run 109 | ``` 110 | samtools faidx 111 | ``` 112 | This creates an index named genome.fa.fai 113 | The genome and the index must be in the same directory, and the genome file (not the index) is the input to run_varscan 114 | 115 | The whitelist is a bed format file with the exome targets used in the experiment. It ensures that Varscan only uses target regions for its analysis and not any off target read matches. It is important to use the real list of exome targets. For meaningful results do not use a generic list. 116 | 117 | 118 | ## References 119 | 120 | Koboldt DC, Zhang Q, Larson DE, Shen D, McLellan MD, Lin L, Miller CA, Mardis ER, Ding L, Wilson RK. 121 | VarScan 2: somatic mutation and copy number alteration discovery in cancer by exome sequencing. 122 | Genome Res. 2012 Mar;22(3):568-76. doi: 10.1101/gr.129684.111. 123 | -------------------------------------------------------------------------------- /basicDNAcopy.R: -------------------------------------------------------------------------------- 1 | library(DNAcopy) 2 | 3 | args <- commandArgs(TRUE) 4 | 5 | # arguments are input and SD 6 | 7 | # Alway use the same random seed for reproducible results 8 | set.seed(0xcafe) # cafe is a hex number 9 | 10 | cn <- read.table(args[1],header=TRUE) 11 | sd <- as.double(args[2]) 12 | CNA.object <-CNA(genomdat = cn$adjusted_log_ratio, chrom = cn$chrom, maploc = cn$chr_stop, 13 | data.type = 'logratio', sampleid = "sample") 14 | 15 | smoothed.CNA.object <- smooth.CNA(CNA.object) 16 | 17 | segment.smoothed.CNA.object <- segment(smoothed.CNA.object, undo.splits="sdundo", undo.SD=sd, verbose=1) 18 | p.segment.smoothed.CNA.object <- segments.p(segment.smoothed.CNA.object) 19 | 20 | outfile <- paste(args[1], "SD", sd, "dnacopy.out", sep=".") 21 | 22 | write.table(p.segment.smoothed.CNA.object[,1:6], file=outfile, quote=F, row.names=F, sep="\t") 23 | 24 | detach(package:DNAcopy) 25 | -------------------------------------------------------------------------------- /meanLogRatioByChromosome.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys, os, re, argparse 4 | from numpy import * 5 | 6 | parser = argparse.ArgumentParser(description="Calculates log average per chromosome") 7 | parser.add_argument('cpcalled', type=str,help="copyCalled output") 8 | 9 | class Chrom(object): 10 | """Holds fragment scores by chromosomes """ 11 | def __init__(self, chrom, score, endpos): 12 | self.frags = [] 13 | self.chrom = chrom 14 | self.add(chrom, score, endpos) 15 | def add(self, chrom, score, endpos): 16 | if self.chrom == chrom: 17 | self.frags.append(score) 18 | self.lastfrag = endpos 19 | return True 20 | else: 21 | return False 22 | def stats(self): 23 | self.mean = mean(self.frags) 24 | self.std = std(self.frags) 25 | 26 | 27 | if len(sys.argv)==1: 28 | parser.print_help() 29 | sys.exit(1) 30 | args = parser.parse_args() 31 | 32 | # Main 33 | chromTable = [] # holds chrom objects 34 | curChrom = Chrom('empty', 0, 0) 35 | f = open(args.cpcalled,'r') 36 | for line in f: 37 | line = line.strip() 38 | fields = line.split("\t") 39 | chr = fields[0] 40 | if chr == "chrom": 41 | continue 42 | score = float(fields[6]) 43 | endpos = int(fields[2]) 44 | if not (curChrom.add(chr, score, endpos)): 45 | curChrom = Chrom(chr, score, endpos) 46 | chromTable.append(curChrom) 47 | f.close() 48 | 49 | if len(chromTable) < 3: 50 | print >>sys.stderr, "ERROR: Please enter whole genome file" 51 | sys.exit(1) 52 | 53 | means = [] 54 | for chr in chromTable: 55 | if chr.lastfrag < 60000000: # skip MT, GL, Y 56 | # if len(chr.frags) < 3000: # skip MT, GL, Y 57 | continue 58 | chr.stats() 59 | means.append(chr.mean) 60 | 61 | med = len(means)/2 # this rounds, which is perfect 62 | means.sort() 63 | print "%.2f" % mean([means[med-1], means[med], means[med+1]]) 64 | 65 | -------------------------------------------------------------------------------- /run_varscan: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | print_usage(){ 4 | >&2 cat < -t -q -i -b -w -s 6 | Wrapper script for Varscan2 7 | Runs the following steps: 8 | 1. samtools flagstat on each bam file 9 | 2. samtools mpileup on both bam files 10 | 3. determine unique mapped read ratio 11 | 4. VarScan copynumber 12 | 5. Remove low coverage regions 13 | 6. VarScan copyCaller 14 | 7. calculate median for recentering 15 | 8. VarScan copyCaller recenter 16 | 9. Separate chromosome arms 17 | 10. DNAcopy 18 | 11. Merge chromosome arms 19 | 20 | OPTIONS: 21 | -h Show this message 22 | -t tumor bam file 23 | -c control bam file 24 | -q optional sample ID, eg TCGA-001-4-2018 25 | -i path to indexed genome (index using samtools faidx) 26 | -b centromere locations (bed format) 27 | -w exome whitelist (bed format) 28 | -s directory for temporary files. The script creates a directory varscan.N inside. 29 | -d do not delete temporary output 30 | -n instead of creating a new temporary directory, use this one 31 | 32 | EOF 33 | } 34 | 35 | cBam='False' 36 | tBam='False' 37 | sampleid="sample" 38 | idx='False' 39 | cent='False' 40 | white='False' 41 | scratch= 42 | prevDir= 43 | cleanup=true 44 | 45 | while getopts "ht:c:q:i:b:w:s:n:a:d" OPTION 46 | do 47 | case $OPTION in 48 | h) 49 | print_usage 50 | exit 51 | ;; 52 | t) 53 | tBam=$OPTARG 54 | ;; 55 | c) 56 | cBam=$OPTARG 57 | ;; 58 | q) 59 | sampleid=$OPTARG 60 | ;; 61 | i) 62 | idx=$OPTARG 63 | ;; 64 | b) 65 | cent=$OPTARG 66 | ;; 67 | w) 68 | white=$OPTARG 69 | ;; 70 | s) 71 | scratch=$OPTARG 72 | ;; 73 | d) 74 | cleanup=false 75 | ;; 76 | n) 77 | prevDir=$OPTARG 78 | ;; 79 | ?) 80 | print_usage 81 | exit 82 | ;; 83 | esac 84 | done 85 | 86 | graceful_death() { 87 | >&2 echo "ERROR: Cannot finish $0 because $1"; 88 | exit 1 89 | } 90 | 91 | # Check if all file arguments have been given and are valid 92 | file_check() { 93 | if [ $1 == 'False' ]; then 94 | print_usage 95 | graceful_death "some input arguments are missing" 96 | fi 97 | if [[ ! -e "$1" ]]; then 98 | print_usage 99 | graceful_death "can't find $1" 100 | fi 101 | } 102 | 103 | for i in $cBam $tBam $idx $cent $white; do 104 | file_check $i 105 | done 106 | 107 | # Sanity check 108 | tmpdir= 109 | if [[ -z "$prevDir" ]] && [[ -z $scratch ]]; then 110 | graceful_death "Please give either the -n OR -s option" 111 | fi 112 | 113 | # select correct temp dir 114 | if [[ -z "$prevDir" ]]; then 115 | if [ ! -d "$scratch" ]; then 116 | graceful_death "cannot find scratch output dir $scratch" 117 | fi 118 | tmpExt=$RANDOM 119 | tmpdir="$scratch/varscan.$tmpExt" 120 | mkdir -p $tmpdir 121 | else 122 | if [ ! -d "$prevDir" ]; then 123 | graceful_death "cannot find previous run directory $prevDir" 124 | fi 125 | tmpdir=$prevDir 126 | fi 127 | >&2 echo "Output files will be stored in $tmpdir" 128 | 129 | # Make sure all inputs are for the same genome 130 | >&2 echo "Checking inputs..." 131 | 132 | # Check that the bam files are sorted 133 | issort(){ 134 | didsort=$(samtools view -H $1 | grep ^@HD | cut -f3) 135 | if [[ "$didsort" != 'SO:coordinate' ]]; then 136 | graceful_death "it looks like $1 is not sorted by coordinate, please run samtools sort" 137 | fi 138 | } 139 | issort $tBam 140 | issort $cBam 141 | 142 | contain(){ 143 | if [[ $(comm -23 $1 $2) ]]; then 144 | graceful_death "some or all of the chromosomes in the input $3 cannot be found in the genome fasta; please make sure all your inputs are for the same genome version" 145 | fi 146 | } 147 | 148 | samtools view -H $tBam | grep ^@SQ | cut -f2 | cut -f2 -d':' | sort > $tmpdir/bam.chrs 149 | cut -f1 $cent | sort > $tmpdir/cent.chrs 150 | cut -f1 $white | sort -u > $tmpdir/target.chrs 151 | grep '>' $idx | sed 's/\s.*//' | sed 's/>//' | sort > $tmpdir/geno.chrs 152 | 153 | # all bam chrs, and all bed chromosomes should be in the genome (but not vice versa) 154 | contain $tmpdir/bam.chrs $tmpdir/geno.chrs $tBam 155 | contain $tmpdir/cent.chrs $tmpdir/geno.chrs $cent 156 | contain $tmpdir/target.chrs $tmpdir/geno.chrs $white 157 | 158 | # checks if a file exists and has more than one line in it 159 | # several programs in this wrapper will output a single line if they fail 160 | exists(){ 161 | if [ -e "$1" ] 162 | then 163 | ct=$(head -n 2 $1 | wc -l | cut -f1 -d' ') 164 | if [ "$ct" -eq "2" ]; then 165 | return 0 166 | else 167 | return 1 168 | fi 169 | else 170 | return 1 171 | fi 172 | } 173 | 174 | # runOrDie gets its variables directly from MAIN 175 | runOrDie(){ 176 | if exists "$outfile" ; then 177 | return 0 # nothing to be done 178 | fi 179 | for file in $infile; do 180 | ext=$(echo $file | sed "s/.*\.//"); 181 | [ "$ext" == "bam" ] && continue # do not check bam files again 182 | if ! exists "$file" && [ -z $DEBUG ]; then 183 | graceful_death "cannot run $cmd: missing or corrupted $infile" 184 | fi 185 | done 186 | >&2 echo $cmd 187 | if [[ -z $DEBUG ]]; then 188 | date >&2 189 | eval $cmd 190 | if ! exists "$outfile" ; then 191 | graceful_death "failed to find $outfile" 192 | fi 193 | fi 194 | } 195 | 196 | 197 | 198 | 199 | # correct version of samtools? 200 | cmd="samtools 2>&1 | grep Version | cut -f2 -d' '" 201 | sVersion=$(eval $cmd) 202 | if [ $sVersion != "0.1.18" ]; then 203 | graceful_death "wrong samtools version: expected 0.1.18, got $sVersion" 204 | fi 205 | 206 | # find location of run script so we can get the other necessary scripts 207 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 208 | DNAcopy=$DIR/basicDNAcopy.R 209 | findDelta=$DIR/meanLogRatioByChromosome.py 210 | separateArms=$DIR/separateArms.py 211 | varScan="nice java -Xmx2048m -jar $DIR/VarScan.jar" 212 | 213 | ########## MAIN ################ 214 | 215 | # Samtools flagstat 216 | infile="$cBam" 217 | outfile="$tmpdir/control.flagstat" 218 | cmd="samtools flagstat $infile > $outfile" 219 | runOrDie 220 | 221 | infile=$tBam 222 | outfile="$tmpdir/tumor.flagstat" 223 | cmd="samtools flagstat $infile > $outfile" 224 | runOrDie 225 | 226 | # Samtools mpileup 227 | infile="$idx $cBam $tBam" 228 | outfile="$tmpdir/mpileup" 229 | cmd="samtools mpileup -q 1 -B -l $white -f $infile > $outfile" 230 | runOrDie 231 | 232 | 233 | ntest=$(head -n 100000 $tmpdir/mpileup | cut -f3 | grep -c N) 234 | if [ "$ntest" -eq "100000" ]; then 235 | graceful_death "it looks like the chromosome names in your bam files don't match the ones in the input genome" 236 | fi 237 | 238 | # Varscan copynumber 239 | # must calculate data ratio from flagstat output 240 | # also must move to output dir to run this because varscan doesn't parse the output name 241 | dratio= 242 | if exists $tmpdir/control.flagstat && exists $tmpdir/tumor.flagstat ; then 243 | cnum=$(grep -m 1 mapped $tmpdir/control.flagstat | cut -f1 -d' ') 244 | tnum=$(grep -m 1 mapped $tmpdir/tumor.flagstat | cut -f1 -d' ') 245 | dratio=$(echo "scale=2;$cnum/$tnum" | bc) 246 | fi 247 | if [[ -z $dratio ]] && [ -z $DEBUG ]; then 248 | graceful_death "could not determine data ratio from $tmpdir/control.flagstat and $tmpdir/tumor.flagstat" 249 | fi 250 | 251 | pushd $tmpdir > /dev/null 252 | vOptions='--min-segment-size 100 --mpileup 1' 253 | dr="--data-ratio $dratio" # .88 works instead of 0.88 254 | infile="mpileup" 255 | outfile="output.copynumber" 256 | cmd="$varScan copynumber $infile output $vOptions $dr" # output is base name, copynumber gets added as extension 257 | runOrDie 258 | pushd > /dev/null 259 | 260 | # From the output, filter any segments for which the tumor coverage is less than 10 261 | # and the control coverage is less than 20 262 | awk -v x=10 '$6 >= x' $tmpdir/output.copynumber | \ 263 | awk -v x=20 '$5 >= x' > $tmpdir/output.copynumber.cov 264 | 265 | 266 | # Varscan copycaller 267 | infile="$tmpdir/output.copynumber.cov" 268 | outfile="$tmpdir/copyCalled" 269 | ccOptions="--output-file $outfile --output-homdel-file $outfile.homdel" 270 | cmd="$varScan copyCaller $infile $ccOptions" 271 | runOrDie 272 | 273 | # Calculate recenter amount 274 | infile="$tmpdir/copyCalled" 275 | delta=$($findDelta $infile) 276 | if [ -z "$delta" ]; then 277 | graceful_death "Could not find chr average, please make sure your bamfiles cover all chromosomes (samtools idxstats file.bam)" 278 | fi 279 | 280 | # Rerun copycaller 281 | infile="$tmpdir/output.copynumber.cov" 282 | outfile="$tmpdir/copyCalled.recenter" 283 | ccOptions="--output-file $outfile --output-homdel-file $outfile.homdel" 284 | 285 | cmp=$(awk -v delta=$delta 'END{if (delta < -0.2) {print "lt"} else {if (delta > 0.2) {print "gt"} else {print "eq"}}}' < /dev/null) 286 | if [[ "$cmp" == "lt" ]]; then 287 | rd=$(echo $delta | sed 's/-//') 288 | cmd="$varScan copyCaller $infile $ccOptions --recenter-down $rd" 289 | runOrDie 290 | elif [[ "$cmp" == "gt" ]]; then 291 | cmd="$varScan copyCaller $infile $ccOptions --recenter-up $delta" 292 | runOrDie 293 | else 294 | ln -s copyCalled $tmpdir/copyCalled.recenter 295 | fi 296 | 297 | # add p and q to chromosome arms 298 | infile="$tmpdir/copyCalled.recenter" 299 | outfile="$tmpdir/copyCalled.recenter.sep" 300 | cmd="$separateArms $infile $cent > $outfile" 301 | runOrDie 302 | 303 | # Circular binary segmentation 304 | infile="$tmpdir/copyCalled.recenter.sep" 305 | outfile="$tmpdir/copyCalled.recenter.sep.SD.2.5.dnacopy.out" 306 | cmd="Rscript $DNAcopy $infile 2.5 >/dev/null" 307 | runOrDie 308 | 309 | # remove the arms and print to stdout 310 | sed 's/\.[pq] / /' $tmpdir/copyCalled.recenter.sep.SD.2.5.dnacopy.out | \ 311 | sed "s/^sample/$sampleid/" 312 | 313 | # clean up 314 | if $cleanup; then 315 | rm $tmpdir/* 316 | rmdir $tmpdir 317 | fi 318 | 319 | -------------------------------------------------------------------------------- /separateArms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys, os, re, argparse 4 | 5 | parser = argparse.ArgumentParser(description="Divide input copycaller file into p and q segments using an input bed file with centromere coordinates") 6 | parser.add_argument('input', type=str,help="Tab separated copycaller file") 7 | parser.add_argument('centro', type=str,help="Centromere bed file") 8 | # optional argument 9 | #parser.add_argument('-t', dest='targetfile', type=str, default=False, 10 | # help="Input file with target genes and scores") 11 | # optional flag 12 | #parser.add_argument('-d', '--debug', help="Optional debugging output", action='store_true') 13 | 14 | def myfunction(line): 15 | """DESCRIBE ME""" 16 | line = line[:-1] 17 | 18 | class cent(object): 19 | """centromere objects by chromosome""" 20 | def __init__(self, line): 21 | [id, start, end] = line.split("\t") 22 | self.name = id 23 | self.p = int(start) 24 | self.q = int(end) 25 | 26 | def getChrom(id, centList, cur): 27 | if cur and id == cur.name: 28 | return cur 29 | for i in centList: 30 | if id == i.name: 31 | return i 32 | return False 33 | 34 | if len(sys.argv)==1: 35 | parser.print_help() 36 | sys.exit(1) 37 | args = parser.parse_args() 38 | 39 | # Main 40 | 41 | centros=[] 42 | f = open(args.centro,'r') 43 | for line in f: 44 | line = line.strip() 45 | centobj = cent(line) 46 | centros.append(centobj) 47 | f.close 48 | 49 | curchrom = False 50 | f = open(args.input,'r') 51 | for line in f: 52 | line = line.strip() 53 | fields = line.split("\t") 54 | if fields[0] == 'chrom': 55 | print line 56 | continue 57 | id = fields[0] 58 | start = int(fields[1]) 59 | end = int(fields[2]) 60 | curchrom = getChrom(id, centros, curchrom) 61 | # print >>sys.stderr, id, curchrom 62 | # if (not curchrom) or (not curchrom.name == fields[0]): 63 | # curchrom = False 64 | # for c in centros: 65 | # if c.name == id: 66 | # curchrom = c 67 | # print >>sys.stderr, "setting", curchrom 68 | # break 69 | # centromere free chromosomes 70 | if not curchrom: 71 | print line 72 | # print only if segment does not overlap centromere 73 | elif end < curchrom.p: 74 | fields[0] = ('.').join([fields[0], 'p']) 75 | print ("\t").join(fields) 76 | elif start > curchrom.q: 77 | fields[0] = ('.').join([fields[0], 'q']) 78 | print ("\t").join(fields) 79 | f.close 80 | -------------------------------------------------------------------------------- /src/cbsToBed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys, os, re, getopt 4 | usage = sys.argv[0]+""" 5 | 6 | Create bed file based on Varscan CBS output with 7 | segment scores converted to colors (red for amplified, blue for deleted) 8 | 9 | Option: 10 | -n Don't create bed header 11 | -c cutoff for amplification or deletion (default 0.25) 12 | 13 | """ 14 | 15 | 16 | # Main 17 | # read in command line and options 18 | try: 19 | opts, args = getopt.getopt(sys.argv[1:], "fdc:hbn") 20 | except getopt.GetoptError: 21 | 22 | # print help information and exit: 23 | print usage 24 | print "ERROR did not recognize input\n" 25 | sys.exit(2) 26 | 27 | makeBed = True 28 | head = True 29 | val = 0.25 30 | for o, a in opts: 31 | # if o == "-d": 32 | # doNotDelete = True 33 | if o == "-n": 34 | head = False 35 | if o == "-c": 36 | val = float(a) 37 | if o == "-h": 38 | print usage 39 | sys.exit() 40 | 41 | 42 | if len(args) != 1: 43 | sys.exit(usage) 44 | 45 | # Run program 46 | 47 | f = open(args[0],'r') 48 | counter = 0 49 | 50 | if makeBed and head: 51 | print 'track name=%s description="%s" itemRgb="On"'% (args[0], args[0]) 52 | for line in f: 53 | line = line.strip() 54 | fields = line.split("\t") 55 | if fields[0] == 'ID': 56 | continue 57 | counter+=1 58 | id=('.').join(['mrg', str(counter)]) 59 | qualifier = "neutral" 60 | rgb='0,0,0' 61 | if(float(fields[5]) < -val): 62 | rgb='0,0,255' # blue 63 | elif(float(fields[5]) > val): 64 | rgb='255,0,0' # red 65 | else: 66 | rgb='0,0,0' # black 67 | chr = ("").join(["chr", fields[1]]) 68 | outstring = ("\t").join([chr, fields[2], fields[3], id, '0', '.', fields[2], fields[2], rgb ]) 69 | print outstring 70 | f.close() 71 | 72 | 73 | --------------------------------------------------------------------------------