├── CNV_PDX
│   ├── ASCAT_single_tool.xml
│   ├── annotation_segments_tool.xml
│   ├── bin
│   │   ├── ensemblegenes_cnv_break.pl
│   │   ├── get_model_gender.py
│   │   ├── lrrbaf_ascat_tumor.R
│   │   └── segment_raw_extend.pl
│   ├── gender_single_tool.xml
│   ├── lrrbaf_tool.xml
│   └── single_sample_cnv_snparray.xml
├── CTP_PDX
│   ├── XenomeSingleSample_PDX_Panel.xml
│   ├── aggregate_stats_updated.xml
│   ├── bin
│   │   ├── aggregate_stats_updated.py
│   │   ├── allele_depth_min_and_AF_from_ADs.py
│   │   ├── caller_add_pindel.sh
│   │   ├── clean_intergenic_region_gene_names.py
│   │   ├── coveragecalculator.py
│   │   ├── filter_dna_coverage.py
│   │   ├── filter_for_minimum_depth
│   │   ├── filter_trim.py
│   │   └── read_group_from_fastq.py
│   ├── bwa_mem.xml
│   ├── config_file_SingleSample_PDX_Panel
│   ├── gatkcoveragestats.xml
│   ├── microIndel_calling.xml
│   ├── qual_statistics.xml
│   ├── removeFiles.xml
│   ├── variant_annotation.xml
│   ├── variant_calling.xml
│   ├── variant_filtration.xml
│   ├── variant_filtration_pindel.xml
│   ├── variant_pre_proc_1.xml
│   ├── variant_pre_proc_2.xml
│   ├── variant_pre_proc_3.xml
│   └── xenome_classification_DNA.xml
├── LICENSE.md
├── README.md
└── RNA_PDX
    ├── XenomeRnaSeqSingleSamplePE.xml
    ├── add_gene_name_normalization_out.xml
    ├── bin
    │   ├── GeneName_and_Normalization_without_UCSC.pl
    │   ├── filter_rna_coverage.py
    │   ├── filter_trim.py
    │   ├── lymphoma_classifier.py
    │   ├── read_group_from_fastq.py
    │   └── summary_QC_metrics.pl
    ├── classifier_and_coverage.xml
    ├── picard_alignment_metrics.xml
    ├── qual_statistics.xml
    ├── read_group.xml
    ├── rsem_alignment.xml
    ├── summary_metrics.xml
    └── xenome_classification_RNA.xml
/CNV_PDX/ASCAT_single_tool.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 |
7 | ASCAT 2.4 Single sample
8 |
9 |
22 |
23 |
24 | R/3.1.1
25 |
26 |
27 |
28 |
29 | -f1-5 {in_2} > {out_6}
30 |
31 |
32 |
33 | CMD BATCH --slave "--args {in_1} {in_3}" {run_ascat_single}
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/CNV_PDX/annotation_segments_tool.xml:
--------------------------------------------------------------------------------
1 |
6 |
7 |
8 | Annotate ASCAT segments with LOH, chromosome arm fraction, ploidy.
9 | Annotate ensembl genes with copy number (CN).
10 | Rename relevant files with sample name.
11 |
12 |
13 |
26 |
27 |
29 |
30 |
32 |
33 |
34 |
35 |
36 | {segment_ploidy} {in_1} {in_2} {in_5} {in_7}
37 |
38 |
39 |
40 | {segment_gene} {out_1} {in_4}
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/CNV_PDX/bin/ensemblegenes_cnv_break.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use POSIX;
3 | use File::Basename;
4 |
5 | # This script annotates ensembl genes with copy number and breakpoints
6 | # nohup perl ensemblegenes_cnv_break.pl *.segments_raw.extend.txt mart_export_gene_chr1-Y.hg19ensembl75-85.08232016.txt
7 |
8 | if ($#ARGV != 1) {
9 | print "This scripts requires: \n";
10 | exit(-1);
11 | }
12 |
13 | $file_cn = $ARGV[0];
14 | $file_gene = $ARGV[1];
15 |
16 | $file_output = basename($file_cn,".txt").".ensgene_cnvbreak.txt";
17 | open(OUTFILE, ">$file_output");
18 |
19 | open(GENEFILE, "$file_gene") or die "can't open $file_gene: $!";
20 | $gene = <GENEFILE>;
21 | chomp($gene);
22 |
23 | open(CNFILE, "$file_cn") or die "can't open $file_cn: $!";
24 | @data = <CNFILE>;
25 | close(CNFILE);
26 | chomp(@data);
27 |
28 | #print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n";
29 | print OUTFILE "$gene\tnum_cnv_seg\tseg_desc\tploidy\tnMajor\tnMinor\tnAraw\tnBraw\tCN_raw\tLOH\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\tnMajor_max\tnMinor_max\tnAraw_max\tnBraw_max\tCN_raw_max\tLOH_max\tcopydiff_2_max\tcopydiff_ploidy_max\tlogratio_2_max\tlogratio_ploidy_max\n";
30 |
31 | while ($gene = <GENEFILE>) {
32 |
33 | chomp($gene);
34 | @line = split(/\t/, $gene);
35 | $chr = $line[2];
36 | $start = $line[3];
37 | $end = $line[4];
38 |
39 | #$cnraw1=999;
40 | $numseg=0;
41 | $region="";
42 | %segline = ();
43 | @n = ();
44 |
45 | for ($j=1; $j<=$#data; $j++) {
46 | @segment = split(/\t/, $data[$j]);
47 |
48 | $chr_cn = $segment[1];
49 | $pos1 = $segment[2];
50 | $pos2 = $segment[3];
51 | $pos1ext = $segment[9];
52 | $pos2ext = $segment[10];
53 | $left = $segment[11];
54 | $right = $segment[12];
55 | $cnraw = $segment[13];
56 |
57 | if (($chr_cn eq $chr) && ($start <= $pos2ext) && ($end >= $pos1ext)) { #overlap
58 | #$numseg++;
59 | push(@n, $cnraw);
60 | $segline{$cnraw} = [ @segment ];
61 |
62 | #check if overlap with regions with no call
63 | if (($start <= $pos1) && ($end >= $pos1ext)) {
64 | $region = $region.$left.";";
65 | }
66 | if (($start <= $pos2ext) && ($end >= $pos2)) {
67 | $region = $region.$right.";";
68 | }
69 |
70 | #if ($cnraw < $cnraw1) {
71 | # $cnraw1 = $cnraw;
72 | # $count = $j;
73 | #}
74 | }
75 | }
76 |
77 | if ($region eq "") {
78 | $region = "NA";
79 | }
80 |
81 | if ($#n >= 0) {
82 |
83 | $numseg = $#n +1;
84 | @sortn = sort{ $a <=> $b } @n;
85 |
86 | $nA = $segline{$sortn[0]}[4];
87 | $nB = $segline{$sortn[0]}[5];
88 | $rawA = $segline{$sortn[0]}[6];
89 | $rawB = $segline{$sortn[0]}[7];
90 | $cnraw = $segline{$sortn[0]}[13];
91 | $loh = $segline{$sortn[0]}[14];
92 | $ploidy= $segline{$sortn[0]}[17];
93 | $copydiff1 = $segline{$sortn[0]}[18];
94 | $copydiff2 = $segline{$sortn[0]}[19];
95 | $logratio1 = $segline{$sortn[0]}[20];
96 | $logratio2 = $segline{$sortn[0]}[21];
97 |
98 | $outline = "$gene\t$numseg\t$region\t$ploidy\t$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2\t";
99 |
100 | if ($numseg > 1 ) {
101 | $nA = $segline{$sortn[$#sortn]}[4];
102 | $nB = $segline{$sortn[$#sortn]}[5];
103 | $rawA = $segline{$sortn[$#sortn]}[6];
104 | $rawB = $segline{$sortn[$#sortn]}[7];
105 | $cnraw = $segline{$sortn[$#sortn]}[13];
106 | $loh = $segline{$sortn[$#sortn]}[14];
107 | $copydiff1 = $segline{$sortn[$#sortn]}[18];
108 | $copydiff2 = $segline{$sortn[$#sortn]}[19];
109 | $logratio1 = $segline{$sortn[$#sortn]}[20];
110 | $logratio2 = $segline{$sortn[$#sortn]}[21];
111 | }
112 | else {
113 | $nA = "NA";
114 | $nB = "NA";
115 | $rawA = "NA";
116 | $rawB = "NA";
117 | $cnraw = "NA";
118 | $loh = "NA";
119 | $copydiff1 = "NA";
120 | $copydiff2 = "NA";
121 | $logratio1 = "NA";
122 | $logratio2 = "NA";
123 |
124 | }
125 |
126 | $outline = $outline."$nA\t$nB\t$rawA\t$rawB\t$cnraw\t$loh\t$copydiff1\t$copydiff2\t$logratio1\t$logratio2";
127 | print OUTFILE "$outline\n";
128 | }
129 | }
130 |
131 | close (GENEFILE);
132 | close (OUTFILE);
133 |
--------------------------------------------------------------------------------
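Note: the heart of ensemblegenes_cnv_break.pl is an interval-overlap test between each Ensembl gene and each extended ASCAT segment. A minimal Python sketch of that test (the coordinates below are invented for illustration):

    # Overlap rule used by ensemblegenes_cnv_break.pl: same chromosome, and the
    # gene's range intersects the extended segment range.
    def overlaps(gene_chr, gene_start, gene_end, seg_chr, seg_start_ext, seg_end_ext):
        return (gene_chr == seg_chr
                and gene_start <= seg_end_ext
                and gene_end >= seg_start_ext)

    # Invented example: a gene at chr7:1,000-5,000 vs. a segment extended to chr7:4,500-9,000.
    print(overlaps("7", 1000, 5000, "7", 4500, 9000))  # True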
/CNV_PDX/bin/get_model_gender.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | from __future__ import print_function
3 | import sys
4 | import requests
5 | import argparse
6 | import json
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('-d', '--details', action='store_true',
11 | help="Return ID information as well as gender")
12 | parser.add_argument('id', help="The model ID whose gender is needed")
13 |
14 | return parser.parse_args()
15 |
16 |
17 | def main():
18 | args = parse_args()
19 | # for test:
20 | url = 'http://JSON/gender'
21 | # for production:
22 | url = 'http://elims/JSON/gender'
23 | r = requests.get(url, params={'id': args.id})
24 | if r.status_code == 200:
25 | try:
26 | d = r.json()
27 | except:
28 | print('JSON decoding failed. Here is the returned string:',
29 | file=sys.stderr)
30 | print(r.text, file=sys.stderr)
31 | print('And the request...', file=sys.stderr)
32 | print(r.request.__dict__, file=sys.stderr)
33 |
34 | sys.exit(3)
35 |
36 | if args.details:
37 | print('Query ID: {0}\tInventory Code: {1}\t'
38 | 'Model ID: {2}\tGender: {3}'.
39 | format(d['query_id'], d['inventory_code'],
40 | d['model_id'], d['gender']))
41 | else:
42 | print(d['gender'].lower())
43 | if d['gender'] == "NOT FOUND":
44 | # We're about to exit with error status. Write a reason to the log.
45 | print("Couldn't find model {0} in the database.".format(args.id),
46 | file=sys.stderr)
47 | sys.exit(1)
48 | else:
49 | print('Request failed with status code:', r.status_code, file=sys.stderr)
50 | sys.exit(2)
51 |
52 | if __name__ == '__main__':
53 | main()
54 |
--------------------------------------------------------------------------------
/CNV_PDX/bin/lrrbaf_ascat_tumor.R:
--------------------------------------------------------------------------------
1 | options(scipen = 999)
2 |
3 | args=(commandArgs(TRUE))
4 | snp_pos <- args[1]
5 | gcfile <- args[2]
6 |
7 | lrrbaf = read.table("lrr_baf1.txt", header = T, sep = "\t", row.names=1)
8 |
9 | SNPpos = read.table(snp_pos,header=T,sep="\t",row.names=1)
10 |
11 | firstline = read.table("lrr_baf1.txt", nrows=1, sep = "\t")
12 | sample = sub(".CEL.Log.R.Ratio","",firstline[1,4])
13 | #sample = sub(".CEL.Log.R.Ratio","",colnames(lrrbaf)[3])
14 |
15 | Tumor_LogR = lrrbaf[rownames(SNPpos),3,drop=F]
16 | colnames(Tumor_LogR) = sample
17 |
18 | Tumor_BAF = lrrbaf[rownames(SNPpos),4,drop=F]
19 | colnames(Tumor_BAF) = sample
20 |
21 | #Normal_LogR = lrrbaf[rownames(SNPpos),5,drop=F]
22 | #colnames(Normal_LogR) = sample
23 |
24 | #Normal_BAF = lrrbaf[rownames(SNPpos),6,drop=F]
25 | #colnames(Normal_BAF) = sample
26 |
27 | #replace 2's by NA
28 | Tumor_BAF[Tumor_BAF==2]=NA
29 | #Normal_BAF[Normal_BAF==2]=NA
30 |
31 | # Tumor_LogR: correct difference between copy number only probes and other probes
32 | CNprobes = substring(rownames(SNPpos),1,2)=="CN"
33 |
34 | Tumor_LogR[CNprobes,1] = Tumor_LogR[CNprobes,1]-mean(Tumor_LogR[CNprobes,1],na.rm=T)
35 | Tumor_LogR[!CNprobes,1] = Tumor_LogR[!CNprobes,1]-mean(Tumor_LogR[!CNprobes,1],na.rm=T)
36 |
37 | #Normal_LogR[CNprobes,1] = Normal_LogR[CNprobes,1]-mean(Normal_LogR[CNprobes,1],na.rm=T)
38 | #Normal_LogR[!CNprobes,1] = Normal_LogR[!CNprobes,1]-mean(Normal_LogR[!CNprobes,1],na.rm=T)
39 |
40 | # limit the number of digits:
41 | Tumor_LogR = round(Tumor_LogR,4)
42 | #Normal_LogR = round(Normal_LogR,4)
43 |
44 | write.table(cbind(SNPpos,Tumor_BAF),paste(sample, ".tumor.BAF.txt", sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
45 | #write.table(cbind(SNPpos,Normal_BAF),paste(sample, ".normal.BAF.txt", sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
46 |
47 | write.table(cbind(SNPpos,Tumor_LogR),paste(sample, ".tumor.LogR.txt", sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
48 | #write.table(cbind(SNPpos,Normal_LogR),paste(sample, ".normal.LogR.txt", sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
49 |
50 | #run ASCAT functions
51 |
52 | library(ASCAT)
53 | file.tumor.LogR <- dir(pattern="tumor.LogR")
54 | file.tumor.BAF <- dir(pattern="tumor.BAF")
55 | #file.normal.LogR <- dir(pattern="normal.LogR")
56 | #file.normal.BAF <- dir(pattern="normal.BAF")
57 |
58 | gender <- read.table("gender.txt", sep="\t")
59 | sex <- as.vector(gender[1,1])
60 | sex[sex == "female"] <- "XX"
61 | sex[sex == "male"] <- "XY"
62 | sex[sex == "unknown"] <- "XX"
63 |
64 | #samplename <- sub(".tumor.LogR.txt", "", file.tumor.LogR)
65 |
66 | if (sex == "XX") {
67 |
68 | ascat.bc <- ascat.loadData(file.tumor.LogR, file.tumor.BAF, chrs=c(1:22, "X", "Y"), gender=sex)
69 |
70 | } else if (sex == "XY") {
71 |
72 | ascat.bc <- ascat.loadData(file.tumor.LogR, file.tumor.BAF, chrs=c(1:22, "X","Y"), gender=sex)
73 |
74 | }
75 | #ascat.bc <- ascat.loadData(file.tumor.LogR, file.tumor.BAF, file.normal.LogR, file.normal.BAF, chrs=c(1:22, "X"), gender=sex)
76 |
77 | #GC correction for SNP6 data
78 | ascat.bc <- ascat.GCcorrect(ascat.bc, gcfile)
79 |
80 | ascat.plotRawData(ascat.bc)
81 |
82 | gg<-ascat.predictGermlineGenotypes(ascat.bc, platform = "AffySNP6")
83 |
84 | ascat.bc = ascat.aspcf(ascat.bc, ascat.gg=gg)
85 |
86 | ascat.plotSegmentedData(ascat.bc)
87 |
88 | ascat.output = ascat.runAscat(ascat.bc)
89 |
90 | #save ASCAT results
91 |
92 | save.image(paste(sample,".RData",sep=""))
93 |
94 | if ( length(ascat.output$failedarrays) == 0 ) {
95 |
96 | num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments_raw))
97 | for (i in 1:nrow(ascat.output$segments_raw)) {
98 |
99 | #print(i)
100 | L1 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical.Position == ascat.output$segments_raw$startpos[i])
101 | L2 = which(SNPpos$Chromosome == ascat.output$segments_raw$chr[i] & SNPpos$Physical.Position == ascat.output$segments_raw$endpos[i])
102 | num_probes[i] = L2[length(L2)] - L1[1] + 1
103 |
104 | }
105 | seg_raw = cbind(ascat.output$segments_raw,num_probes)
106 |
107 | num_probes <- vector(mode="numeric", length=nrow(ascat.output$segments))
108 | for (i in 1:nrow(ascat.output$segments)) {
109 |
110 | #print(i)
111 | L1 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical.Position == ascat.output$segments$startpos[i])
112 | L2 = which(SNPpos$Chromosome == ascat.output$segments$chr[i] & SNPpos$Physical.Position == ascat.output$segments$endpos[i])
113 | num_probes[i] = L2[length(L2)] - L1[1] + 1
114 |
115 | }
116 | seg = cbind(ascat.output$segments,num_probes)
117 |
118 | write.table(seg_raw, file=paste(sample,".segments_raw.txt",sep=""), sep="\t", quote=F, row.names=F)
119 | write.table(seg, file=paste(sample,".segments.txt",sep=""), sep="\t", quote=F, row.names=F)
120 | write.table(as.data.frame(ascat.output$aberrantcellfraction), file=paste(sample,".aberrantcellfraction.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
121 | write.table(as.data.frame(ascat.output$ploidy), file=paste(sample,".ploidy.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
122 |
123 | } else {
124 |
125 | write.table(as.data.frame(ascat.output$failedarrays), file=paste(sample,".failedarrays.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
126 |
127 | }
128 |
129 | if ( !is.null(ascat.output$nonaberrantarrays) ) {
130 |
131 | write.table(as.data.frame(ascat.output$nonaberrantarrays), file=paste(sample,".nonaberrantarrays.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=F)
132 |
133 | }
134 |
--------------------------------------------------------------------------------
/CNV_PDX/bin/segment_raw_extend.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl -w
2 | use POSIX;
3 | use File::Basename;
4 |
5 | # This script adds to the segment file the arm fraction, LOH, and the CN diff and log ratio relative to 2 and to ploidy
6 | # The segments are extended
7 |
8 | # nohup perl segment_raw_extend.pl *segments_raw.txt *ploidy.txt hg19_chromosome_arm.txt gender.txt &
9 |
10 | if ($#ARGV != 3) {
11 | print "This script requires: <segments_raw file> <ploidy file> <chromosome arm file> <gender file>\n";
12 | exit(-1);
13 | }
14 |
15 | $file_cn = $ARGV[0];
16 | $file_ploidy = $ARGV[1];
17 | $file_arm = $ARGV[2];
18 | $file_gender = $ARGV[3];
19 |
20 | $file_output = basename($file_cn,".txt").".extend.txt";
21 |
22 | $ploidy = `cat $file_ploidy`;
23 | chomp($ploidy);
24 |
25 | $gender = `cat $file_gender`;
26 | chomp($gender);
27 |
28 | if (($gender eq "female") || ($gender eq "unknown")) {
29 | $cn_factor = 1;
30 | }
31 | elsif ($gender eq "male") {
32 | $cn_factor= 0.5;
33 | }
34 |
35 | $tmp = `cat $file_arm | awk 'NR>1'`;
36 | @arm = split(/\n/,$tmp);
37 | chomp(@arm);
38 |
39 | open(CN, "$file_cn") or die "can't open $file_cn: $!";
40 | $tmp = <CN>;
41 | chomp($tmp);
42 |
43 | open(OUTFILE, ">$file_output");
44 | print OUTFILE "$tmp\tstartext\tendext\tstartext_desc\tendext_desc\tCN_raw\tLOH\tparm_fraction\tqarm_fraction\tploidy\tcopydiff_2\tcopydiff_ploidy\tlogratio_2\tlogratio_ploidy\n";
45 |
46 | open(TMPFILE, ">tmp.txt");
47 |
48 | #merge segments
49 | $tmp = <CN>;
50 | chomp($tmp);
51 | @line = split(/\t/,$tmp);
52 | $sample = $line[0];
53 | $chromo = $line[1];
54 | $n1 = $line[4];
55 | $n2 = $line[5];
56 | $cn1 = $line[6];
57 | $cn2 = $line[7];
58 | $start = $line[2];
59 | $end = $line[3];
60 | $num = $line[8];
61 |
62 | while ($tmp = <CN>) {
63 | chomp($tmp);
64 | @line = split(/\t/,$tmp);
65 |
66 | if (($chromo eq $line[1]) && ($cn1 == $line[6]) && ($cn2 == $line[7])) {
67 | $end = $line[3];
68 | $num = $num + $line[8];
69 | }
70 | else {
71 | print TMPFILE "$sample\t$chromo\t$start\t$end\t$n1\t$n2\t$cn1\t$cn2\t$num\n";
72 | $sample = $line[0];
73 | $chromo = $line[1];
74 | $n1 = $line[4];
75 | $n2 = $line[5];
76 | $cn1 = $line[6];
77 | $cn2 = $line[7];
78 | $start = $line[2];
79 | $end = $line[3];
80 | $num = $line[8];
81 | }
82 | }
83 | #lastline
84 | print TMPFILE "$sample\t$chromo\t$start\t$end\t$n1\t$n2\t$cn1\t$cn2\t$num\n";
85 |
86 | close (CN);
87 | close (TMPFILE);
88 |
89 | open(CN, "tmp.txt") or die "can't open tmp.txt: $!";
90 | @seg = <CN>;
91 | chomp(@seg);
92 | close (CN);
93 | $n = 0;
94 |
95 | for ($j=0; $j<$#seg; $j++) {
96 |
97 | @array1 = split(/\t/,$seg[$j]);
98 | @array2 = split(/\t/,$seg[$j+1]);
99 | #$x1 = $array1[2];
100 | $x2 = $array1[3];
101 | $y1 = $array2[2];
102 | #$y2 = $array2[3];
103 |
104 | if ($array1[1] ne $n) { #first line for chr
105 |
106 | $n = $array1[1];
107 | $left = 0;
108 | $left1 = "telomere";
109 |
110 | for ($i=1; $i<=$#arm; $i+=2) {
111 | @line = split(/\t/,$arm[$i]);
112 | if ($n eq substr($line[0],3)) {
113 | $a = $line[1];
114 | $b = $line[2];
115 | }
116 | }
117 |
118 | if ($array2[1] ne $n) { #last line for chr
119 | $right = $b;
120 | $right1 = "telomere";
121 | }
122 | elsif (($x2 < $a) && ($y1 > $a)) {
123 | $right = $a;
124 | $right1 = "centromere";
125 | }
126 | else {
127 | $right = floor(($x2 + $y1)/2);
128 | $right1 = "no_probe";
129 | }
130 | }
131 | else {
132 |
133 | $left = $right + 1;
134 | $left1 = $right1;
135 |
136 | if ($array2[1] ne $n) { #last line for chr
137 |
138 | $right = $b;
139 | $right1 = "telomere";
140 | }
141 | elsif (($x2 < $a) && ($y1 > $a)) {
142 | $right = $a;
143 | $right1 = "centromere";
144 | }
145 | else {
146 | $right = floor(($x2 + $y1)/2);
147 | $right1 = "no_probe";
148 | }
149 | }
150 |
151 | $copy = $array1[6] + $array1[7];
152 | if ($array1[6] >= 0.5 && $array1[7] <= 0.1) {
153 | $loh=1;
154 | }
155 | else {
156 | $loh=0;
157 | }
158 |
159 | for ($i=0; $i<=$#arm; $i+=2) {
160 | @line = split(/\t/,$arm[$i]);
161 | if ($n eq substr($line[0],3)) {
162 | if (($right>=$line[1]) && ($left<=$line[2])) {
163 | @tmp = ($left,$right,$line[1],$line[2]);
164 | @sorttmp = sort{ $a <=> $b } @tmp;
165 | $overlap1=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]);
166 | }
167 | else {
168 | $overlap1=0;
169 | }
170 | }
171 | }
172 |
173 | for ($i=1; $i<=$#arm; $i+=2) {
174 | @line = split(/\t/,$arm[$i]);
175 | if ($n eq substr($line[0],3)) {
176 | if (($right>=$line[1]) && ($left<=$line[2])) {
177 | @tmp = ($left,$right,$line[1],$line[2]);
178 | @sorttmp = sort{ $a <=> $b } @tmp;
179 | $overlap2=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]);
180 | }
181 | else {
182 | $overlap2=0;
183 | }
184 | }
185 | }
186 |
187 | if (($n eq "X") || ($n eq "Y")) {
188 | $diff1=$copy - ($cn_factor * 2);
189 | $diff2=$copy- ($cn_factor * $ploidy);
190 | $logratio1 = log(($copy+0.01)/($cn_factor * 2))/log(2);
191 | $logratio2 = log(($copy+0.01)/($cn_factor * $ploidy))/log(2);
192 | }
193 | else {
194 | $diff1=$copy-2;
195 | $diff2=$copy-$ploidy;
196 | $logratio1 = log(($copy+0.01)/2)/log(2);
197 | $logratio2 = log(($copy+0.01)/$ploidy)/log(2);
198 | }
199 |
200 | print OUTFILE "$seg[$j]\t$left\t$right\t$left1\t$right1\t$copy\t$loh\t$overlap1\t$overlap2\t$ploidy\t$diff1\t$diff2\t$logratio1\t$logratio2\n";
201 | }
202 |
203 | @array1 = split(/\t/,$seg[$#seg]);
204 |
205 | if ($array1[1] ne $n) { #first line for chr
206 |
207 | $n = $array1[1];
208 | $left = 0;
209 | $left1 = "telomere";
210 |
211 | for ($i=1; $i<=$#arm; $i+=2) {
212 | @line = split(/\t/,$arm[$i]);
213 | if ($n eq substr($line[0],3)) {
214 | $a = $line[1];
215 | $b = $line[2];
216 | }
217 | }
218 |
219 | $right = $b;
220 | $right1 = "telomere";
221 |
222 | }
223 | else {
224 |
225 | $left = $right + 1;
226 | $left1 = $right1;
227 |
228 | $right = $b;
229 | $right1 = "telomere";
230 |
231 | }
232 |
233 | $copy = $array1[6] + $array1[7];
234 | if ($array1[6] >= 0.5 && $array1[7] <= 0.1) {
235 | $loh=1;
236 | }
237 | else {
238 | $loh=0;
239 | }
240 |
241 | for ($i=0; $i<=$#arm; $i+=2) {
242 | @line = split(/\t/,$arm[$i]);
243 | if ($n eq substr($line[0],3)) {
244 | if (($right>=$line[1]) && ($left<=$line[2])) {
245 | @tmp = ($left,$right,$line[1],$line[2]);
246 | @sorttmp = sort{ $a <=> $b } @tmp;
247 | $overlap1=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]);
248 | }
249 | else {
250 | $overlap1=0;
251 | }
252 | }
253 | }
254 |
255 | for ($i=1; $i<=$#arm; $i+=2) {
256 | @line = split(/\t/,$arm[$i]);
257 | if ($n eq substr($line[0],3)) {
258 | if (($right>=$line[1]) && ($left<=$line[2])) {
259 | @tmp = ($left,$right,$line[1],$line[2]);
260 | @sorttmp = sort{ $a <=> $b } @tmp;
261 | $overlap2=($sorttmp[2]-$sorttmp[1])/($line[2]-$line[1]);
262 | }
263 | else {
264 | $overlap2=0;
265 | }
266 | }
267 | }
268 |
269 | if (($n eq "X") || ($n eq "Y")) {
270 | $diff1=$copy - ($cn_factor * 2);
271 | $diff2=$copy- ($cn_factor * $ploidy);
272 | $logratio1 = log(($copy+0.01)/($cn_factor * 2))/log(2);
273 | $logratio2 = log(($copy+0.01)/($cn_factor * $ploidy))/log(2);
274 | }
275 | else {
276 | $diff1=$copy-2;
277 | $diff2=$copy-$ploidy;
278 | $logratio1 = log(($copy+0.01)/2)/log(2);
279 | $logratio2 = log(($copy+0.01)/$ploidy)/log(2);
280 | }
281 |
282 | print OUTFILE "$seg[$j]\t$left\t$right\t$left1\t$right1\t$copy\t$loh\t$overlap1\t$overlap2\t$ploidy\t$diff1\t$diff2\t$logratio1\t$logratio2\n";
283 |
284 | close(CN);
285 | close (OUTFILE);
286 |
--------------------------------------------------------------------------------
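Note: the copydiff/logratio columns appended by segment_raw_extend.pl follow a small formula. A hedged Python sketch of the same arithmetic (the example copy number and ploidy are invented; cn_factor mirrors the script's gender handling, 1 for female/unknown and 0.5 for male, applied only on chrX/Y):

    import math

    def cn_metrics(copy, ploidy, cn_factor=1.0, sex_chrom=False):
        # Sketch of the four derived columns written by segment_raw_extend.pl.
        baseline_2 = cn_factor * 2 if sex_chrom else 2.0
        baseline_p = cn_factor * ploidy if sex_chrom else ploidy
        return {
            "copydiff_2": copy - baseline_2,
            "copydiff_ploidy": copy - baseline_p,
            "logratio_2": math.log((copy + 0.01) / baseline_2, 2),
            "logratio_ploidy": math.log((copy + 0.01) / baseline_p, 2),
        }

    # Invented example: total copy number 3 in a sample whose ASCAT ploidy is 2.1.
    print(cn_metrics(3, 2.1))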
/CNV_PDX/gender_single_tool.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 |
7 | Get gender from elims or genotype and prepare listfile
8 |
9 |
24 |
25 |
26 | python
27 | apt/1.15.0
28 |
29 |
30 |
31 |
32 | {elims_gender} {in_1} > {out_3}
33 |
34 |
35 |
36 |
37 |
38 | "cel_files" > {list_file}
39 |
40 |
41 |
42 | {in_2} >> {list_file}
43 |
44 |
45 |
46 | -v a=|in_3| '{if (NR>1) print a"/"$2}' |in_4| >> |list_file|
47 |
48 |
49 |
50 | -v a=|in_3| '{if (NR>1) print a"/"$2}' |in_5| >> |list_file|
51 |
52 |
53 |
55 |
56 |
58 |
59 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 | apt-probeset-genotype -c {cdf} -a birdseed --read-models-birdseed {birdseed_models} --special-snps {special_snps} --out-dir {in_6} --cel-files {list_file}
72 |
73 |
74 |
75 | {birdseed_report} | grep -v "#" | awk 'NR==2' | cut -f2 > {out_4}
76 |
77 |
78 |
79 | gender=`cat {out_3}`
80 |
81 |
82 |
83 | if [ "X$gender" = "Xunknown" -o "X$gender" = "Xunspecified" ];
84 | then
85 | gender=`cat {out_4}`;
86 | cp {out_4} {out_1};
87 | else
88 | cp {out_3} {out_1};
89 | fi
90 |
91 |
92 |
93 | "cel_files" > {out_2}
94 |
95 |
96 |
97 | {in_2} >> {out_2}
98 |
99 |
100 |
101 | if [ "X$gender" = "Xfemale" -o "X$gender" = "Xunknown" ];
102 | then
103 | awk -v a=#in_3# '{if (NR>1) print a"/"$2}' #in_4# >> #out_2#;
104 | elif [ "X$gender" = "Xmale" ];
105 | then
106 | awk -v a=#in_3# '{if (NR>1) print a"/"$2}' #in_5# >> #out_2#;
107 | fi
108 |
109 |
110 |
111 | {birdseed_confidences} | grep -v "#" | cut -f1-2 > {birdseed_confidences1}
112 |
113 |
114 |
115 | {birdseed_calls} | grep -v "#" | cut -f1-2 > {birdseed_calls1}
116 |
117 |
118 |
119 | {birdseed_report} | grep -v "#" | head -2 > {birdseed_report1}
120 |
121 |
122 |
123 |
--------------------------------------------------------------------------------
/CNV_PDX/lrrbaf_tool.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 |
7 | Normalize snp array cancer cel file with 300 hapmap cel files, and output LRR (Log R Ratio) and BAF (B-Allele Frequency) using genoclustering
8 |
9 |
16 |
17 |
18 | apt/1.15.0
19 |
20 |
21 |
22 |
24 |
25 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | apt-probeset-summarize --cdf-file {cdf} {analysis} --target-sketch {target} --out-dir {in_2} --cel-files {in_1}
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 | {geno_cluster} {clusterfile} {summary} -locfile {pfb} -out {out_1}
45 |
46 |
47 |
48 |
--------------------------------------------------------------------------------
/CNV_PDX/single_sample_cnv_snparray.xml:
--------------------------------------------------------------------------------
[pipeline XML: markup only; no text content captured for this file]
--------------------------------------------------------------------------------
/CTP_PDX/XenomeSingleSample_PDX_Panel.xml:
--------------------------------------------------------------------------------
[pipeline XML: markup only; no text content captured for this file]
--------------------------------------------------------------------------------
/CTP_PDX/aggregate_stats_updated.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | Compile alignment QC, duplication metrics and coverage statistics into a single file
5 |
6 |
16 |
17 | python/2.7.3
18 |
19 | {out_1} {in_1} {in_2} {in_3}
20 |
21 |
22 |
--------------------------------------------------------------------------------
/CTP_PDX/bin/aggregate_stats_updated.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # aggregate_CGA_stats.py OUT INP_QC INP_DUP INP_HS
4 | # A script to parse the quality and hybrid-selection statistics files
5 | # and aggregate relevant metrics into an output file.
6 |
7 | # Parameters:
8 | # out = output file
9 | # inp_qc = *stat file output by qualtool
10 | # inp_dup = *.dat output by Picard MarkDuplicates
11 | # inp_hs = *Metricsfile.txt output by Picard CalculateHsMetrics
12 |
13 | import sys
14 |
15 | if len(sys.argv) < 5:
16 |     print >>sys.stderr, "Commandline arguments missing:\nFormat: aggregate_CGA_stats.py OUT INP_QC INP_DUP INP_HS\nout = output file\ninp_qc = *stat file output by qualtool\ninp_dup = *.dat output by Picard MarkDuplicates\ninp_hs = *Metricsfile.txt output by Picard CalculateHsMetrics"
17 | sys.exit()
18 |
19 | out = open(sys.argv[1],"w")
20 | inp_qc = open(sys.argv[2],"r")
21 | inp_dup = open(sys.argv[3],"r")
22 | inp_hs = open(sys.argv[4],"r")
23 |
24 | qc_out = [None, None]
25 | read_data = False
26 | for line in inp_qc:
27 | line = line.strip()
28 | elems = line.split("\t")
29 |
30 | if line.startswith("QC statistics"):
31 | read_data = True
32 |
33 | if line.startswith("Detailed QC statistics"):
34 | break
35 |
36 | if read_data:
37 | if None not in qc_out:
38 | break
39 | if line.startswith("Total number of reads"):
40 | try:
41 | elems = line.split()
42 | qc_out[0] = str(int(elems[-1]) + int(elems[-2]))
43 | except Exception:
44 | qc_out[0] = "NA"
45 | if line.startswith("Total number of HQ filtered reads"):
46 | try:
47 | elems = line.split()
48 | qc_out[1] = str(int(elems[-1]) + int(elems[-2]))
49 | except Exception:
50 | qc_out[1] = "NA"
51 | print >>out, "Total number of reads\t%s\nTotal number of HQ filtered reads\t%s" %(qc_out[0],qc_out[1])
52 |
53 | data_lines_dup = []
54 | for line in inp_dup:
55 | line = line.strip()
56 | if line and not(line.startswith("#")):
57 | data_lines_dup.append(line)
58 |
59 | col_names = data_lines_dup[0].split("\t")
60 | col_values = data_lines_dup[1].split("\t")
61 | for i,n in enumerate(col_names):
62 | if n in ["PERCENT_DUPLICATION"]:
63 | print >>out, "%s\t%s" %(n,col_values[i])
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 | data_lines = []
73 | for line in inp_hs:
74 | line = line.strip()
75 | if line and not(line.startswith("#")):
76 | data_lines.append(line)
77 |
78 |
79 | if len(data_lines) != 2:
80 | print >>sys.stderr, "CoverageMetrics.txt is invalid"
81 | else:
82 | col_names = data_lines[0].split("\t")
83 | col_values = data_lines[1].split("\t")
84 | for i,n in enumerate(col_names):
85 | if n in ["PF_UNIQUE_READS", "PCT_PF_UQ_READS_ALIGNED", "PCT_SELECTED_BASES", "MEAN_TARGET_COVERAGE"] or n.startswith("PCT_TARGET_BASES"):
86 | print >>out, "%s\t%s" %(n,col_values[i])
87 |
88 |
89 |
--------------------------------------------------------------------------------
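Note: both Picard metrics files read by aggregate_stats_updated.py (MarkDuplicates and CalculateHsMetrics) place the column names and their values on two adjacent non-comment lines, which is why data_lines[0] is paired with data_lines[1]. A small sketch of that parse; the metrics lines below are invented:

    # Invented two-row metrics section, as Picard writes it after the comment lines.
    lines = [
        "## METRICS CLASS\tpicard.sam.DuplicationMetrics",
        "LIBRARY\tPERCENT_DUPLICATION\tESTIMATED_LIBRARY_SIZE",
        "lib1\t0.1275\t1234567",
    ]
    data_lines = [l for l in lines if l.strip() and not l.startswith("#")]
    names = data_lines[0].split("\t")
    values = data_lines[1].split("\t")
    print(dict(zip(names, values))["PERCENT_DUPLICATION"])  # 0.1275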
/CTP_PDX/bin/allele_depth_min_and_AF_from_ADs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 |
4 | GOALS of script:
5 | recompute the locus depth from the allele-depths, and filter
6 | based on a minimumTotalAlleleDepth
7 |
8 | Add Estimated Allele Frequency (ALT_AF) to the info cell
9 |
10 | The FORMAT column indicates the order of the fields in the following
11 | sample column.
12 | We need to find the AD (allele depth) field in the FORMAT, and
13 | then process the corresponding field in the sample column. It will
14 | be two comma separated integers.
15 | Add them together and compare to the required minimum depth.
16 | If greater or equal, output the line as-is.
17 | If less, change the FILTER column value to MinDP and output the line.
18 |
19 | """
20 | import sys
21 |
22 | # Check for a version request.
23 | version.parse_options()
24 |
25 | ALT_ALLELE_INDEX = 4
26 | FILTER_CELL_INDEX = 6
27 | INFO_CELL_INDEX = 7
28 | FORMAT_CELL_INDEX = 8
29 | SAMPLE_DATA_INDEX = 9
30 |
31 | # expecting sys.argv to look something like
32 | if len(sys.argv) != 4:
33 | print "Incorrect number of args!"
34 | print "expected usage:"
35 | print "allele_depth_min_and_AF_from_ADs.py inputFile " \
36 | "outputFile minDP"
37 | raise Exception("Incorrect number of args!")
38 |
39 | inp = open(sys.argv[1], 'r')
40 | out = open(sys.argv[2], 'w')
41 | minimumTotalAlleleDepth = int(sys.argv[3])
42 |
43 | NEW_INFO_HEADERS = ['##INFO=<ID=DP_HQ,Number=1,Type=Integer,Description="Total depth recomputed from the AD allele depths">',
47 |                     '##INFO=<ID=ALT_AF,Number=A,Type=Float,Description="Alternate allele frequency (percent) computed from the AD allele depths">',
50 |                     '##FILTER=<ID=minDP,Description="Total allele depth below %s">' %minimumTotalAlleleDepth]
51 | vcf_headers = []
52 | for line in inp:
53 | line = line.strip()
54 | if line.startswith("#"):
55 | vcf_headers.append(line)
56 | # adding new info headers just before the #CHROM line
57 | if line.startswith("#CHROM"):
58 | # remove any existing INFO headers for DP_HQ and ALT_AF
59 |             vcf_headers = [e for e in vcf_headers if
60 |                            "##INFO=<ID=DP_HQ" not in e and
61 |                            "##INFO=<ID=ALT_AF" not in e and
62 |                            "##FILTER=<ID=minDP" not in e]
63 |             # splice the new headers in just before the #CHROM line and
64 |             # emit all headers seen so far (reconstructed; the original
65 |             # header-splicing lines were lost in extraction)
66 |             vcf_headers = vcf_headers[:-1] + NEW_INFO_HEADERS + [vcf_headers[-1]]
67 |             for e in vcf_headers:
68 |                 print >> out, e
72 | continue
73 | elems = line.split("\t")
74 | info = elems[INFO_CELL_INDEX]
75 | formatTokens = elems[FORMAT_CELL_INDEX].split(":")
76 | alleleDepthIndex = formatTokens.index('AD')
77 | ads = elems[SAMPLE_DATA_INDEX].split(":")[alleleDepthIndex].\
78 | split(',')
79 | for a, ad in enumerate(ads):
80 | ads[a] = int(ad)
81 | totalAlleleDepth = sum(ads)
82 | # update filter cell with minDP to fail if the totalAlleleDepth
83 | # is less than the specified value
84 | if minimumTotalAlleleDepth is not None and \
85 | totalAlleleDepth < minimumTotalAlleleDepth:
86 | # ignore case when looking for an existing minDP in the
87 | # filter cell
88 | if "minDP".lower() in (elems[FILTER_CELL_INDEX]).lower():
89 | # already contains minDP so don't add another
90 | pass
91 | elif elems[FILTER_CELL_INDEX] == ".":
92 | elems[FILTER_CELL_INDEX] = "minDP"
93 | elif elems[FILTER_CELL_INDEX] == "PASS":
94 | elems[FILTER_CELL_INDEX] = "minDP"
95 | else:
96 | elems[FILTER_CELL_INDEX] += ";minDP"
97 | alternativeAlleles = elems[ALT_ALLELE_INDEX].split(",")
98 | alternateAlleleFrequencies = []
99 | for a, ad in enumerate(ads):
100 | # if the Allele depths are all zero, just call it zero,
101 | # we can't divide by 0
102 | if totalAlleleDepth == 0:
103 | alternateAlleleFrequencies.append("0")
104 | else:
105 | alternateAlleleFrequencies.append(str(
106 | round(ad * (100.0 / totalAlleleDepth))))
107 | # update the info cell to have DP_HQ or ALT_AF entries, taking
108 | # care to remove any existing ones
109 | info_cell_items = elems[INFO_CELL_INDEX].split(";")
110 | # remove any existing DP_HQ or ALT_AF entries
111 | info_cell_items = [e for e in info_cell_items if "DP_HQ=" not in e]
112 | info_cell_items = [e for e in info_cell_items if "ALT_AF=" not in e]
113 |
114 | # add new entry for DP_HQ
115 | info_cell_items.append("DP_HQ=" + str(totalAlleleDepth))
116 |
117 | # add new entries for ALT_AF separately for each alternative allele.
118 | # This splitting ensures that each alternative allele and its allele frequency are on a new row, and will thus get annotated (downstream) independently.
119 | for a, alt in enumerate(alternativeAlleles):
120 | tmp_elems = list(elems)
121 | tmp_elems[ALT_ALLELE_INDEX] = alt
122 | tmp_info_cell_items = list(info_cell_items)
123 | tmp_info_cell_items.append("ALT_AF=" + alternateAlleleFrequencies[a+1])
124 | tmp_elems[INFO_CELL_INDEX] = ";".join(tmp_info_cell_items)
125 | tmp_format_cell_items = list(tmp_elems[SAMPLE_DATA_INDEX].split(":"))
126 | tmp_format_cell_items[alleleDepthIndex] = "%s,%s" %(ads[0],ads[a+1])
127 | tmp_elems[SAMPLE_DATA_INDEX] = ":".join(tmp_format_cell_items)
128 | tmp_elems[SAMPLE_DATA_INDEX].split(":")[alleleDepthIndex] = "%s,%s" %(ads[0],ads[a+1])
129 | print >> out, "\t".join(tmp_elems)
130 |
--------------------------------------------------------------------------------
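Note: the depth and allele-frequency arithmetic in allele_depth_min_and_AF_from_ADs.py reduces to summing the AD values and expressing each ALT depth as a rounded percentage of that sum. A minimal sketch (the AD string is invented):

    # DP_HQ / ALT_AF arithmetic, as a sketch; "120,35,5" stands in for a real AD field.
    ads = [int(x) for x in "120,35,5".split(",")]   # ref depth, then one depth per ALT allele
    dp_hq = sum(ads)                                # recomputed locus depth (DP_HQ)
    alt_af = [round(ad * 100.0 / dp_hq) for ad in ads[1:]] if dp_hq else []
    print(dp_hq, alt_af)                            # total depth 160; ALT percentages ~22 and ~3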
/CTP_PDX/bin/caller_add_pindel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # A script to add a CALLER=Pindel entry to the INFO column
4 |
5 | export file=$1
6 | export output1=$2
7 |
8 |
9 |
10 |
11 | awk -F$'\t' '/^[^#]/ { print $1,$2,$3,$4,$5,$6,$7,$8";CALLER=Pindel",$9,$10;next } {print $0}' $file|tr ' ' '\t' > $output1
12 |
13 | sed -i '1 a\##INFO=<ID=CALLER,Number=1,Type=String,Description="Variant caller">' $output1
--------------------------------------------------------------------------------
/CTP_PDX/bin/clean_intergenic_region_gene_names.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | from __future__ import print_function
4 |
5 | """
6 | A new version of snpEff started reporting the bounding genes for variants
7 | in intergenic regions, resulting in gene "names" comprising two gene names
8 | separated by a dash.
9 |
10 | We don't want these.
11 |
12 | This will be fixed in the pipeline, but we don't want to have to rerun
13 | the pipelines just to clean this up. So this program will go over the existing
14 | *Annotated.tab files, and remove the gene names from any variant whose effect
15 | is "intergenic_region".
16 | """
17 | import csv
18 | import os
19 | from glob import glob
20 | import argparse
21 |
22 |
23 | def parse_args():
24 | """
25 | Parse the command line arguments
26 | :return: The parsed arguments.
27 | """
28 | parser = argparse.ArgumentParser(
29 | "Set the gene name to '', when the variant in an intergenic_region.")
30 | parser.add_argument('-r', '--root',
31 | help="Process all *Annotated.tab files under this "
32 | "directory.")
33 | parser.add_argument('-f', '--file-name',
34 | help="Process only this file.")
35 | parser.add_argument('-s', '--suffix', default='.ORIG',
36 | help="append this suffix to the original file "
37 | "[Default = .ORIG]")
38 | parser.add_argument('-d', '--delete', action='store_true',
39 | help="Delete the original file.")
40 | args = parser.parse_args()
41 |
42 | if args.root and args.file_name:
43 | parser.error("May only specify --root or --file-name, not both.")
44 | if not (args.root or args.file_name):
45 | parser.error("Must specify exactly one of --root or --file-name")
46 | return args
47 |
48 |
49 | def get_files(root):
50 | """
51 | Get the list of *Annotated.tab files.
52 | :param root: The base directory containing the model level directories.
53 | :return: a list of filepaths to the *Annotated.tab files.
54 | """
55 | files = glob(os.path.join(root, '[JT]*', '*', 'analysis', '*',
56 | '*Annotated.tab'))
57 | return files
58 |
59 |
60 | def get_header(fn):
61 | """
62 | Get the header columns as a list.
63 | :param fn: A filename from which we will use the first row to determine
64 | the column names.
65 | :return: a list of column names.
66 | """
67 | with open(fn) as f:
68 | line = f.readline()
69 | headers = line.strip().split('\t')
70 | return headers
71 |
72 |
73 | def needs_processing(fn):
74 | """
75 | Check a file to see if it has the condition that we have to clean up,
76 | before we go to the effort to re-write the whole file.
77 | :param fn: The filename to check.
78 | :return: True if we need to process the file.
79 | """
80 | f = open(fn)
81 | reader = csv.DictReader(f, delimiter='\t')
82 | ret = False
83 | for row in reader:
84 | if row['EFF[*].EFFECT'] == 'intergenic_region' and \
85 | row['EFF[*].GENE'] != '':
86 | ret = True
87 | break
88 | f.close()
89 | return ret
90 |
91 |
92 | def process_file(fn, args):
93 | """
94 | Process one file. Make sure the headers are the same as the initial file;
95 | there is a possibility that different pipelines have different file formats.
96 |
97 | We will rename the original file with the extension .ORIG, and write a new
98 | file with the original name.
99 |
100 | :param fn: The path to the file to process.
101 | :param args: The parsed command line arguments
102 | :return:
103 | """
104 | if not needs_processing(fn):
105 | # Nothing to do.
106 | return
107 | headers = get_header(fn)
108 | orig_file = fn + args.suffix
109 | os.rename(fn, orig_file)
110 | in_f = open(orig_file)
111 | out_f = open(fn, 'wb')
112 |
113 | reader = csv.DictReader(in_f, delimiter='\t')
114 | writer = csv.DictWriter(out_f, headers, delimiter='\t', lineterminator='\n')
115 | writer.writeheader()
116 | for row in reader:
117 | if row['EFF[*].EFFECT'] == 'intergenic_region':
118 | # print(row)
119 | row['EFF[*].GENE'] = ''
120 | # print(row)
121 | writer.writerow(row)
122 | in_f.close()
123 | out_f.close()
124 | if args.delete:
125 | os.remove(orig_file)
126 |
127 |
128 | def main():
129 | args = parse_args()
130 | if args.root:
131 | files = get_files(args.root)
132 | for fn in files:
133 | if not needs_processing(fn):
134 | # print('***Skipping', fn)
135 | continue
136 | print('PROCESSING', fn)
137 | process_file(fn, args)
138 | else:
139 | # The argument parser guarantees that we'll have either root or
140 | # file_name.
141 | process_file(args.file_name, args)
142 |
143 |
144 | if __name__ == '__main__':
145 | main()
146 |
--------------------------------------------------------------------------------
/CTP_PDX/bin/coveragecalculator.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import csv
4 | import sys
5 | import numpy as np
6 | import collections
7 | import fileinput
8 | from collections import defaultdict
9 | from itertools import imap
10 | from sys import argv
11 | import os
12 |
13 | # Check for a version request.
14 | version.parse_options()
15 |
16 |
17 |
18 |
19 |
20 | if len(sys.argv) < 3:
21 |     print >>sys.stderr, "This program has no version. It just needs the target bed as input and an output file, and writes the target coverage"
22 |     sys.exit()
23 |
24 |
25 | targetbed = open(sys.argv[1],"r")
26 | targetcov = open(sys.argv[2],"w")
27 |
28 |
29 |
30 |
31 |
32 |
33 | chr_a=[]
34 | start_a=[]
35 | stop_a=[]
36 | bp_a=[]
37 | cov_a=[]
38 | gene_a=[]
39 |
40 | with targetbed as f:
41 | reader=csv.reader(f,delimiter='\t')
42 | for a,b,c,d,e,f in reader:
43 | chr_a.append(a)
44 | start_a.append(b)
45 | stop_a.append(c)
46 | gene_a.append(d)
47 | cov_a.append(f)
48 |
49 |
50 |
51 |
52 | target_keys = zip(chr_a,start_a,stop_a,gene_a)
53 |
54 | result_dict = defaultdict(list)
55 |
56 |
57 |
58 |
59 | print >>targetcov,'chr',"\t", 'start',"\t", 'stop',"\t",'Gene name',"\t",'Mean_coverage',"\t",'Median_coverage',"\t",'min_coverage',"\t",'Max_coverage'
60 |
61 |
62 | for k,v in zip(target_keys,cov_a):
63 | result_dict[k].append(v)
64 |
65 |
66 | for k,v in result_dict.iteritems():
67 | L = [int(n) for n in v if n]
68 | print >>targetcov,k[0],"\t",k[1],"\t",k[2],"\t",k[3],"\t",sum(L)/float(len(L)),"\t",np.median(L),"\t",min(L),"\t",max(L)
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
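Note: coveragecalculator.py groups per-base coverage rows by their target interval before summarizing each group. A compact sketch of the same grouping (rows are invented, and only mean/min/max are shown here):

    from collections import defaultdict

    rows = [  # (chr, start, stop, gene, coverage at one base of the target)
        ("1", "100", "200", "GENE_A", 40),
        ("1", "100", "200", "GENE_A", 60),
        ("2", "500", "650", "GENE_B", 10),
    ]

    per_target = defaultdict(list)
    for chrom, start, stop, gene, cov in rows:
        per_target[(chrom, start, stop, gene)].append(cov)

    for key, covs in per_target.items():
        print(key, sum(covs) / float(len(covs)), min(covs), max(covs))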
/CTP_PDX/bin/filter_dna_coverage.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | from __future__ import print_function
3 | """
4 | Read a Picard coverage stats file and terminate the run if coverage is below
5 | a threshold.
6 |
7 | A picard stats file consists of a metadata section, terminated by the line,
8 |
9 | ## METRICS CLASS net.sf.picard.analysis.directed.HsMetrics
10 |
11 | followed by two rows; a header and data.
12 |
13 | We'll read both lines and then pick out the field matching the requested coverage level.
14 |
15 | """
16 | import sys
17 | import argparse
18 |
19 |
20 | def parse_args():
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument('-c', '--ctp-coverage-level', default='100',
23 | help="Coverage level to use for the test on CTP runs: "
24 | "2, 10, 20, 30, 40, 50, or 100 [default: 100]")
25 | parser.add_argument('-d', '--debug', action='store_true',
26 | help="Enable some debugging prints")
27 | parser.add_argument('-x', '--hex-coverage-level', default='20',
28 | help="Coverage level to use for the test on HEX runs: "
29 | "2, 10, 20, 30, 40, 50, or 100 [default: 20]")
30 | parser.add_argument('-p', '--ctp-percentage', type=float, default=75,
31 |                         help="Minimum %% of bases covered at that depth. Enter "
32 |                              "as a percentage or decimal (e.g., 75 or 0.75) "
33 |                              "[default: 75]")
34 | parser.add_argument('-P', '--hex-percentage', type=float, default=75,
35 |                         help="Minimum %% of bases covered at that depth. Enter "
36 |                              "as a percentage or decimal (e.g., 75 or 0.75) "
37 |                              "[default: 75]")
38 | parser.add_argument('files', nargs='+',
39 | help="The file[s] to test.")
40 |
41 | return parser.parse_args()
42 |
43 |
44 | def process_file(fn, coverage, percentage, multiple, debug):
45 | """
46 | Determine whether there was adequate coverage of bases in this file.
47 | NOTE: Returns True if the run was OK.
48 | :param fn:
49 | :param coverage:
50 | :param percentage:
51 | :param multiple:
52 | :param debug:
53 | :return:
54 | """
55 | metadata_ended = False
56 | pattern = 'PCT_TARGET_BASES_{0}X'.format(coverage)
57 | header = []
58 | data = []
59 | for line in open(fn):
60 | line = line.strip()
61 | if not metadata_ended:
62 | if line.startswith('## METRICS CLASS'):
63 | metadata_ended = True
64 | continue
65 | if not header:
66 | # process the header line
67 | header = line.split()
68 | else:
69 | # process the data line
70 | data = line.split()
71 | break
72 |
73 | try:
74 | idx = header.index(pattern)
75 | except ValueError:
76 | print("Could not find coverage column {0} in: {1}\n"
77 | "header line is\n{2}".format(pattern, fn, header),
78 | file=sys.stderr)
79 | return False
80 |
81 | if debug:
82 | print("Percentage at {0} is {1}".format(header[idx], data[idx]))
83 | this_percentage = float(data[idx])
84 | if this_percentage < percentage:
85 | if multiple:
86 | print("{0}X\t{1}\t{2}".format(coverage, this_percentage, fn))
87 | else:
88 | print("Too low coverage percentage at {0}X: {1} {2}".format(
89 | coverage, this_percentage, fn))
90 | print("Too low coverage percentage at {0}X: {1} {2}".format(
91 | coverage, this_percentage, fn), file=sys.stderr)
92 | return False
93 | return True
94 |
95 |
96 | def main():
97 | args = parse_args()
98 |
99 | ctp_percentage = float(args.ctp_percentage)
100 | hex_percentage = float(args.hex_percentage)
101 | multiple = len(args.files) > 1
102 | debug = args.debug
103 |
104 | if ctp_percentage > 1.0:
105 | ctp_percentage /= 100.0
106 | if hex_percentage > 1.0:
107 | hex_percentage /= 100.0
108 |
109 | success = True
110 | coverage = ""
111 | percentage = 0.0
112 | for fn in args.files:
113 | if '_HEX' in fn:
114 | coverage = args.hex_coverage_level
115 | percentage = hex_percentage
116 | elif '_CTP' in fn or '_TEX' in fn:
117 | coverage = args.ctp_coverage_level
118 | percentage = ctp_percentage
119 |         else:
120 |             print("Couldn't determine HEX or CTP. Assuming CTP.", file=sys.stderr)
121 |             coverage, percentage = args.ctp_coverage_level, ctp_percentage
122 | success &= process_file(fn, coverage, percentage, multiple, debug)
123 | if not success:
124 | sys.exit(1)
125 |
126 | if __name__ == '__main__':
127 | main()
128 |
--------------------------------------------------------------------------------
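Note: the pass/fail decision in filter_dna_coverage.py comes down to locating one PCT_TARGET_BASES_<N>X column in the two-row metrics section and comparing it to the requested fraction. A minimal sketch with an invented header/data pair:

    header = "BAIT_SET MEAN_TARGET_COVERAGE PCT_TARGET_BASES_100X".split()
    data = "ctp_panel 812.4 0.81".split()

    coverage_level = "100"   # e.g. --ctp-coverage-level
    min_fraction = 0.75      # e.g. -p 75, normalized to a fraction

    idx = header.index("PCT_TARGET_BASES_{0}X".format(coverage_level))
    print("coverage OK" if float(data[idx]) >= min_fraction else "coverage too low")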
/CTP_PDX/bin/filter_for_minimum_depth:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """Flag variant calls without sufficient support.
4 |
5 | This is a very simple script to process VCF files produced by the
6 | micro-indel caller, pindel. Pindel marks all calls as "PASS" in the
7 | filter column. Our requirement is that a variant have at least
8 | 140 reads supporting it; otherwise, we mark it as 'minDP'"""
9 |
10 | import sys, os
11 |
12 | def usage():
13 | print >> sys.stderr, 'USAGE: {0} input-VCF output-VCF'.format(
14 | sys.argv[0])
15 | sys.exit(1)
16 |
17 | def main():
18 | # Check for a version request.
19 | version.parse_options()
20 |
21 | if len(sys.argv) != 3:
22 | usage()
23 |
24 | of = open(sys.argv[2], 'w')
25 | for line in open(sys.argv[1]):
26 | if line[0] == '#':
27 | of.write(line)
28 | continue
29 |
30 | # We're done with the headers. Now we need to analyze each variant
31 | # call. If the total reads supporting the call are less than 140,
32 | # then mark the filter column (column 7, or parts[6]) as 'minDP'.
33 | # The allele read counts are in the last column, parts[9].
34 | parts = line.rstrip().split('\t')
35 | sample_data = parts[9]
36 | read_counts = sample_data.split(':')[1]
37 | ref_count, allele_count = read_counts.split(',')
38 |
39 | depth = int(ref_count) + int(allele_count)
40 | if depth < 140:
41 | parts[6] = 'minDP'
42 | of.write( '\t'.join(parts) + os.linesep)
43 | main()
44 |
--------------------------------------------------------------------------------
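Note: the rule applied by filter_for_minimum_depth is simply ref + alt read support below 140 marking the FILTER column as minDP (when the depth is sufficient the original FILTER value is left untouched). A one-off sketch with an invented sample column:

    sample_col = "0/1:55,40"                    # GT:AD, as pindel writes it
    ref_count, alt_count = sample_col.split(":")[1].split(",")
    flagged = int(ref_count) + int(alt_count) < 140
    print("minDP" if flagged else "keep existing FILTER")   # minDP (55 + 40 = 95)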
/CTP_PDX/bin/filter_trim.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | filter_trim.py
5 |
6 | Script does the following:
7 | 1. filter_quality: The quality value considered high quality when
8 | filtering reads.
9 | 2. filter_percent_bases: The percentage of a reads bases that must
10 | be of high quality to consider the read to be high quality.
11 | 3. trimming_quality: The quality value needed for a base to terminate
12 | trimming.
13 | 4. post_trim_length_pct: The minimum length of a read required post
14 | trimming to retain the read, expressed as a percentage of the
15 | initial read length.
16 | 5. trim_5: Whether to trim bases from the 5' end of the read as well
17 | as the 3' end.
18 | 6. min_pct_hq_reads: The minimum percentage of high quality reads out
19 | of all reads, required for successful exit. Fewer high quality reads
20 | will result in a failure exit status return to the shell, allowing
21 | pipelines using this program to fail. Default: 0.0 (always return
22 | success).
23 | 7. single_end: Use single end mode, even if there are multiple fastq
24 | files presented.
25 |
26 | Trimming occurs per read; trimming does not need to match read 1 vs
27 | read 2 for paired end data. Filtering and acting on the
28 | post_trim_length_min occurs on a per-end basis; however if one read is
29 | discarded due to these criteria, the other end's read must be discarded
30 | as well.
31 |
32 | Inputs:
33 | 1. A sequence of pairs of fastq files from a paired-end sequencing run
34 | or runs. All pairs must be for the same sample.
35 |
36 | Outputs:
37 | 1. One or two filtered, trimmed fastq files. The paired reads may not
38 | be the same length due to trimming.
39 | 2. Statistics file: The pipeline uses several criteria from this
40 | step to determine whether the run was good enough to analyze.
41 | a. Percent HQ reads: Number of reads written to the filtered
42 | file(s) / number of reads in the input file(s).
43 |
44 | We also report the following statistics, but they aren't used to
45 | judge the quality of a run:
46 | b. Total Reads (Reported once, since both ends must be the same.)
47 | c. Total HQ reads (Reported once, since both ends must be the
48 | same.)
49 | d. Min, Max and Mean trimmed read lengths for reads whose trimmed
50 | length is sufficient to retain the read, reported separately for
51 | each end.
52 |
53 | All output file naming is based on the first file in each fastq file
54 | list; the following input fastqs are not represented in the output file
55 | names.
56 |
57 | Exit status:
58 | 0 = Success
59 | non-0 = Failure:
60 | 1) Could not open an input file or create an output file
61 | 2) Python 2.7 or higher not used
62 | 3) Insufficient high quality reads
63 | 4) Odd number of fastq files when in paired mode
64 | 5) Input files in a pair are not the same length
65 | 6) Zero total reads, percent of high quality reads not computed
66 | """
67 | __author__ = 'simons'
68 |
69 | import sys
70 | import os
71 | import math
72 | import gzip
73 | import datetime
74 | import inspect
75 |
76 | # In Python 2.7, the core bz2 module can't process multi-stream files, such
77 | # as those produced by pbzip2. In Python 3.4 and above, it can. The
78 | # Python 3 version has been backported to Python 2.7, and is available
79 | # as bz2file. Conditionally load it to handle a wider array of files;
80 | # fall back to the core bz2 module.
81 | try:
82 | import bz2file as bz2
83 | except:
84 | print >> sys.stderr, 'Could not import bz2file; using the core bz2.'
85 | import bz2
86 |
87 | # Support the version command.
88 | cmd_folder = os.path.realpath(os.path.abspath(os.path.split(
89 | inspect.getfile(inspect.currentframe()))[0]))
90 | lib_folder = os.path.join(cmd_folder, '../lib')
91 | if lib_folder not in sys.path:
92 | sys.path.insert(0, lib_folder)
93 |
94 | try:
95 | import argparse
96 | except:
97 | print >> sys.stderr, 'This program requires python V2.7 or higher.'
98 | sys.exit(2)
99 |
100 |
101 | # The guts of this program. All processing of reads.
102 | class FastqRead(object):
103 | trim_5 = False
104 | trim_hq = 30
105 | read_hq = 30
106 | pct_hq = 0.7
107 |
108 | def __init__(self, fastqs, odir=None, suffix='_filtered_trimmed'):
109 | ofn = os.path.split(fastqs[0])[1] + suffix
110 | if odir:
111 | ofn_path = os.path.join(odir, ofn)
112 | else:
113 | ofn_path = ofn
114 | try:
115 | self.of = open(ofn_path, 'w')
116 | except IOError:
117 | print >> sys.stderr, \
118 | 'Could not open "{0}". Exiting.'.format(ofn_path)
119 | sys.exit(1)
120 |
121 | self.fastqs = fastqs
122 | self.fn = None
123 | self.f = None
124 | self.total_reads = 0
125 | self.hq_reads = 0
126 | self.output_reads = 0
127 | self.min_trimmed_length = sys.maxint
128 | self.max_trimmed_length = -1
129 | self.total_trimmed_length = 0
130 | self.trimmed_reads = 0
131 | self.name = ''
132 | self.bases = ''
133 | self.plus = ''
134 | self.qual = ''
135 | self.timestamp = False
136 | self.line_count = 0
137 |
138 | # Initialize our first input fastq
139 | self.next_file()
140 |
141 | def next_file(self):
142 | #Do we have any files left?
143 | if not self.fastqs:
144 | return False
145 |
146 | self.fn = self.fastqs.pop(0)
147 | try:
148 | self.f = FastqRead.open(self.fn)
149 | except IOError:
150 | print >> sys.stderr, \
151 | 'Could not open "{0}". Exiting.'.format(self.fn)
152 | sys.exit(1)
153 | return True
154 |
155 | def get_filename(self):
156 | return self.fn
157 |
158 | @staticmethod
159 | def open(name):
160 | """
161 | Intended to be private to the class...
162 |
163 | A flexible open routine that can handle plain text files or
164 | files compressed with gzip or bzip2. Only used for the
165 | input files. Output files are emitted uncompressed, until the
166 | tools in the next leg of the pipeline can work properly with
167 | compressed files.
168 |
169 | :param name: The filename to open.
170 | :return: A file object for the named file.
171 | """
172 | if name.endswith('.gz'):
173 | f = gzip.open(name)
174 | elif name.endswith('.bz2'):
175 | f = bz2.BZ2File(name)
176 | else:
177 | f = open(name)
178 | return f
179 |
180 | def stats(self):
181 | s = {}
182 | s['total_reads'] = self.total_reads
183 | s['hq_reads'] = self.hq_reads
184 | s['output_reads'] = self.output_reads
185 | s['max_trimmed_length'] = self.max_trimmed_length
186 | s['min_trimmed_length'] = self.min_trimmed_length
187 | try:
188 | tmp_mean = float(self.total_trimmed_length) / \
189 | float(self.trimmed_reads)
190 | s['mean_trimmed_length'] = '{0:.2f}'.format(tmp_mean)
191 | except ZeroDivisionError:
192 | s['mean_trimmed_length'] = 'N/A'
193 |
194 | return s
195 |
196 | def next(self):
197 | """
198 |
199 | :return: True iff the read was successfully retrieved from
200 | the file.
201 | """
202 | name = self.f.readline()
203 | # Test whether we had a successful read.
204 | # Will be zero length if EOF reached.
205 | if not name:
206 | return False
207 | self.name = name.strip()
208 | self.bases = self.f.readline().strip()
209 | self.plus = self.f.readline().strip()
210 | self.qual = self.f.readline().strip()
211 |
212 | # All four lines must have content to be a valid read.
213 | if len(self.bases) == 0 or \
214 | len(self.plus) == 0 or \
215 | len(self.qual) == 0:
216 | print >> sys.stderr, 'NAME:', self.name
217 | print >> sys.stderr, 'BASES:', self.bases
218 | print >> sys.stderr, 'PLUS:', self.plus
219 | print >> sys.stderr, 'QUAL:', self.qual
220 | raise ValueError('Incomplete read found in file {0}'.
221 | format(self.fn))
222 |
223 | self.total_reads += 1
224 | if self.timestamp:
225 | self.line_count += 1
226 | if self.line_count % 1000000 == 0:
227 | print >> sys.stderr, \
228 | datetime.datetime.strftime(datetime.datetime.now(),
229 | '%H:%M:%S'), \
230 | self.line_count
231 | return True
232 |
233 | def do_timestamp(self):
234 | self.timestamp = True
235 |
236 | @staticmethod
237 | def set_criteria(pct_hq=0.7,
238 | read_hq=30,
239 | trim_hq=30,
240 | trim_5=False,
241 | min_pct=0.7,
242 | min_pct_hq_reads=0.0):
243 |
244 | FastqRead.pct_hq = float(pct_hq)
245 | if FastqRead.pct_hq > 1.0:
246 | FastqRead.pct_hq /= 100.0
247 |
248 | # Use phred33 quality scoring
249 | FastqRead.read_hq = chr(int(read_hq) + 33)
250 | FastqRead.trim_hq = chr(int(trim_hq) + 33)
251 | FastqRead.trim_5 = trim_5 # Passed in as boolean
252 | FastqRead.min_pct = float(min_pct)
253 | if FastqRead.min_pct > 1.0:
254 | FastqRead.min_pct /= 100.0
255 | FastqRead.min_pct_hq_reads = float(min_pct_hq_reads)
256 | if FastqRead.min_pct_hq_reads > 1.0:
257 | FastqRead.min_pct_hq_reads /= 100.0
258 |
259 | # Cache the minimum length of a trimmed read.
260 | min_len = None
261 |
262 | def trim(self):
263 | """
264 |
265 | :return: True if the read is long enough after trimming.
266 | """
267 |
268 | original_length = len(self.qual)
269 | if FastqRead.trim_5:
270 | for p5 in range(original_length):
271 | if self.qual[p5] >= FastqRead.trim_hq:
272 | break
273 | else:
274 | p5 = 0
275 |
276 | for p3 in range(original_length - 1, -1, -1):
277 | if self.qual[p3] >= FastqRead.trim_hq:
278 | break
279 |
280 | tlg = (p3 - p5) + 1 # Length after trimming.
281 |
282 | if FastqRead.min_len is None:
283 | FastqRead.min_len = \
284 | math.ceil(FastqRead.min_pct * original_length)
285 | if tlg < FastqRead.min_len:
286 | return False
287 |
288 | self.bases = self.bases[p5:p3 + 1]
289 | self.qual = self.qual[p5:p3 + 1]
290 |
291 | assert tlg == len(self.qual), "Length calculation is broken"
292 |
293 | # Track our trimmed length stats
294 | if tlg > self.max_trimmed_length:
295 | self.max_trimmed_length = tlg
296 | if tlg < self.min_trimmed_length:
297 | self.min_trimmed_length = tlg
298 | self.total_trimmed_length += tlg # To compute the mean
299 | self.trimmed_reads += 1
300 | return True
301 |
302 | def filter(self):
303 | """
304 |
305 | :return: True if the read passed HQ filtering criteria
306 | """
307 | lg = len(self.qual)
308 | lq_reads_allowed = math.floor(float(lg) *
309 | (1.0 - FastqRead.pct_hq))
310 | lq_reads = 0
311 | for n in range(lg):
312 | if self.qual[n] < FastqRead.read_hq:
313 | lq_reads += 1
314 | if lq_reads > lq_reads_allowed:
315 | return False
316 | self.hq_reads += 1
317 | return True
318 |
319 | def write(self):
320 | print >> self.of, self.name
321 | print >> self.of, self.bases
322 | print >> self.of, self.plus
323 | print >> self.of, self.qual
324 | self.output_reads += 1
325 |
326 | def close(self):
327 | self.of.close()
328 |
329 | # End of class FastqRead.
330 |
331 |
332 | def parse_args():
333 |
334 | parser = argparse.ArgumentParser(description=
335 | "Perform filtering and trimming of paired end fastq "
336 | "files", version="2.0")
337 | parser.add_argument("-p", "--hq_pct", default="70", help=
338 | "Percentage of bases that must be high quality [70]")
339 | parser.add_argument('-f', '--filter_hq', default="30", help=
340 | 'Numeric quality value to pass filtering [30]')
341 | parser.add_argument('-t', '--trim_hq', default="30", help=
342 | 'Numeric quality value to not be trimmed [30]')
343 | parser.add_argument('-m', '--min_len_pct', default="70", help=
344 | 'Minimum read length after trimming to '
345 | 'retain read. (percentage 0-100) [70]')
346 | parser.add_argument('-M', '--min_pct_hq_reads', default=0, help=
347 | 'Minimum percentage of reads classified as High '
348 | 'Quality reads (percentage 0-100) [0]')
349 | parser.add_argument('-5', '--trim_5', action="store_true", help=
350 | "Trim 5' end as well as 3' [False]")
351 | parser.add_argument('-s', '--suffix', default='_filtered_trimmed',
352 | help='Suffix to construct the output file name '
353 | '[_filtered_trimmed]')
354 | parser.add_argument('-S', '--single_end', action="store_true",
355 | help="Use single end mode with multiple fastq files " \
356 | "[False]")
357 | parser.add_argument('-d', '--directory', dest='odir', default='.',
358 | help=
359 | 'Directory in which to write the output files '
360 | '[current directory]')
361 | parser.add_argument('-i', '--timestamp', action='store_true', help=
362 | 'Emit a timestamp every 1,000,000 reads [False]')
363 | parser.add_argument("fastqs", nargs="+")
364 | args = parser.parse_args()
365 | return args
366 |
367 |
368 | def output_stats_single(r1, args, start_time):
369 | """
370 | Report the statistics for a single end run.
371 |
372 | NOTE WELL!!!
373 | This routine and output_stats_paired have the same logic
374 | flow. If one is changed, the other almost certainly has to
375 | have the corresponding change made.
376 |
377 | YOU HAVE BEEN WARNED!
378 |
379 | :param r1: Accumulated info for the reads.
380 | :param args: Our command line arguments
381 | :param start_time: The run's start time
382 | :return: None
383 | """
384 |
385 | # Here we have completely processed the input file. Write out
386 | # the statistics
387 | r1_stats = r1.stats()
388 | bn_fq1 = os.path.split(args.fastqs[0])[1]
389 |
390 | with open(os.path.join(args.odir, bn_fq1 + '_stat'), 'w') as sf:
391 | print >> sf, 'Input file(s):'
392 | print >> sf, 'Read 1: {0}'.format(args.fastqs)
393 | print >> sf, 'QC statistics'
394 | print >> sf, 'Statistic\tRead 1'
395 |
396 | try:
397 | f_pct_hq = float(r1_stats['output_reads']) / \
398 | float(r1_stats['total_reads'])
399 | pct_hq = '{0:.2%}'.format(f_pct_hq)
400 | except ZeroDivisionError:
401 | pct_hq = 'N/A'
402 | print >> sf, 'Percentage of HQ reads\t{0}'.format(pct_hq)
403 |
404 | print >> sf, 'Total number of reads\t{0}'.format(
405 | r1_stats['total_reads'])
406 | print >> sf, 'Total number of HQ filtered reads\t{0}'.\
407 | format(r1_stats['output_reads'])
408 | print >> sf, 'Detailed QC statistics'
409 | print >> sf, 'Reads passing filter\t{0}'.\
410 | format(r1_stats['hq_reads'])
411 |
412 | try:
413 | pct_rpf = '{0:.2%}'.format(float(r1_stats['hq_reads']) /
414 | float(r1_stats['total_reads']))
415 | except ZeroDivisionError:
416 | pct_rpf = 'N/A'
417 | print >> sf, 'Percent reads passing filter\t{0}'.format(pct_rpf)
418 |
419 | print >> sf, 'Max Trimmed Length\t{0}'.\
420 | format(r1_stats['max_trimmed_length'])
421 | print >> sf, 'Min Trimmed Length\t{0}'.\
422 | format(r1_stats['min_trimmed_length'])
423 | print >> sf, 'Mean Trimmed Length\t{0}'.\
424 | format(r1_stats['mean_trimmed_length'])
425 | print >> sf, 'Run start time\t{0}'.\
426 | format(datetime.datetime.strftime(start_time, '%H:%M:%S'))
427 | end_time = datetime.datetime.now()
428 | print >> sf, 'Run end time\t{0}'.\
429 | format(datetime.datetime.strftime(end_time, '%H:%M:%S'))
430 |
431 | if r1_stats['total_reads'] == 0:
432 | # This will be the same as sys.exit(6)
433 | print >> sys.stderr, 'Failure: total reads == 0\nExiting ' \
434 | 'with status 6'
435 | return 6
436 | if f_pct_hq < FastqRead.min_pct_hq_reads:
437 | # This will be the same effect as sys.exit(3)
438 | print >> sys.stderr, 'Failure: not enough high quality ' \
439 | 'read percent: {} required: {}\n' \
440 | 'Exiting with status 3'.\
441 | format(f_pct_hq, FastqRead.min_pct_hq_reads)
442 | return 3
443 | # Success!
444 | return 0
445 |
446 | def output_stats_paired(r1, r2, args, start_time):
447 | """
448 | Report the statistics for a paired end run.
449 |
450 | NOTE WELL!!!
451 | This routine and output_stats_single have the same logic
452 | flow. If one is changed, the other almost certainly has to
453 | have the corresponding change made.
454 |
455 | YOU HAVE BEEN WARNED!
456 |
457 | :param r1: Accumulated info for the end 1 reads.
458 | :param r2: Accumulated info for the end 2 reads.
459 | :param args: Our command line arguments
460 | :param start_time: The run's start time
461 | :return: None
462 | """
463 |
464 | # Here we have completely processed both input files. Write out
465 | # the statistics
466 | r1_stats = r1.stats()
467 | r2_stats = r2.stats()
468 |
469 | bn_fq1 = os.path.split(args.fastqs[0])[1]
470 | bn_fq2 = os.path.split(args.fastqs[1])[1]
471 |
472 | with open(os.path.join(args.odir,
473 | '{0}_{1}_stat'.format(bn_fq1, bn_fq2)), 'w') as sf:
474 | print >> sf, 'Input files:'
475 | print >> sf, 'Read 1: {0}'.format(args.fastqs[::2])
476 | print >> sf, 'Read 2: {0}'.format(args.fastqs[1::2])
477 | print >> sf, 'QC statistics'
478 | print >> sf, 'Statistic\tRead 1\tRead 2'
479 |
480 | try:
481 | f_pct_hq1 = float(r1_stats['output_reads']) / \
482 | float(r1_stats['total_reads'])
483 | pct_hq1 = '{0:.2%}'.format(f_pct_hq1)
484 | except ZeroDivisionError:
485 | pct_hq1 = 'N/A'
486 | try:
487 | f_pct_hq2 = float(r2_stats['output_reads']) / \
488 | float(r2_stats['total_reads'])
489 | pct_hq2 = '{0:.2%}'.format(f_pct_hq2)
490 | except ZeroDivisionError:
491 | pct_hq2 = 'N/A'
492 | print >> sf, 'Percentage of HQ reads\t{0}\t{1}'.\
493 | format(pct_hq1, pct_hq2)
494 |
495 | print >> sf, 'Total number of reads\t{0}\t{1}'.format(
496 | r1_stats['total_reads'],
497 | r2_stats['total_reads'])
498 | print >> sf, 'Total number of HQ filtered reads\t{0}\t{1}'.\
499 | format(r1_stats['output_reads'], r2_stats['output_reads'])
500 | print >> sf, 'Detailed QC statistics'
501 | print >> sf, 'Reads passing filter\t{0}\t{1}'.\
502 | format(r1_stats['hq_reads'], r2_stats['hq_reads'])
503 |
504 | try:
505 | pct_rpf1 = '{0:.2%}'.format(float(r1_stats['hq_reads']) /
506 | float(r1_stats['total_reads']))
507 | except ZeroDivisionError:
508 | pct_rpf1 = 'N/A'
509 | try:
510 | pct_rpf2 = '{0:.2%}'.format(float(r2_stats['hq_reads']) /
511 | float(r2_stats['total_reads']))
512 | except ZeroDivisionError:
513 | pct_rpf2 = 'N/A'
514 | print >> sf, 'Percent reads passing filter\t{0}\t{1}'.\
515 | format(pct_rpf1, pct_rpf2)
516 |
517 | print >> sf, 'Max Trimmed Length\t{0}\t{1}'.\
518 | format(r1_stats['max_trimmed_length'],
519 | r2_stats['max_trimmed_length'])
520 | print >> sf, 'Min Trimmed Length\t{0}\t{1}'.\
521 | format(r1_stats['min_trimmed_length'],
522 | r2_stats['min_trimmed_length'])
523 | print >> sf, 'Mean Trimmed Length\t{0}\t{1}'.\
524 | format(r1_stats['mean_trimmed_length'],
525 | r2_stats['mean_trimmed_length'])
526 | print >> sf, 'Run start time\t{0}'.\
527 | format(datetime.datetime.strftime(start_time, '%H:%M:%S'))
528 | end_time = datetime.datetime.now()
529 | print >> sf, 'Run end time\t{0}'.\
530 | format(datetime.datetime.strftime(end_time, '%H:%M:%S'))
531 |
532 | if r1_stats['total_reads'] == 0 or r2_stats['total_reads'] == 0:
533 | # This will be the same as sys.exit(6)
534 | print >> sys.stderr, 'Failure: total reads == 0\nExiting ' \
535 | 'with status 6'
536 | return 6
537 | if f_pct_hq1 < FastqRead.min_pct_hq_reads or \
538 | f_pct_hq2 < FastqRead.min_pct_hq_reads:
539 | # This will be the same effect as sys.exit(3)
540 | print >> sys.stderr, 'Failure: not enough high quality ' \
541 | 'read percent: e1: {}, e2: {} ' \
542 | 'required: {}\n' \
543 | 'Exiting with status 3'.\
544 | format(f_pct_hq1, f_pct_hq2, FastqRead.min_pct_hq_reads)
545 | return 3
546 | # Success!
547 | return 0
548 |
549 | def main():
550 | start_time = datetime.datetime.now()
551 | args = parse_args()
552 |
553 | # If we are doing paired end processing, make sure that we have
554 | # pairs (i.e., an even number of files, and split the list of
555 | # files into end-specific lists.
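# For example (hypothetical file names), fastqs given on the command line
# as A_R1.fq A_R2.fq B_R1.fq B_R2.fq are split below into
# e1_fastqs = [A_R1.fq, B_R1.fq] and e2_fastqs = [A_R2.fq, B_R2.fq].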
556 | num_fastqs = len(args.fastqs)
557 | paired_end = ((num_fastqs != 1) and (not args.single_end))
558 |
559 | if paired_end:
560 | # Paired end; need to be an even number of fastqs.
561 | if num_fastqs % 2 != 0:
562 | print >> sys.stderr, 'Odd number of fastq files ({0}) in ' \
563 | 'paired-end mode. Exiting...'.format(
564 | num_fastqs
565 | )
566 | sys.exit(4)
567 |
568 | # Now split the lists:
569 | e1_fastqs = args.fastqs[::2]
570 | e2_fastqs = args.fastqs[1::2]
571 | else:
572 | # Make a copy. We need the original later.
573 | e1_fastqs = args.fastqs[:]
574 | e2_fastqs = None
575 |
576 | r1 = FastqRead(e1_fastqs, args.odir, args.suffix)
577 |
578 | # We may be processing single end reads. Everything with r2 is
579 | # conditional on having a second fastq.
580 | if paired_end:
581 | r2 = FastqRead(e2_fastqs, args.odir, args.suffix)
582 |
583 | # Check if we want timestamps output to track progress
584 | if args.timestamp:
585 | r1.do_timestamp()
586 |
587 | # The criteria are class members, not instance.
588 | FastqRead.set_criteria(args.hq_pct, args.filter_hq, args.trim_hq,
589 | args.trim_5, args.min_len_pct,
590 | args.min_pct_hq_reads)
591 |
592 | r1_ok = False
593 |
594 | # If we don't have paired end reads, we just want the tests
595 | # below to care about end 1. In this case, initialize R2_ok to
596 | # True
597 | r2_ok = not paired_end
598 |
599 | # Loop over the whole file. We'll exit this with a break.
600 | while True:
601 | # Do NOT move these into the if statement below; we need to
602 | # keep them in sync. If they are in the if, and r1 fails,
603 | # r2 will not be executed.
604 | r1_ok = r1.next()
605 | if paired_end:
606 | r2_ok = r2.next()
607 | if not (r1_ok and r2_ok):
608 | # One or both files are exhausted. Must both end at the
609 | # same read.
610 | if r1_ok or (paired_end and r2_ok):
611 | print >> sys.stderr, \
612 | 'Input files {0} and {1} are different lengths.\n' \
613 | 'Exiting.'.format(
614 | r1.get_filename(),
615 | r2.get_filename())
616 | sys.exit(5)
617 | # Get the next files in the list to continue processing.
618 | # Since we ensured above that the lists
619 | # were the same length, we don't need to do equivalency
620 | # tests here. We can simply test for list exhaustion on r1
621 | # which works for both single and paired end. If it
622 | # succeeds and we're paired end, we can blindly get the next
623 | # end 2 file.
624 | r1_ok = r1.next_file()
625 | if not r1_ok:
626 | # We've exhausted the list of input files.
627 | break
628 | if paired_end:
629 | # Guaranteed to succeed: lists are equal length.
630 | r2.next_file()
631 |
632 | # Back to the top to get a read from the new files
633 | continue
634 |
635 | r1_ok = r1.filter()
636 | if paired_end:
637 | r2_ok = r2.filter()
638 | if not (r1_ok and r2_ok):
639 | # Filtering this read failed... Next!
640 | continue
641 |
642 | r1_ok = r1.trim()
643 | if paired_end:
644 | r2_ok = r2.trim()
645 | if not (r1_ok and r2_ok):
646 | # This read trimmed to be too short.
647 | continue
648 |
649 | r1.write()
650 | if paired_end:
651 | r2.write()
652 |
653 |
654 | if paired_end:
655 | status = output_stats_paired(r1, r2, args, start_time)
656 | r2.close()
657 | else:
658 | status = output_stats_single(r1, args, start_time)
659 | r1.close()
660 |
661 | return status
662 |
663 | if __name__ == '__main__':
664 | status = main()
665 | sys.exit(status)
666 |
--------------------------------------------------------------------------------
/CTP_PDX/bin/read_group_from_fastq.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | read_group_from_fastq.py
5 |
6 | Using a fastq file's name and the contents of its first line,
7 | build the option string needed for bwa to mark every read, assuming Illumina
8 | Casava 1.8 conventions.
9 |
10 | Input: the fastq file specified as argv[1], the first command line argument.
11 | Handles compressed or uncompressed fastqs.
12 | Output: the second command line argument, if specified, else, sys.stdout.
13 |
14 | Notes:
15 | We will usually be handling standard Illumina Casava 1.8+ output, which
16 | has a regular file naming format and read name format. If any of the
17 | steps here fail, cause the pipeline to fail rather than producing
18 | untraceable output.
19 | """
20 |
21 | import sys
22 | import os
23 | import re
24 | import time
25 | import gzip
26 | import argparse
27 | #import cga_version
28 | try:
29 | import bz2file as bz2
30 | except ImportError:
31 | import bz2
32 |
33 |
34 | def parse_args():
35 | parser = argparse.ArgumentParser(version='V2.0')
36 | parser.add_argument('-p', '--picard', action='store_true',
37 | help="Use Picard format for read group line")
38 | parser.add_argument('-t', '--tumor', action='store_true',
39 | help="Sample is tumor in a tumor/normal pair")
40 | parser.add_argument('-n', '--normal', action='store_true',
41 | help="Sample is normal in a tumor/normal pair")
42 | parser.add_argument('fastq',
43 | help="Path to fastq file for sample")
44 | parser.add_argument('output', nargs='?',
45 | help="Output file name [STDOUT]")
46 |
47 | args = parser.parse_args()
48 |
49 | if args.tumor:
50 | if args.normal:
51 | # Check for a conflict.
52 | parser.error("Must not specify both --tumor and --normal.")
53 | args.sample_type = "Tumor_"
54 | elif args.normal:
55 | args.sample_type = "Normal_"
56 | else:
57 | args.sample_type = ""
58 |
59 | return args
60 |
61 |
62 | def multi_open(name):
63 | if name.endswith('.gz'):
64 | f = gzip.open(name)
65 | elif name.endswith('.bz2'):
66 | f = bz2.BZ2File(name)
67 | else:
68 | f = open(name)
69 | return f
70 |
71 |
72 | def make_fake(args):
73 | """
74 | If we can't get adequate data from the file, use timestamps.
75 | :return:
76 | """
77 | # Sleep for 2 seconds, to make sure that a previous invocation
78 | # will have a different time stamp.
79 | time.sleep(2)
80 |
81 | ts = time.strftime('%H%M%S')
82 |
83 | id = 'ID_' + ts
84 | lb = 'LIB_' + ts
85 | sm = 'SAMPLE_' + ts
86 | bc = 'RUN_' + ts
87 | output(id, lb, sm, bc, args)
88 | sys.exit(0)
89 |
90 |
91 | def main():
92 | #cga_version.parse_options()
93 |
94 | args = parse_args()
95 |
96 | # First get the info from the filename
97 | fn = os.path.split(args.fastq)[1]
98 |
99 | if 'fastq' not in fn and 'fq' not in fn:
100 | print >> sys.stderr, "Not seemingly a fastq file:", fn
101 | make_fake(args)
102 | # Does not return...
103 |
104 | # Now split the basename portion into its constituent parts.
105 | fn_parts = fn.split('_')
106 |
107 | # Scan for the "GES" starting a filename part. If found,
108 | # That separates the Sample name portion from the Library name.
109 | # If GES is not found starting a part, use the whole filename
110 | # as both the Sample name and the Library name.
111 | # Maybe redo this with regular expressions, but for now, it works.
112 | pos = -1
113 | for n in range(len(fn_parts)):
114 | if fn_parts[n].startswith("GES"):
115 | pos = n
116 | break
117 | if pos == -1:
118 | # Didn't find the GES marker. Use the filename up to the end name.
119 | match = re.search('(.*)[._]R[12]_.*',fn)
120 | if match is not None:
121 | fn = match.group(1)
122 | else:
123 | # something is seriously odd here, but we'll just use the
124 | # whole filename
125 | pass
126 |
127 | cust_id = ges_id = fn
128 | else:
129 | cust_id = '_'.join(fn_parts[:pos])
130 | ges_parts = fn_parts[pos:]
131 | pos = 999 # Way bigger than the number of parts we'll see.
132 | for n in range(len(ges_parts)):
133 | if ges_parts[n] == 'R1' or ges_parts[n] == 'R2':
134 | pos = n
135 | break
136 | ges_id = '_'.join(ges_parts[:pos])
137 |
138 | # Sanity check that we have some amount of text for our fields. The
139 | # downstream tools can't tolerate empty fields in the read group
140 | # information.
141 | if not ges_id:
142 | ges_id = fn
143 |
144 | if not cust_id:
145 | cust_id = ges_id
146 |
147 | # Now the parts from the first readname--the first line of the file.
148 | # When split on ':', the readname contains
149 | # - the ID in the first four fields.
150 | # Note: the leading '@' needs to be stripped.
151 | try:
152 | inf = multi_open(args.fastq)
153 | line = inf.readline()
154 | except IOError, e:
155 | print >> sys.stderr, "Couldn't read the file: {0}\n {1}". \
156 | format(fn, e.message)
157 | make_fake(args)
158 | # Does not return
159 |
160 | # Example line:
161 | # @HISEQ2000:190:D19U8ACXX:5:1101:1492:1901 1:N:0:TAGCTT
162 | parts = line[1:].strip().split(' ')
163 | read_name = parts[0]
164 |
165 | # Example read_name: HISEQ2000:190:D19U8ACXX:5:1101:1492:1901
166 | rparts = read_name.split(':')
167 | if len(rparts) >= 4:
168 | rparts = rparts[:4]
169 |
170 | # Try to add the bar code in:
171 | bar_code = "no_barcode"
172 | if len(parts) >= 2:
173 | # Example comment: 1:N:0:TAGCTT
174 | comment = parts[1]
175 | cparts = comment.split(':')
176 | if len(cparts) == 4:
177 | bar_code = cparts[3]
178 | rparts.append(bar_code)
179 |
180 | id = ':'.join(rparts)
181 | # Example id: HISEQ2000:190:D19U8ACXX:5:TAGCTT
182 |
183 | output(id, ges_id, cust_id, bar_code, args)
184 |
185 | def output(id, ges_id, cust_id, bar_code, args):
186 | if args.output is not None:
187 | of = open(args.output, 'w')
188 | else:
189 | of = sys.stdout
190 |
191 | if args.picard:
192 | line = 'RGID={0}{1} RGLB={0}{2} ' \
193 | 'RGPL=ILLUMINA RGSM={3} RGPU={4}'.\
194 | format(args.sample_type, id, ges_id, cust_id, bar_code)
195 | else:
196 | line = '@RG\\tID:{0}{1}\\tLB:{0}{2}\\tSM:{3}\\tPL:ILLUMINA'.\
197 | format(args.sample_type, id, ges_id, cust_id)
198 | # This needs to be a single line file; no terminating \n
199 | print >> of, line,
200 | if of != sys.stdout:
201 | of.close()
202 |
203 | if __name__ == '__main__':
204 | main()
--------------------------------------------------------------------------------
/CTP_PDX/bwa_mem.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 | First process a fastq file to extract read group information with
7 | read_group_from_fastq.py. Then align a pair of fastq files.
8 | Any quality checking and trimming must be done before this step.
9 |
10 |
19 |
20 |
21 | bwakit/0.7.15
22 | samtools/0.1.18
23 | python/2.7.3
24 |
25 |
26 |
27 |
32 |
33 |
36 |
37 | read_group_from_fastq.py --version
38 | {in_1} {read_group}
39 |
40 |
41 |
46 |
47 |
48 |
49 |
51 |
53 |
55 |
57 |
58 |
59 |
61 |
63 |
65 |
66 |
68 |
69 |
71 |
72 |
73 |
74 |
75 |
80 |
82 |
84 |
86 |
88 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 | hg38_bwa_alt_aware_indices/run-bwamem -t 12 {rg} -o out -H {bwa7_idx_pfx} {in_1} {in_2} | sh
105 |
106 |
107 |
108 |
109 |
--------------------------------------------------------------------------------
/CTP_PDX/config_file_SingleSample_PDX_Panel:
--------------------------------------------------------------------------------
1 | VaPrePro2.targetfilegatk=targets_sorted_gatk_hg38.bed
2 | HSmetrics.targetfilepicard=targets_sorted_gatk_hg38_picard.bed
3 | HSmetrics.baitfilepicard=targets_gatk_hg38_picard.bed
4 | UnifiedGenotyper.sample_ploidy=4
5 | UnifiedGenotyper.targetfilegatk=targets_sorted_gatk_hg38.bed
6 | pindelmicroindel.targetfilepindel=targets_sorted_gatk_hg38.bed
7 | gatkcoverage.targetfile=hg38_noheader_withNames.bed
8 |
--------------------------------------------------------------------------------
/CTP_PDX/gatkcoveragestats.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 |
5 |
15 |
16 |
17 |
18 |
19 |
20 | java/1.7.0
21 | GATK/3.4-0
22 | bedtools/2.17.0
23 | python/2.7.3
24 |
25 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | java -jar {gatk} --help | grep "(GATK)"
34 | -Djava.io.tmpdir=$TMPDIR -Xmx12g -jar {gatk} -T DepthOfCoverage -R {in_2} --outputFormat table -o {out_1} -I {in_1} -L {targetfile} --omitPerSampleStats --omitIntervalStatistics --omitLocusTable
35 |
36 |
37 |
38 |
39 | {out_1} {out_2} {out_3} {targetfile}
40 |
41 |
42 |
43 |
44 |
45 | coveragecalculator.py --version
46 |
47 |
48 | {out_3} {out_4}
49 |
50 |
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/CTP_PDX/microIndel_calling.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | Call micro INDELs from our pre-processed BAM file.
5 |
6 |
16 |
17 |
19 |
20 |
22 |
24 |
26 |
28 |
30 |
32 |
34 |
36 |
38 |
40 |
42 |
44 |
46 |
48 |
50 |
51 |
52 | samtools/0.1.19
53 | pindel/0.2.5a3
54 | bedtools/2.17.0
55 | java/1.7.0
56 | python/2.7.3
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 | echo -n "samtools: "; samtools 2>&1 | grep Version
67 | index {in_1}
68 |
69 |
70 |
71 |
72 | sn=`basename {in_2}`
73 |
74 |
75 |
76 | -e "{in_1}\t350\t$sn" > {pindel_config}
77 |
78 |
79 |
80 | pindel --version | grep ^Pindel
81 | --config-file {pindel_config}
82 | --fasta {in_3}
83 | -o {in_5}/$sn
84 |
85 |
86 |
87 | |in_5|/${sn}_D |in_5|/${sn}_SI > |in_5|/${sn}_DSI
88 |
89 |
90 |
91 | pindel2vcf --help | grep ^Version
92 | -p |in_5|/${sn}_DSI
93 | -r |in_3| -R hg38 -d 20150925
94 | --max_size 50
95 | --vcf |vcf1| -G --het_cutoff 0.05
96 |
97 |
98 |
99 | bedtools --version
100 | intersect -header -a {vcf1} -b {targetfilepindel} -f 1.0 > {out_1}
101 |
102 |
103 |
104 | filter_for_minimum_depth --version
105 | {out_1} {out_2}
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 | |in_5|/${sn}_BP |in_5|/${sn}_D |in_5|/${sn}_DSI |in_5|/${sn}_INT |in_5|/${sn}_INT_final |in_5|/${sn}_INV |in_5|/${sn}_LI |in_5|/${sn}_RP |in_5|/${sn}_SI |in_5|/${sn}_TD
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/CTP_PDX/qual_statistics.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Checks paired end fastq files for overall quality, and performs
5 | some statistical analysis.
6 | Terminates the run if fewer than 50% of the reads are HQ.
7 |
8 |
9 |
24 |
25 |
26 | python/2.7.3
27 |
28 |
29 | filter_trim.py --version
30 | -M 50 -d {in_3} {in_1} {in_2}
31 |
32 |
33 |
--------------------------------------------------------------------------------
/CTP_PDX/removeFiles.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Remove Files
4 |
5 |
6 |
7 |
8 |
9 | rm raw_gatk.vcf.idx raw_indels.vcf.idx raw_snps.vcf.idx filtered_indels.vcf.idx filtered_snps.vcf.idx *.bai out.hla.top out.log.bwamem out.log.hla out.hla.all
10 |
11 |
12 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_annotation.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | Annotate variants with snpEff and SnpSift.
5 |
6 |
15 |
16 |
17 |
18 |
20 |
21 |
23 |
25 |
26 |
28 |
29 |
30 |
32 |
33 |
35 |
36 |
38 |
39 |
40 |
42 |
43 |
45 |
46 |
48 |
49 |
51 |
52 |
54 |
55 |
57 |
58 |
59 | java/1.8.0
60 | perl/cga
61 | vcf-tools/0.1.12a
62 | python/2.7.3
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
75 | vcftools | grep VCFtools
76 | {in_1} {in_2} | vcf-sort > {vcf1}
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
88 | java -jar {snpSift} 2>&1 | grep version
89 | -jar {snpSift} filter --addFilter "lowAF" --rmFilter "PASS" 'ALT_AF[ANY] < 5' {vcf1} > {vcf2}
90 |
91 |
92 |
93 |
94 | java -jar {snpEff} 2>&1 | grep version
95 | -jar {snpEff} eff -v -lof -onlyTr {transcript_list} -hgvs GRCh38.84
96 | -noStats {vcf2} > {vcf3}
97 |
98 |
99 |
100 |
101 |
102 | java -jar {snpSift_4} 2>&1 | grep version
103 | -jar {snpSift_4} dbnsfp -v -db {dbNSFP} -noDownload
104 | -f SIFT_score,SIFT_pred,Polyphen2_HDIV_score,MutationAssessor_score,phyloP100way_vertebrate,1000Gp3_AF,1000Gp3_AFR_AF,1000Gp3_EUR_AF,1000Gp3_AMR_AF,1000Gp3_EAS_AF,ESP6500_AA_AF,ESP6500_EA_AF,ExAC_AC,ExAC_AF
105 | {vcf3} > {vcf4}
106 |
107 |
108 |
109 |
110 |
111 |
115 | java -jar {snpEff} 2>&1 | grep version
116 | {vcf4} | {snpSift_onePerLine} > {vcf5}
117 |
118 |
119 |
120 |
121 | java -jar {snpSift} 2>&1 | grep version
122 | -jar {snpSift} annotate -id {in_4} {vcf5} > {vcf6}
123 |
124 |
125 |
126 |
127 |
128 | java -jar {snpEff} 2>&1 | grep version
129 | -jar {snpSift_4} extractFields {vcf6}
130 | CHROM POS REF ALT ID FILTER DP_HQ ALT_AF "LOF[*].NUMTR"
131 | "LOF[*].PERC"
132 | "EFF[*].GENE" "EFF[*].EFFECT" "EFF[*].IMPACT" "EFF[*].FUNCLASS" "EFF[*].RANK" "EFF[*].CODON" "EFF[*].AA" "EFF[*].AA_LEN" "EFF[*].CODING" "EFF[*].TRID" "dbNSFP_SIFT_score" "dbNSFP_SIFT_pred" "dbNSFP_Polyphen2_HDIV_score" "dbNSFP_MutationAssessor_score" "dbNSFP_phyloP100way_vertebrate" "dbNSFP_1000Gp3_AF" "dbNSFP_1000Gp3_AFR_AF" "dbNSFP_1000Gp3_EUR_AF" "dbNSFP_1000Gp3_AMR_AF" "dbNSFP_1000Gp3_EAS_AF" "dbNSFP_ESP6500_AA_AF" "dbNSFP_ESP6500_EA_AF" "dbNSFP_ExAC_AC" "dbNSFP_ExAC_AF" "CALLER" > {out_1}
133 |
134 |
135 |
136 |
137 | -f {out_1} -d
138 |
139 |
140 |
141 |
142 |
143 |
144 | /out_1/ | awk -F '\t' 'BEGIN {OFS="\t"} $6 == "FILTER" || $6 == "PASS" || $6 == "" || $6 == "."'
145 |
146 |
147 |
148 |
149 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_calling.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | Calling and first-pass filtering of the variants from our pre-processed BAM file using GATK UnifiedGenotyper and GATK VariantFiltration, respectively.
5 |
6 |
14 |
15 |
16 | java/1.7.0
17 | GATK/3.4-0
18 | perl
19 | vcf-tools/0.1.12a
20 |
21 |
23 |
25 |
27 |
28 |
30 |
32 |
33 |
34 |
36 |
37 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
49 |
50 | java -jar {gatk} --help | grep "(GATK)"
51 | -Djava.io.tmpdir=$TMPDIR
52 | -Xmx2g -jar {gatk} -R {in_3}
53 | -T UnifiedGenotyper -I {in_1}
54 | --dbsnp {dbsnp_144} -glm BOTH -o {rawgatk} -stand_call_conf 50.0
55 | -stand_emit_conf 30.0 -dt NONE -L {targetfilegatk} --sample_ploidy 4 --disable_auto_index_creation_and_locking_when_reading_rods
56 |
57 |
58 |
59 | java -jar {gatk} --help | grep "(GATK)"
60 | -Djava.io.tmpdir=$TMPDIR
61 | -Xmx2g -jar {gatk} -T SelectVariants -R {in_3} -V {rawgatk} -L {targetfilegatk} -selectType SNP -o {rawsnp}
62 |
63 |
64 |
65 | java -jar {gatk} --help | grep "(GATK)"
66 | -Djava.io.tmpdir=$TMPDIR
67 | -Xmx2g -jar {gatk} -T SelectVariants -R {in_3} -V {rawgatk} -L {targetfilegatk} -selectType INDEL -o {rawindel}
68 |
69 |
70 |
71 |
72 |
73 | java -jar {gatk} --help | grep "(GATK)"
74 | -Djava.io.tmpdir=$TMPDIR
75 | -Xmx2g -jar {gatk} -R {in_3}
76 | -T VariantFiltration --variant {rawsnp} --clusterSize 3 --clusterWindowSize 10 --filterExpression "QD < 2.0" --filterName "lowQD" --filterExpression "FS > 60.0" --filterName "strandbias" --filterExpression "MQ < 40" --filterName "lowMQ" --filterExpression "MQRankSum < -12.5" --filterName "lowMQRankSum" --filterExpression "ReadPosRankSum < -8" --filterName "lowReadPosRankSum" -o {filteredsnp}
77 |
78 |
79 |
80 |
81 | java -jar {gatk} --help | grep "(GATK)"
82 | -Djava.io.tmpdir=$TMPDIR
83 | -Xmx2g -jar {gatk} -R {in_3}
84 | -T VariantFiltration --variant {rawindel} --filterExpression "QD < 2.0" --filterName "lowQD" --filterExpression "FS > 200.0" --filterName "strandbias" --filterExpression "ReadPosRankSum < -20" --filterName "lowReadPosRankSum" -o {filteredindel}
85 |
86 |
87 |
88 |
89 |
92 | vcftools | grep VCFtools
93 | {filteredsnp} {filteredindel} | vcf-sort > {out_1}
94 |
95 |
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_filtration.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | Filter variants using depth and allele frequency cut-offs.
5 |
6 |
12 |
13 |
14 | python/2.7.3
15 | java/1.7.0
16 |
17 |
18 |
20 |
21 |
23 |
25 |
27 |
28 |
29 |
30 |
31 |
36 | allele_depth_min_and_AF_from_ADs.py -v
37 | {in_1} {vcf1} 140
38 |
39 |
40 |
41 |
42 | java -jar {snpSift} 2>&1 | grep version
43 | -jar {snpSift} annotate -id {dbsnp} {vcf1} > {vcf2}
44 |
45 |
46 |
47 |
48 |
49 |
53 | java -jar {snpSift} 2>&1 | grep version
54 | -jar {snpSift} filter --addFilter "lowAF" --rmFilter "PASS" 'ALT_AF[ANY] < 5' {vcf2} > {out_1}
55 |
56 |
57 |
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_filtration_pindel.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 | Filter variants with DP less than 140; call variant effects with VEP.
6 |
7 |
13 |
14 |
15 | python/2.7.3
16 | java/1.7.0
17 | perl/5.24.0
18 | tabix/0.2.6
19 |
20 |
21 |
22 |
24 |
26 |
27 |
29 |
30 |
32 |
33 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
48 | allele_depth_min_and_AF_from_ADs.py -v
49 | {in_1} {vcf1} 140
50 |
51 |
52 |
53 |
54 | java -jar {snpSift} 2>&1 | grep version
55 | -jar {snpSift} annotate -id {dbsnp} {vcf1} > {vcf2}
56 |
57 |
58 |
59 |
60 |
61 |
62 | {vcf2} {out_1}
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_pre_proc_1.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | GATK steps to get ready for variant calling. Part 1 of 3.
4 |
5 |
9 |
10 | java/1.8.0
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 | echo -n "Picard SortSam"; java -jar {PicardMaster} SortSam --version
21 | -Djava.io.tmpdir=$TMPDIR
22 | -Xmx4g -jar {PicardMaster} SortSam SO=coordinate
23 | INPUT={in_1} OUTPUT={Sorted} VALIDATION_STRINGENCY=SILENT
24 | CREATE_INDEX=true
25 |
26 |
27 |
28 |
29 | echo -n "Picard MarkDuplicates "; java -jar {PicardMaster} MarkDuplicates --version
30 | -Djava.io.tmpdir=$TMPDIR
31 | -Xmx4g -jar {PicardMaster} MarkDuplicates I={Sorted}
32 | O={out_1} M={out_2} REMOVE_DUPLICATES=true
33 | CREATE_INDEX=true VALIDATION_STRINGENCY=SILENT
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_pre_proc_2.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | GATK steps to get ready for variant calling. Part 2 & 3 of 3.
5 |
6 |
14 | GATK/3.4-0
15 | java/1.7.0
16 |
17 |
19 |
21 |
23 |
24 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
37 |
39 |
40 |
41 |
42 |
43 |
44 | java -jar {gatk} --help | grep "(GATK)"
45 | -Djava.io.tmpdir=$TMPDIR
46 | -Xmx2g -jar {gatk} -I {in_1}
47 | -R {in_3} -T RealignerTargetCreator
48 | -o {aligner_intervals}
49 | -known {1000G_Mills_indels_vcf} -known {hg38_indels_vcf} --disable_auto_index_creation_and_locking_when_reading_rods
50 | -L {targetfilegatk}
51 |
52 |
53 |
55 |
57 |
58 |
60 |
62 |
63 |
64 | -Djava.io.tmpdir=$TMPDIR
65 | -Xmx4g -jar {gatk} -I
66 | {in_1} -R {in_3} -T IndelRealigner
67 | -targetIntervals {aligner_intervals} -o {realigned_bam}
68 | -known {1000G_Mills_indels_vcf} -known {hg38_indels_vcf} --disable_auto_index_creation_and_locking_when_reading_rods
69 | -L {targetfilegatk}
70 |
71 |
72 |
73 |
74 |
75 |
76 | -Djava.io.tmpdir=$TMPDIR
77 | -Xmx4g -jar {gatk}
78 | -T BaseRecalibrator -I {realigned_bam} -R {in_3}
79 | -knownSites {dbsnp_vcf} -knownSites {1000G_Mills_indels_vcf}
80 | -knownSites {hg38_indels_vcf} -o {recal_data} --disable_auto_index_creation_and_locking_when_reading_rods
81 |
82 |
83 |
87 |
88 | -Xmx4g -jar {gatk}
89 | -T PrintReads -R {in_3} -I {realigned_bam}
90 | -BQSR {recal_data} -o {out_1} --disable_auto_index_creation_and_locking_when_reading_rods
91 |
92 |
93 |
--------------------------------------------------------------------------------
/CTP_PDX/variant_pre_proc_3.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Calculating the target coverage using picard hybrid selection metrics
4 |
5 |
13 |
14 | java/1.7.0
15 | python/2.7.3
16 |
17 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
30 |
31 |
32 |
33 | echo -n "Picard CalculateHsMetrics "; java -jar {picard} --version
34 |
35 | -Djava.io.tmpdir=$TMPDIR
36 | -jar -Xmx2g {picard}
37 | TARGET_INTERVALS={targetfilepicard}
38 | BAIT_INTERVALS={baitfilepicard}
39 | REFERENCE_SEQUENCE={in_3}
40 | INPUT={in_1} OUTPUT={out_1}
41 | VALIDATION_STRINGENCY=LENIENT
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | {out_1}
51 |
52 |
53 |
54 |
--------------------------------------------------------------------------------
/CTP_PDX/xenome_classification_DNA.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Classify the input fastq reads as human or mouse.
4 |
5 |
15 |
16 |
17 | xenome
18 | perl
19 |
20 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 | classify {threads} -P {in_3}
38 | --pairs --host-name mouse --graft-name human -i {in_1} -i {in_2}
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # SOFTWARE LICENSE AGREEMENT
2 | ## FOR NON-COMMERCIAL USE
3 |
4 | This Software License Agreement (this “Agreement”) is made between you (“You,” “Your,” or “Licensee”) and The Jackson Laboratory (“Licensor”). This Agreement grants to You a license to the Licensed Software subject to Your acceptance of all the terms and conditions contained in this Agreement. Please read the terms and conditions carefully. Your access and use of the Licensed Software shall be deemed your acceptance of this Agreement and the terms and conditions contained herein. If You do not agree to these terms, Licensor is unwilling to grant you the license contained in this Agreement and You should not access or use the Licensed Software.
5 |
6 | ## 1. LICENSE
7 |
8 | ### 1.1 Grant.
9 | Subject to the terms and conditions of this Agreement, Licensor hereby grants to Licensee a worldwide, royalty-free, non-exclusive, non-transferable, non-sublicensable license to download, copy, display, and use the Licensed Software for Non-Commercial purposes only. “Licensed Software” means the current version of the software made available to You via Licensor’s website and requiring your acceptance of the terms and conditions of this Agreement as a condition of use. “Non-Commercial” means not intended or directed toward commercial advantage or monetary compensation.
10 |
11 | ### 1.2 License Limitations.
12 | Nothing in this Agreement shall be construed to confer any rights upon Licensee except as expressly granted herein. Licensee may not use or exploit the Licensed Software other than expressly permitted by this Agreement. Licensee may not, nor may Licensee permit any third party, to modify, translate, reverse engineer, decompile, disassemble or create derivative works based on the Licensed Software or any portion thereof. Subject to Section 1.1, Licensee may distribute the Licensed Software to a third party, provided that the recipient agrees to use the Licensed Software on the terms and conditions of this Agreement. Licensee acknowledges that Licensor reserves the right to offer to Licensee or any third party a license for commercial use and distribution of the Licensed Software on terms and conditions different than those contained in this Agreement. If You are interested in commercial use of the Licensed Software, please contact the Licensor.
13 |
14 | ## 2. OWNERSHIP OF INTELLECTUAL PROPERTY
15 |
16 | ### 2.1 Ownership Rights.
17 | Except for the limited license rights expressly granted to Licensee under this Agreement, Licensee acknowledges that all right, title and interest in and to the Licensed Software and all intellectual property rights therein shall remain with Licensor or its licensors, as applicable.
18 |
19 | ## 3. DISCLAIMER OF WARRANTY AND LIMITATION OF LIABILITY
20 |
21 | ### 3.1 Disclaimer of Warranty.
22 | LICENSOR PROVIDES THE LICENSED SOFTWARE ON A NO-FEE BASIS “AS IS” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. LICENSOR EXPRESSLY DISCLAIMS ALL WARRANTIES OR CONDITIONS OF ANY KIND, INCLUDING ANY WARRANTY OF MERCHANTABILITY, TITLE, SECURITY, ACCURACY, NON-INFRINGEMENT OR FITNESS FOR A PARTICULAR PURPOSE.
23 |
24 | ### 3.2 Limitation of Liability.
25 | LICENSEE ASSUMES FULL RESPONSIBILITY AND RISK FOR ANY LOSS RESULTING FROM LICENSEE’s DOWNLOADING AND USE OF THE LICENSED SOFTWARE. IN NO EVENT SHALL LICENSOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, ARISING FROM THE LICENSED SOFTWARE OR LICENSEE’S USE OF THE LICENSED SOFTWARE, REGARDLESS OF WHETHER LICENSOR IS ADVISED, OR HAS OTHER REASON TO KNOW, OR IN FACT KNOWS, OF THE POSSIBILITY OF THE FOREGOING.
26 |
27 | ### 3.3 Acknowledgement.
28 | Without limiting the generality of Sections 3.1 and 3.2, Licensee acknowledges that the Licensed Software is provided as an information resource only and should not be relied on for any diagnostic or treatment purposes.
29 |
30 | ## 4. TERM AND TERMINATION
31 |
32 | ### 4.1 Term.
33 | This Agreement commences on the date this Agreement is executed and will continue until terminated in accordance with Section 4.2.
34 |
35 | ### 4.2 Termination.
36 | If Licensee breaches any provision hereunder, or otherwise engages in any unauthorized use of the Licensed Software, Licensor may terminate this Agreement immediately. Licensee may terminate this Agreement at any time upon written notice to Licensor. Upon termination, the license granted hereunder will terminate and Licensee will immediately cease using the Licensed Software and destroy all copies of the Licensed Software in its possession. Licensee will certify in writing that it has complied with the foregoing obligation.
37 |
38 | ## 5. MISCELLANEOUS
39 |
40 | ### 5.1 Future Updates.
41 | Use of the Licensed Software under this Agreement is subject to the terms and conditions contained herein. New or updated software may require additional or revised terms of use. Licensor will provide notice of and make available to Licensee any such revised terms.
42 |
43 | ### 5.2 Entire Agreement.
44 | This Agreement, including any Attachments hereto, constitutes the sole and entire agreement between the parties as to the subject matter set forth herein and supersedes all previous license agreements, understandings, or arrangements between the parties relating to such subject matter.
45 |
46 | ### 5.3 Governing Law.
47 | This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the State of Maine, U.S.A., without regard to conflict of laws principles. The parties agree that any disputes between them may be heard only in the state or federal courts in the State of Maine, and the parties hereby consent to venue and jurisdiction in those courts.
48 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PDX Pipelines
2 |
3 | ## Overview
4 | These pipelines were developed using the Civet framework and, as such, are designed to be command line analysis pipelines which execute through a batch system commonly used by High Performance Computing (HPC) systems. For more information on Civet, visit https://github.com/TheJacksonLaboratory/civet.
5 |
6 | The pipeline is defined by an XML file which describes the file inputs (hardcoded reference files, files input by the user, or files generated by another step in the pipeline) and the steps that utilize these files. Each tool that the pipeline can invoke is encoded in a separate XML file; these tool files can be shared amongst pipelines. Additionally, the pipelines use Python (v. 2.7.3).
7 |
8 | ### Disclaimer
9 | Note that the pipeline and tool files provided here are merely examples of how to construct pipelines and create separate tool XML files; additional information is necessary to actually run the pipelines. The Python, R, and Perl scripts are mostly functional, with some scripts requiring a bit of tweaking to fit individual user needs. Although the CTP and RNA pipelines provided here are for paired-end samples, the pipelines can be modified for single-end samples.
10 |
11 | ### License
12 | This software is licensed for non-commercial purposes only. See the [LICENSE.md](https://github.com/TheJacksonLaboratory/PDX-Analysis-Workflows/blob/master/LICENSE.md) file in this repository for the terms and conditions of this license.
13 |
14 | ## Pipelines
15 |
16 | ### CTP
17 | This pipeline is designed for processing tumor samples. The pipeline takes as input a single sample for which paired-end sequencing has been performed, and outputs annotated variants and insertions and deletions (indels).
18 |
19 | The pipeline encompasses several different steps. The first step is a quality control step in which low-quality bases are trimmed, and low-quality reads are discarded. The trimmed fastq files are then processed, and reads are classified as human or mouse; human reads are retained, and other reads are discarded. The trimmed reads are then aligned against the human genome (hg38) using the BWA-MEM algorithm (bwakit v. 0.7.15) and a SAM file generated using samtools (v. 0.1.18). There are then many processing steps that involve the removal of duplicates using picard (v. 2.8.1) followed by re-alignment around indels and base quality recalibration (GATK v. 3.4.0 and java v. 1.7.0); a BAM file is generated. Target coverage is then calculated using picard hybrid selection metrics. Initial variants are then called and filtered using GATK unifiedGenotyper and variantFiltration. SnpEff (v. 4.2) and SnpSift, in addition to a python script, are employed to filter variants with coverage less than 140 and call variant effects; variants are then further filtered using allele frequency cut-offs. Micro indels are then called and filtered using pindel (v. 0.2.5a3). SnpEff and SnpSift are utilized to annotate the filtered variants and micro indels. Alignment, quality control, duplication metrics, and coverage statistics are then compiled into a single file. Scripts, in addition to GATK, are employed to calculate coverage. Finally, intermediate files are removed from the run directory.
20 |
21 | The provided config file can be used to specify specific files or set various options; for example, specifying sample ploidy.
22 |
23 | ### RNA
24 | This pipeline is designed for processing RNA-seq samples. The pipeline takes as input a single sample for which paired-end sequencing has been performed, and outputs read counts.
25 |
26 | The first step takes as input raw fastq files and outputs fastq files which contain trimmed reads. The trimmed fastq files are then processed, and reads are classified as human or mouse; human reads are retained, and other reads are discarded. Reads are then aligned against the human genome (hg38) using the bowtie2 option for rsem (v. 1.2.19) and a BAM file is output. Gene names are then added and read counts normalized. Read group information is then added to the BAM file followed by picard (v. 2.8.1) reordering and sorting of the BAM file. Alignment metrics are then generated using picard followed by the compilation of all metrics (e.g., statistics from filtering, rsem, and picard). Finally, a classifier is run, and exon level coverage statistics are determined from the alignment file; this requires samtools (v. 0.1.18) and bedtools (v. 2.25.0).
27 |
28 |
29 | ### CNV
30 | This pipeline is designed for processing Affymetrix SNP 6.0 array samples. The pipeline takes as input a single tumor sample (CEL file) and outputs copy numbers for genes and segments.
31 |
32 | The pipeline encompasses four separate steps: 1) allele-specific signal from CEL files and normalization with HapMap CEL files; 2) output of Log R ratios (LRR) and B-Allele frequencies (BAF); 3) output of aberrant tumor cell fraction, ploidy, and segments containing gains or losses; 4) annotation of ASCAT segments with loss of heterozygosity (LOH), chromosome arm fraction, and ploidy and annotation of genes with copy number. LRR and BAF are calculated using PennCNV-Affy and Affymetrix Power Tools (v. 1.15.0). The GC correction, heterozygous SNP estimation, and the computation of aberrant cell fraction, ploidy, and allele-specific copy numbers are performed following the implementation of the ASCAT R package (v. 2.4.3) in the included R script. Annotation is performed using the included Perl scripts.
33 |
--------------------------------------------------------------------------------
/RNA_PDX/XenomeRnaSeqSingleSamplePE.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
7 |
8 |
9 |
10 |
15 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
43 |
46 |
47 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
62 |
66 |
67 |
68 |
69 |
70 |
71 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
125 |
126 |
130 |
131 |
135 |
136 |
137 |
138 |
144 |
145 |
146 |
147 |
148 |
152 |
153 |
154 |
155 |
156 |
--------------------------------------------------------------------------------
/RNA_PDX/add_gene_name_normalization_out.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Add geneName to RSEM isoforms.results file
4 |
5 |
6 |
16 |
17 |
18 | R
19 | perl/cga
20 |
21 |
22 | -i1 {in_1} -i2 {in_2} -a1 {in_3}
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/RNA_PDX/bin/GeneName_and_Normalization_without_UCSC.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | use Getopt::Long;
4 |
5 | my $usage = <<'USAGE';
6 |
7 | #########################GeneName_and_Normalization_without_UCSC.pl#############################
8 |
9 | usage: GeneName_and_Normalization_without_UCSC.pl [options]
10 |
11 | -i1 = infile1 [RSEM ENSEMBL genes.results]
12 | -i2 = infile2 [RSEM ENSEMBL isoforms.results]
13 | -a1 = accession1 [accession File1]
14 |
15 | ##################################################################################
16 |
17 | USAGE
18 |
19 | my ($infile1, $infile2, $accessionFile1);
20 |
21 | my $result = GetOptions("i1=s" => \$infile1, "i2=s" => \$infile2, "a1=s" => \$accessionFile1);
22 |
23 | die $usage unless ($infile1); ##### Mandatory arguments
24 | die $usage unless ($infile2); ##### Mandatory arguments
25 | die $usage unless ($accessionFile1); ##### Mandatory arguments
26 |
27 | ###########################################################################################################################
28 |
29 |
30 | my $seventyFifthQuartileGene = quantileCal($infile1);
31 | my $seventyFifthQuartileIso = quantileCal($infile2);
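# quantileCal() (defined at the bottom of this script) shells out to R to
# compute quartiles of the non-zero expected_count column (column 5) and
# returns the 75th percentile; the two values are used below for
# upper-quartile normalization of the genes and isoforms results.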
32 |
33 | #######Gene.results modification#############
34 |
35 |
36 |
37 | open(FILEACC, $accessionFile1) || die "cannot open the $accessionFile1 ";####hg38/mm10_final
38 |
39 | my %hashGene = ();
40 |
41 | while(my $readFileACC = <FILEACC>)
42 | {
43 | if($readFileACC =~ /^\s*.*?\s+(.*?)\s+(.*?)\s+(.*)$/)
44 | {
45 | my $key1 = $1;
46 | my $valueG = "$2";
47 | # my $UCSC = $3;
48 |
49 | if(!exists($hashGene{$key1}))
50 | {
51 | $hashGene{$key1}= $valueG; ####making ID by ENSG
52 | }
53 |
54 | }
55 |
56 | }
57 |
58 |
59 | open(FILEINGENE, $infile1) || die "cannot open the $infile1 file";####*genes.results
60 | open(FILEOUTGENEName, ">$infile1.withGeneName") || die "cannot open the file";
61 | open(FILEOUTGENENorm, ">$infile1.Normalized") || die "cannot open the file";
62 |
63 |
64 | my $flagGene = 0;
65 | my ($keyG, $ENSName, $ExpectedcountG);
66 |
67 | while(my $readFileGene = <FILEINGENE>)
68 | {
69 | if($flagGene == 0 )
70 | {
71 | $flagGene = 1;
72 | chomp $readFileGene;
73 | print FILEOUTGENEName "$readFileGene\tGeneName\n";
74 | print FILEOUTGENENorm "gene_id\ttranscript_id(s)\tGeneName\tnormalized_count\n";
75 | next;
76 | }
77 |
78 | if($flagGene == 1)
79 | {
80 | if($readFileGene =~ /^\s*(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*)$/)
81 | {
82 | $keyG = $1;
83 | $ENSName = $2;
84 | $ExpectedcountG = sprintf("%.2f",(($5/$seventyFifthQuartileGene)*1000));
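# Upper-quartile normalization: expected_count (field 5) is divided by the
# 75th percentile of the non-zero expected counts and scaled by 1000
# (the isoforms below use a scale factor of 300).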
85 |
86 | if(exists($hashGene{$keyG}))
87 | {
88 | chomp $readFileGene;
89 | print FILEOUTGENEName "$readFileGene\t$hashGene{$keyG}\n";
90 | print FILEOUTGENENorm "$keyG\t$ENSName\t$hashGene{$keyG}\t$ExpectedcountG\n";
91 | }
92 |
93 | else
94 | {
95 | chomp $readFileGene;
96 | print FILEOUTGENEName "$readFileGene\t-\n";
97 | print FILEOUTGENENorm "$keyG\t$ENSName\t-\t$ExpectedcountG\n";
98 |
99 | }
100 |
101 |
102 | }
103 | }
104 | }
105 |
106 |
107 | ##################################################################################################################
108 |
109 |
110 | close(FILEACC);
111 |
112 | #######Isoforms.results modification#############
113 |
114 |
115 | open(FILEACC, $accessionFile1) || die "cannot open the $accessionFile1 ";####hg19/mm10_final
116 |
117 | my %hashIso = ();
118 |
119 | while(my $readFileACC = <FILEACC>)
120 | {
121 | if($readFileACC =~ /^\s*(.*?)\s+.*?\s+(.*?)\s+(.*)$/)
122 | {
123 | my $key2 = $1;
124 | my $valueI = "$2";
125 | #my $UCSC_I = $3;
126 |
127 | if(!exists($hashIso{$key2}))
128 | {
129 | $hashIso{$key2}= $valueI; ####making ID by ENST
130 | }
131 |
132 | }
133 |
134 | }
135 |
136 |
137 | open(FILEINISO, $infile2) || die "cannot open the $infile2 file";####*isoforms.results
138 | open(FILEOUTISOName, ">$infile2.withGeneName") || die "cannot open the file";
139 | open(FILEOUTISONorm, ">$infile2.Normalized") || die "cannot open the file";
140 |
141 | my $flagIso = 0;
142 | my ($keyI, $ENSIName, $ExpectedcountI);
143 |
144 | while(my $readFileIso = <FILEINISO>)
145 | {
146 | if($flagIso == 0 )
147 | {
148 | $flagIso = 1;
149 | chomp $readFileIso;
150 | print FILEOUTISOName "$readFileIso\tGeneName\n";
151 | print FILEOUTISONorm "transcript_id\tgene_id\tGeneName\tnormalized_count\n";
152 | next;
153 | }
154 |
155 | if($flagIso == 1)
156 | {
157 | if($readFileIso =~ /^\s*(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*)$/)
158 | {
159 | $keyI = $1;
160 | $ENSIName = $2;
161 | $ExpectedcountI = sprintf("%.2f",(($5/$seventyFifthQuartileIso)*300));
162 |
163 | if(exists($hashIso{$keyI}))
164 | {
165 | chomp $readFileIso;
166 | print FILEOUTISOName "$readFileIso\t$hashIso{$keyI}\n";
167 | print FILEOUTISONorm "$keyI\t$ENSIName\t$hashIso{$keyI}\t$ExpectedcountI\n";
168 | }
169 |
170 | else
171 | {
172 | chomp $readFileIso;
173 | print FILEOUTISOName "$readFileIso\t-\n";
174 | print FILEOUTISONorm "$keyI\t$ENSIName\t-\t$ExpectedcountI\n";
175 |
176 | }
177 |
178 |
179 | }
180 | }
181 | }
182 |
183 |
184 |
185 | sub quantileCal
186 | {
187 |
188 | my $filename = shift;
189 |
190 | open(FILEOUT1, ">$filename.TMP") || die "cannot open the file";
191 |
192 | print FILEOUT1 "X=read.table(\"$filename\", header=T)\n";
193 | print FILEOUT1 "head(X)\n";
194 | print FILEOUT1 "Y=subset(X, X\$expected_count > 0)\n";
195 | print FILEOUT1 "head(Y)\n";
196 | print FILEOUT1 "Z=sort(Y[,5])\n";
197 | print FILEOUT1 "quantile(Z)\n";
198 |
199 |
200 | system("cat $filename.TMP | R --vanilla >$filename.TMP2 ");
201 |
202 | open(FILEINTMP, "$filename.TMP2") || die "cannot open the file";
203 |
204 | my $flag = 0;
205 | my $value;
206 |
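# The captured R output looks roughly like (values illustrative):
#      0%   25%   50%   75%  100%
#    1.00  3.00 10.00 42.00 999.00
# The loop below skips the echoed quantile(Z) call and the row of percent
# labels, then keeps the fourth value of the next line, i.e. the 75th
# percentile.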
207 | label:while(my $readFile = <FILEINTMP>)
208 | {
209 | if($readFile =~ /quantile/)
210 | {
211 | $flag = 1;
212 | next;
213 | }
214 |
215 | elsif($flag == 1)
216 | {
217 | $flag = 2;
218 | next;
219 | }
220 |
221 | elsif($flag == 2)
222 | {
223 | if($readFile =~ /^\s*(.*?)\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*)$/)
224 | {
225 | $value = $4;
226 | last label;
227 | }
228 |
229 | }
230 | }
231 |
232 | close(FILEOUT1);
233 | close(FILEINTMP);
234 |
235 | system("rm $filename.TMP $filename.TMP2 ");
236 |
237 | return $value;
238 |
239 | }
240 |
241 |
--------------------------------------------------------------------------------
/RNA_PDX/bin/filter_rna_coverage.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | from __future__ import print_function
3 | """
4 | To determine whether we have enough data to make meaningful RNA expression
5 | estimates, we look at the number of human reads available after the
6 | Xenome step.
7 | """
8 | import sys
9 | import argparse
10 |
11 |
12 | def parse_args():
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('-m', '--minimum-reads', default='1000000',
15 | help="Minimum number of human reads [default: 1000000]")
16 | parser.add_argument('files', nargs='+',
17 | help="The file[s] to test.")
18 |
19 | return parser.parse_args()
20 |
21 |
22 | def process_file(fn, minimum, multiple):
23 | """
24 | Determine whether there was adequate coverage of bases in this file.
25 | NOTE: Returns True if the run was OK.
26 | :param fn:
27 | :param minimum:
28 | :param multiple:
29 | :return: True if meets criterion, False otherwise
30 | """
31 | line_found = False
32 | summary_section = False
33 | line = ""
34 | for line in open(fn):
35 | line = line.strip()
36 | if line == 'Summary':
37 | summary_section = True
38 | continue
39 | if summary_section and line.endswith('human'):
40 | line_found = True
41 | break
42 | if not line_found:
43 | print("Could not find coverage line in:", fn, file=sys.stderr)
44 | return False
45 |
46 | count = int(line.split()[0])
47 | if count < minimum:
48 | if multiple:
49 | print("{0}\t{1}".format(count, fn))
50 | else:
51 | print("Too low human read count: {0} {1}".format(
52 | count, fn))
53 | return False
54 | return True
55 |
56 |
57 | def main():
58 | args = parse_args()
59 |
60 | minimum_reads = int(args.minimum_reads)
61 | multiple = len(args.files) > 1
62 |
63 | success = True
64 | if multiple:
65 | print("Human reads\tRun")
66 |
67 | for fn in args.files:
68 | success &= process_file(fn, minimum_reads, multiple)
69 | if not success:
70 | sys.exit(1)
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
--------------------------------------------------------------------------------
/RNA_PDX/bin/filter_trim.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | filter_trim.py
5 |
6 | The script filters and trims reads according to the following parameters:
7 | 1. filter_quality: The quality value considered high quality when
8 | filtering reads.
9 | 2. filter_percent_bases: The percentage of a reads bases that must
10 | be of high quality to consider the read to be high quality.
11 | 3. trimming_quality: The quality value needed for a base to terminate
12 | trimming.
13 | 4. post_trim_length_pct: The minimum length of a read required post
14 | trimming to retain the read, expressed as a percentage of the
15 | initial read length.
16 | 5. trim_5: Whether to trim bases from the 5' end of the read as well
17 | as the 3' end.
18 | 6. min_pct_hq_reads: The minimum percentage of high quality reads out
19 | of all reads, required for successful exit. Fewer high quality reads
20 | will result in a failure exit status return to the shell, allowing
21 | pipelines using this program to fail. Default: 0.0 (always return
22 | success).
23 | 7. single_end: Use single end mode, even if there are multiple fastq
24 | files presented.
25 |
26 | Trimming occurs per read; trimming does not need to match read 1 vs
27 | read 2 for paired end data. Filtering and acting on the
28 | post_trim_length_min occurs on a per-end basis; however if one read is
29 | discarded due to these criteria, the other end's read must be discarded
30 | as well.
31 |
32 | Inputs:
33 | 1. A sequence of pairs of fastq files from a paired-end sequencing run
34 | or runs. All pairs must be for the same sample.
35 |
36 | Outputs:
37 | 1. One or two filtered, trimmed fastq files. The paired reads may not
38 | be the same length due to trimming.
39 | 2. Statistics file: The pipeline uses several criteria from this
40 | step to determine whether the run was good enough to analyze.
41 | a. Percent HQ reads: Number of reads written to the filtered
42 | file(s) / number of reads in the input file(s).
43 |
44 | We also report the following statistics, but they aren't used to
45 | judge the quality of a run:
46 | b. Total Reads (Reported once, since both ends must be the same.)
47 | c. Total HQ reads (Reported once, since both ends must be the
48 | same.)
49 | d. Min, Max and Mean trimmed read lengths for reads whose trimmed
50 | length is sufficient to retain the read, reported separately for
51 | each end.
52 |
53 | All output file naming is based on the first file in each fastq file
54 | list; the following input fastqs are not represented in the output file
55 | names.
56 |
57 | Exit status:
58 | 0 = Success
59 | non-0 = Failure:
60 | 1) Could not open an input file or create an output file
61 | 2) Python 2.7 or higher not used
62 | 3) Insufficient high quality reads
63 | 4) Odd number of fastq files when in paired mode
64 | 5) Input files in a pair are not the same length
65 | 6) Zero total reads, percent of high quality reads not computed
66 | """
67 | __author__ = 'simons'
68 |
69 | import sys
70 | import os
71 | import math
72 | import gzip
73 | import datetime
74 | import inspect
75 |
76 | # In Python 2.7, the core bz2 module can't process multi-stream files, such
77 | # as those produced by pbzip2. In Python 3.4 and above, it can. The
78 | # Python 3 version has been backported to Python 2.7, and is available
79 | # as bz2file. Conditionally load it to handle a wider array of files;
80 | # fall back to the core bz2 module.
81 | try:
82 | import bz2file as bz2
83 | except:
84 | print >> sys.stderr, 'Could not import bz2file; using the core bz2.'
85 | import bz2
86 |
87 | # Support the version command.
88 | cmd_folder = os.path.realpath(os.path.abspath(os.path.split(
89 | inspect.getfile(inspect.currentframe()))[0]))
90 | lib_folder = os.path.join(cmd_folder, '../lib')
91 | if lib_folder not in sys.path:
92 | sys.path.insert(0, lib_folder)
93 |
94 | try:
95 | import argparse
96 | except:
97 | print >> sys.stderr, 'This program requires python V2.7 or higher.'
98 | sys.exit(2)
99 |
100 |
101 | # The guts of this program. All processing of reads.
102 | class FastqRead(object):
103 | trim_5 = False
104 | trim_hq = 30
105 | read_hq = 30
106 | pct_hq = 0.7
107 |
108 | def __init__(self, fastqs, odir=None, suffix='_filtered_trimmed'):
109 | ofn = os.path.split(fastqs[0])[1] + suffix
110 | if odir:
111 | ofn_path = os.path.join(odir, ofn)
112 | else:
113 | ofn_path = ofn
114 | try:
115 | self.of = open(ofn_path, 'w')
116 | except IOError:
117 | print >> sys.stderr, \
118 | 'Could not open "{0}". Exiting.'.format(ofn_path)
119 | sys.exit(1)
120 |
121 | self.fastqs = fastqs
122 | self.fn = None
123 | self.f = None
124 | self.total_reads = 0
125 | self.hq_reads = 0
126 | self.output_reads = 0
127 | self.min_trimmed_length = sys.maxint
128 | self.max_trimmed_length = -1
129 | self.total_trimmed_length = 0
130 | self.trimmed_reads = 0
131 | self.name = ''
132 | self.bases = ''
133 | self.plus = ''
134 | self.qual = ''
135 | self.timestamp = False
136 | self.line_count = 0
137 |
138 | # Initialize our first input fastq
139 | self.next_file()
140 |
141 | def next_file(self):
142 | #Do we have any files left?
143 | if not self.fastqs:
144 | return False
145 |
146 | self.fn = self.fastqs.pop(0)
147 | try:
148 | self.f = FastqRead.open(self.fn)
149 | except IOError:
150 | print >> sys.stderr, \
151 | 'Could not open "{0}". Exiting.'.format(self.fn)
152 | sys.exit(1)
153 | return True
154 |
155 | def get_filename(self):
156 | return self.fn
157 |
158 | @staticmethod
159 | def open(name):
160 | """
161 | Intended to be private to the class...
162 |
163 | A flexible open routine that can handle plain text files or
164 | files compressed with gzip or bzip2. Only used for the
165 | input files. Output files are emitted uncompressed, until the
166 | tools in the next leg of the pipeline can work properly with
167 | compressed files.
168 |
169 | :param name: The filename to open.
170 | :return: A file object for the named file.
171 | """
172 | if name.endswith('.gz'):
173 | f = gzip.open(name)
174 | elif name.endswith('.bz2'):
175 | f = bz2.BZ2File(name)
176 | else:
177 | f = open(name)
178 | return f
179 |
180 | def stats(self):
181 | s = {}
182 | s['total_reads'] = self.total_reads
183 | s['hq_reads'] = self.hq_reads
184 | s['output_reads'] = self.output_reads
185 | s['max_trimmed_length'] = self.max_trimmed_length
186 | s['min_trimmed_length'] = self.min_trimmed_length
187 | try:
188 | tmp_mean = float(self.total_trimmed_length) / \
189 | float(self.trimmed_reads)
190 | s['mean_trimmed_length'] = '{0:.2f}'.format(tmp_mean)
191 | except ZeroDivisionError:
192 | s['mean_trimmed_length'] = 'N/A'
193 |
194 | return s
195 |
196 | def next(self):
197 | """
198 |
199 | :return: True iff the read was successfully retrieved from
200 | the file.
201 | """
202 | name = self.f.readline()
203 | # Test whether we had a successful read.
204 | # Will be zero length if EOF reached.
205 | if not name:
206 | return False
207 | self.name = name.strip()
208 | self.bases = self.f.readline().strip()
209 | self.plus = self.f.readline().strip()
210 | self.qual = self.f.readline().strip()
211 |
212 | # All four lines must have content to be a valid read.
213 | if len(self.bases) == 0 or \
214 | len(self.plus) == 0 or \
215 | len(self.qual) == 0:
216 | print >> sys.stderr, 'NAME:', self.name
217 | print >> sys.stderr, 'BASES:', self.bases
218 | print >> sys.stderr, 'PLUS:', self.plus
219 | print >> sys.stderr, 'QUAL:', self.qual
220 | raise ValueError('Incomplete read found in file {0}'.
221 | format(self.fn))
222 |
223 | self.total_reads += 1
224 | if self.timestamp:
225 | self.line_count += 1
226 | if self.line_count % 1000000 == 0:
227 | print >> sys.stderr, \
228 | datetime.datetime.strftime(datetime.datetime.now(),
229 | '%H:%M:%S'), \
230 | self.line_count
231 | return True
232 |
233 | def do_timestamp(self):
234 | self.timestamp = True
235 |
236 | @staticmethod
237 | def set_criteria(pct_hq=0.7,
238 | read_hq=30,
239 | trim_hq=30,
240 | trim_5=False,
241 | min_pct=0.7,
242 | min_pct_hq_reads=0.0):
243 |
244 | FastqRead.pct_hq = float(pct_hq)
245 | if FastqRead.pct_hq > 1.0:
246 | FastqRead.pct_hq /= 100.0
247 |
248 | # Use phred33 quality scoring
249 | FastqRead.read_hq = chr(int(read_hq) + 33)
250 | FastqRead.trim_hq = chr(int(trim_hq) + 33)
251 | FastqRead.trim_5 = trim_5 # Passed in as boolean
252 | FastqRead.min_pct = float(min_pct)
253 | if FastqRead.min_pct > 1.0:
254 | FastqRead.min_pct /= 100.0
255 | FastqRead.min_pct_hq_reads = float(min_pct_hq_reads)
256 | if FastqRead.min_pct_hq_reads > 1.0:
257 | FastqRead.min_pct_hq_reads /= 100.0
258 |
259 | # Cache the minimum length of a trimmed read.
260 | min_len = None
261 |
262 | def trim(self):
263 | """
264 |
265 | :return: True if the read is long enough after trimming.
266 | """
267 |
268 | original_length = len(self.qual)
269 | if FastqRead.trim_5:
270 | for p5 in range(original_length):
271 | if self.qual[p5] >= FastqRead.trim_hq:
272 | break
273 | else:
274 | p5 = 0
275 |
276 | for p3 in range(original_length - 1, -1, -1):
277 | if self.qual[p3] >= FastqRead.trim_hq:
278 | break
279 |
280 | tlg = (p3 - p5) + 1 # Length after trimming.
281 |
282 | if FastqRead.min_len is None:
283 | FastqRead.min_len = \
284 | math.ceil(FastqRead.min_pct * original_length)
285 | if tlg < FastqRead.min_len:
286 | return False
287 |
288 | self.bases = self.bases[p5:p3 + 1]
289 | self.qual = self.qual[p5:p3 + 1]
290 |
291 | assert tlg == len(self.qual), "Length calculation is broken"
292 |
293 | # Track our trimmed length stats
294 | if tlg > self.max_trimmed_length:
295 | self.max_trimmed_length = tlg
296 | if tlg < self.min_trimmed_length:
297 | self.min_trimmed_length = tlg
298 | self.total_trimmed_length += tlg # To compute the mean
299 | self.trimmed_reads += 1
300 | return True
301 |
302 | def filter(self):
303 | """
304 |
305 | :return: True if the read passed HQ filtering criteria
306 | """
307 | lg = len(self.qual)
308 | lq_reads_allowed = math.floor(float(lg) *
309 | (1.0 - FastqRead.pct_hq))
310 | lq_reads = 0
311 | for n in range(lg):
312 | if self.qual[n] < FastqRead.read_hq:
313 | lq_reads += 1
314 | if lq_reads > lq_reads_allowed:
315 | return False
316 | self.hq_reads += 1
317 | return True
318 |
319 | def write(self):
320 | print >> self.of, self.name
321 | print >> self.of, self.bases
322 | print >> self.of, self.plus
323 | print >> self.of, self.qual
324 | self.output_reads += 1
325 |
326 | def close(self):
327 | self.of.close()
328 |
329 | # End of class FastqRead.
330 |
331 |
332 | def parse_args():
333 |
334 | parser = argparse.ArgumentParser(description=
335 | "Perform filtering and trimming of paired end fastq "
336 | "files", version="2.0")
337 | parser.add_argument("-p", "--hq_pct", default="70", help=
338 | "Percentage of bases that must be high quality [70]")
339 | parser.add_argument('-f', '--filter_hq', default="30", help=
340 | 'Numeric quality value to pass filtering [30]')
341 | parser.add_argument('-t', '--trim_hq', default="30", help=
342 | 'Numeric quality value to not be trimmed [30]')
343 | parser.add_argument('-m', '--min_len_pct', default="70", help=
344 | 'Minimum read length after trimming to '
345 | 'retain read. (percentage 0-100) [70]')
346 | parser.add_argument('-M', '--min_pct_hq_reads', default=0, help=
347 | 'Minimum percentage of reads classified as High '
348 | 'Quality reads (percentage 0-100) [0]')
349 | parser.add_argument('-5', '--trim_5', action="store_true", help=
350 | "Trim 5' end as well as 3' [False]")
351 | parser.add_argument('-s', '--suffix', default='_filtered_trimmed',
352 | help='Suffix to construct the output file name '
353 | '[_filtered_trimmed]')
354 | parser.add_argument('-S', '--single_end', action="store_true",
355 | help="Use single end mode with multiple fastq files " \
356 | "[False]")
357 | parser.add_argument('-d', '--directory', dest='odir', default='.',
358 | help=
359 | 'Directory in which to write the output files '
360 | '[current directory]')
361 | parser.add_argument('-i', '--timestamp', action='store_true', help=
362 |                         'Emit a timestamp every 1,000,000 reads [False]')
363 | parser.add_argument("fastqs", nargs="+")
364 | args = parser.parse_args()
365 | return args
366 |
367 |
368 | def output_stats_single(r1, args, start_time):
369 | """
370 | Report the statistics for a single end run.
371 |
372 | NOTE WELL!!!
373 | This routine and output_stats_paired have the same logic
374 | flow. If one is changed, the other almost certainly has to
375 | have the corresponding change made.
376 |
377 | YOU HAVE BEEN WARNED!
378 |
379 | :param r1: Accumulated info for the reads.
380 | :param args: Our command line arguments
381 | :param start_time: The run's start time
382 |     :return: 0 on success, 6 if there were no reads, 3 if too few HQ reads
383 | """
384 |
385 | # Here we have completely processed the input file. Write out
386 | # the statistics
387 | r1_stats = r1.stats()
388 | bn_fq1 = os.path.split(args.fastqs[0])[1]
389 |
390 | with open(os.path.join(args.odir, bn_fq1 + '_stat'), 'w') as sf:
391 | print >> sf, 'Input file:'
392 | print >> sf, 'Read 1: {0}'.format(args.fastqs[::2])
393 | print >> sf, 'QC statistics'
394 | print >> sf, 'Statistic\tRead 1'
395 |
396 | try:
397 | f_pct_hq = float(r1_stats['output_reads']) / \
398 | float(r1_stats['total_reads'])
399 | pct_hq = '{0:.2%}'.format(f_pct_hq)
400 | except ZeroDivisionError:
401 | pct_hq = 'N/A'
402 | print >> sf, 'Percentage of HQ reads\t{0}'.format(pct_hq)
403 |
404 | print >> sf, 'Total number of reads\t{0}'.format(
405 | r1_stats['total_reads'])
406 | print >> sf, 'Total number of HQ filtered reads\t{0}'.\
407 | format(r1_stats['output_reads'])
408 | print >> sf, 'Detailed QC statistics'
409 | print >> sf, 'Reads passing filter\t{0}'.\
410 | format(r1_stats['hq_reads'])
411 |
412 | try:
413 | pct_rpf = '{0:.2%}'.format(float(r1_stats['hq_reads']) /
414 | float(r1_stats['total_reads']))
415 | except ZeroDivisionError:
416 | pct_rpf = 'N/A'
417 | print >> sf, 'Percent reads passing filter\t{0}'.format(pct_rpf)
418 |
419 | print >> sf, 'Max Trimmed Length\t{0}'.\
420 | format(r1_stats['max_trimmed_length'])
421 | print >> sf, 'Min Trimmed Length\t{0}'.\
422 | format(r1_stats['min_trimmed_length'])
423 | print >> sf, 'Mean Trimmed Length\t{0}'.\
424 | format(r1_stats['mean_trimmed_length'])
425 | print >> sf, 'Run start time\t{0}'.\
426 | format(datetime.datetime.strftime(start_time, '%H:%M:%S'))
427 | end_time = datetime.datetime.now()
428 | print >> sf, 'Run end time\t{0}'.\
429 | format(datetime.datetime.strftime(end_time, '%H:%M:%S'))
430 |
431 | if r1_stats['total_reads'] == 0:
432 | # This will be the same as sys.exit(6)
433 | print >> sys.stderr, 'Failure: total reads == 0\nExiting' \
434 |                              ' with status 6'
435 | return 6
436 | if f_pct_hq < FastqRead.min_pct_hq_reads:
437 | # This will be the same effect as sys.exit(3)
438 | print >> sys.stderr, 'Failure: not enough high quality ' \
439 | 'read percent: {} required: {}\n' \
440 | 'Exiting with status 3'.\
441 | format(f_pct_hq, FastqRead.min_pct_hq_reads)
442 | return 3
443 | # Success!
444 | return 0
445 |
446 | def output_stats_paired(r1, r2, args, start_time):
447 | """
448 | Report the statistics for a paired end run.
449 |
450 | NOTE WELL!!!
451 | This routine and output_stats_single have the same logic
452 | flow. If one is changed, the other almost certainly has to
453 | have the corresponding change made.
454 |
455 | YOU HAVE BEEN WARNED!
456 |
457 | :param r1: Accumulated info for the end 1 reads.
458 | :param r2: Accumulated info for the end 2 reads.
459 | :param args: Our command line arguments
460 | :param start_time: The run's start time
461 |     :return: 0 on success, 6 if there were no reads, 3 if too few HQ reads
462 | """
463 |
464 | # Here we have completely processed both input files. Write out
465 | # the statistics
466 | r1_stats = r1.stats()
467 | r2_stats = r2.stats()
468 |
469 | bn_fq1 = os.path.split(args.fastqs[0])[1]
470 | bn_fq2 = os.path.split(args.fastqs[1])[1]
471 |
472 | with open(os.path.join(args.odir,
473 | '{0}_{1}_stat'.format(bn_fq1, bn_fq2)), 'w') as sf:
474 | print >> sf, 'Input files:'
475 | print >> sf, 'Read 1: {0}'.format(args.fastqs[::2])
476 | print >> sf, 'Read 2: {0}'.format(args.fastqs[1::2])
477 | print >> sf, 'QC statistics'
478 | print >> sf, 'Statistic\tRead 1\tRead 2'
479 |
480 | try:
481 | f_pct_hq1 = float(r1_stats['output_reads']) / \
482 | float(r1_stats['total_reads'])
483 | pct_hq1 = '{0:.2%}'.format(f_pct_hq1)
484 | except ZeroDivisionError:
485 | pct_hq1 = 'N/A'
486 | try:
487 | f_pct_hq2 = float(r2_stats['output_reads']) / \
488 | float(r2_stats['total_reads'])
489 | pct_hq2 = '{0:.2%}'.format(f_pct_hq2)
490 | except ZeroDivisionError:
491 | pct_hq2 = 'N/A'
492 | print >> sf, 'Percentage of HQ reads\t{0}\t{1}'.\
493 | format(pct_hq1, pct_hq2)
494 |
495 | print >> sf, 'Total number of reads\t{0}\t{1}'.format(
496 | r1_stats['total_reads'],
497 | r2_stats['total_reads'])
498 | print >> sf, 'Total number of HQ filtered reads\t{0}\t{1}'.\
499 | format(r1_stats['output_reads'], r2_stats['output_reads'])
500 | print >> sf, 'Detailed QC statistics'
501 | print >> sf, 'Reads passing filter\t{0}\t{1}'.\
502 | format(r1_stats['hq_reads'], r2_stats['hq_reads'])
503 |
504 | try:
505 | pct_rpf1 = '{0:.2%}'.format(float(r1_stats['hq_reads']) /
506 | float(r1_stats['total_reads']))
507 | except ZeroDivisionError:
508 | pct_rpf1 = 'N/A'
509 | try:
510 | pct_rpf2 = '{0:.2%}'.format(float(r2_stats['hq_reads']) /
511 | float(r2_stats['total_reads']))
512 | except ZeroDivisionError:
513 | pct_rpf2 = 'N/A'
514 | print >> sf, 'Percent reads passing filter\t{0}\t{1}'.\
515 | format(pct_rpf1, pct_rpf2)
516 |
517 | print >> sf, 'Max Trimmed Length\t{0}\t{1}'.\
518 | format(r1_stats['max_trimmed_length'],
519 | r2_stats['max_trimmed_length'])
520 | print >> sf, 'Min Trimmed Length\t{0}\t{1}'.\
521 | format(r1_stats['min_trimmed_length'],
522 | r2_stats['min_trimmed_length'])
523 | print >> sf, 'Mean Trimmed Length\t{0}\t{1}'.\
524 | format(r1_stats['mean_trimmed_length'],
525 | r2_stats['mean_trimmed_length'])
526 | print >> sf, 'Run start time\t{0}'.\
527 | format(datetime.datetime.strftime(start_time, '%H:%M:%S'))
528 | end_time = datetime.datetime.now()
529 | print >> sf, 'Run end time\t{0}'.\
530 | format(datetime.datetime.strftime(end_time, '%H:%M:%S'))
531 |
532 | if r1_stats['total_reads'] == 0 or r2_stats['total_reads'] == 0:
533 | # This will be the same as sys.exit(6)
534 | print >> sys.stderr, 'Failure: total reads == 0\nExiting' \
535 |                              ' with status 6'
536 | return 6
537 | if f_pct_hq1 < FastqRead.min_pct_hq_reads or \
538 | f_pct_hq2 < FastqRead.min_pct_hq_reads:
539 | # This will be the same effect as sys.exit(3)
540 | print >> sys.stderr, 'Failure: not enough high quality ' \
541 | 'read percent: e1: {}, e2: {} ' \
542 | 'required: {}\n' \
543 | 'Exiting with status 3'.\
544 | format(f_pct_hq1, f_pct_hq2, FastqRead.min_pct_hq_reads)
545 | return 3
546 | # Success!
547 | return 0
548 |
549 | def main():
550 | start_time = datetime.datetime.now()
551 | args = parse_args()
552 |
553 | # If we are doing paired end processing, make sure that we have
554 | # pairs (i.e., an even number of files, and split the list of
555 | # files into end-specific lists.
556 | num_fastqs = len(args.fastqs)
557 | paired_end = ((num_fastqs != 1) and (not args.single_end))
558 |
559 | if paired_end:
560 | # Paired end; need to be an even number of fastqs.
561 | if num_fastqs % 2 != 0:
562 | print >> sys.stderr, 'Odd number of fastq files ({0}) in ' \
563 | 'paired-end mode. Exiting...'.format(
564 | num_fastqs
565 | )
566 | sys.exit(4)
567 |
568 | # Now split the lists:
569 | e1_fastqs = args.fastqs[::2]
570 | e2_fastqs = args.fastqs[1::2]
571 | else:
572 | # Make a copy. We need the original later.
573 | e1_fastqs = args.fastqs[:]
574 | e2_fastqs = None
575 |
576 | r1 = FastqRead(e1_fastqs, args.odir, args.suffix)
577 |
578 | # We may be processing single end reads. Everything with r2 is
579 | # conditional on having a second fastq.
580 | if paired_end:
581 | r2 = FastqRead(e2_fastqs, args.odir, args.suffix)
582 |
583 | # Check if we want timestamps output to track progress
584 | if args.timestamp:
585 | r1.do_timestamp()
586 |
587 | # The criteria are class members, not instance.
588 | FastqRead.set_criteria(args.hq_pct, args.filter_hq, args.trim_hq,
589 | args.trim_5, args.min_len_pct,
590 | args.min_pct_hq_reads)
591 |
592 | r1_ok = False
593 |
594 | # If we don't have paired end reads, we just want the tests
595 | # below to care about end 1. In this case, initialize R2_ok to
596 | # True
597 | r2_ok = not paired_end
598 |
599 | # Loop over the whole file. We'll exit this with a break.
600 | while True:
601 | # Do NOT move these into the if statement below; we need to
602 | # keep them in sync. If they are in the if, and r1 fails,
603 | # r2 will not be executed.
604 | r1_ok = r1.next()
605 | if paired_end:
606 | r2_ok = r2.next()
607 | if not (r1_ok and r2_ok):
608 | # One or both files are exhausted. Must both end at the
609 | # same read.
610 | if r1_ok or (paired_end and r2_ok):
611 | print >> sys.stderr, \
612 | 'Input files {0} and {1} are different lengths.\n' \
613 | 'Exiting.'.format(
614 | r1.get_filename(),
615 | r2.get_filename())
616 | sys.exit(5)
617 | # Get the next files in the list to continue processing.
618 | # Since we ensured above that the lists
619 | # were the same length, we don't need to do equivalency
620 | # tests here. We can simply test for list exhaustion on r1
621 | # which works for both single and paired end. If it
622 | # succeeds and we're paired end, we can blindly get the next
623 | # end 2 file.
624 | r1_ok = r1.next_file()
625 | if not r1_ok:
626 | # We've exhausted the list of input files.
627 | break
628 | if paired_end:
629 | # Guaranteed to succeed: lists are equal length.
630 | r2.next_file()
631 |
632 | # Back to the top to get a read from the new files
633 | continue
634 |
635 | r1_ok = r1.filter()
636 | if paired_end:
637 | r2_ok = r2.filter()
638 | if not (r1_ok and r2_ok):
639 | # Filtering this read failed... Next!
640 | continue
641 |
642 | r1_ok = r1.trim()
643 | if paired_end:
644 | r2_ok = r2.trim()
645 | if not (r1_ok and r2_ok):
646 | # This read trimmed to be too short.
647 | continue
648 |
649 | r1.write()
650 | if paired_end:
651 | r2.write()
652 |
653 |
654 | if paired_end:
655 | status = output_stats_paired(r1, r2, args, start_time)
656 | r2.close()
657 | else:
658 | status = output_stats_single(r1, args, start_time)
659 | r1.close()
660 |
661 | return status
662 |
663 | if __name__ == '__main__':
664 | status = main()
665 | sys.exit(status)
666 |
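The command-line options above map directly onto the parameters described in the module
docstring. A hypothetical paired-end invocation (file names invented) that requires 70% HQ bases
per read, Phred 30 filter and trim thresholds, a 70% minimum post-trim length, and at least 50%
HQ reads overall:

    filter_trim.py -p 70 -f 30 -t 30 -m 70 -M 50 -d out_dir sample_R1.fastq.gz sample_R2.fastq.gz

Internally, set_criteria() converts the numeric thresholds to Phred+33 characters (quality 30
becomes chr(30 + 33), i.e. '?'), so the per-base checks in filter() and trim() are plain
character comparisons against the quality string.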
--------------------------------------------------------------------------------
/RNA_PDX/bin/lymphoma_classifier.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | Based on expected expression of a small set of genes, determine whether this
5 | supposed tumor sample has been converted into a lymphoma.
6 |
7 | We use two cut-offs, one for passaged tumors, the other for patient samples.
8 | """
9 | from __future__ import print_function
10 | import sys
11 | import requests
12 | import argparse
13 | import math
14 |
15 |
16 | def parse_args():
17 | parser = argparse.ArgumentParser()
18 |
19 | parser.add_argument('-t', '--tumor', default=3, type=int,
20 | help="Cut-off for tumor samples")
21 | parser.add_argument('-p', '--patient', default=7, type=int,
22 | help="Cut-off for patient samples")
23 | parser.add_argument('-o', '--output', default='lymphoma_score.txt',
24 | help="Output file into which the total z-score is "
25 | "written")
26 | parser.add_argument('normalized_counts',
27 | help="file containing gene name in column 0, and "
28 | "normalized gene count in column 3")
29 | parser.add_argument('expected_expression',
30 | help="file containing gene name in column 0, "
31 | "expected (average) expression in column 3, "
32 | "and standard deviation in column 4")
33 | parser.add_argument('sample_name',
34 | help="first part of the fastq file name, from which "
35 | "we extract the sample and model")
36 | return parser.parse_args()
37 |
38 |
39 | def get_expected(fn):
40 | d = {}
41 | for line in open(fn):
42 | # parts[0]: gene name
43 | # parts[1]: expected up/down regulation
44 | # parts[3]: average expression
45 | # parts[4]: stddev
46 | parts = [x.strip() for x in line.split()]
47 | d[parts[0]] = {'updown': float(parts[1]),
48 | 'average': float(parts[3]),
49 | 'stddev': float(parts[4])}
50 | return d
51 |
52 |
53 | def z_score(count, updown, average, stddev):
54 | arbitrary_scale_factor = 25.0
55 | l2 = math.log(count + 1.0, 2)
56 | z = updown * (l2 - average) / (arbitrary_scale_factor * stddev)
57 | return z
58 |
59 |
60 | def process_counts_file(fn, expected):
61 | z_total = 0.0
62 | for line in open(fn):
63 | parts = [x.strip() for x in line.split()]
64 | if parts[0] in expected:
65 | e = expected[parts[0]]
66 | z_total += z_score(float(parts[3]),
67 | e['updown'],
68 | e['average'],
69 | e['stddev'])
70 | return z_total
71 |
72 |
73 | def get_model_sample(name):
74 | parts = name.split('_')
75 | # After naming changes were put in place in early 2017, the model is
76 | # always first, and the sample is always second.
77 | model = parts[0]
78 | sample = parts[1]
79 | return model, sample
80 |
81 |
82 | def is_lymphoma(sample):
83 | """
84 | Try to determine whether this sample is from a model known (in ELIMS) to be
85 | a lymphoma. If any step fails, assume the model is not a known lymphoma.
86 | That is safe, because then we'll run the classifier code.
87 | :param sample: The sample name.
88 | :return: True if the model for this sample is known to be a lymphoma.
89 | """
90 | r = requests.get('http://pdx-dashboard.jax.org/elims/JSON/all',
91 | {'id': sample})
92 | if r.status_code != 200:
93 | print("In is_lymphoma(): Request for {0} failed with "
94 | "status {1}. Assuming not lymphoma.".format(sample, r.status_code),
95 | file=sys.stderr)
96 | return False
97 |
98 | j = r.json()
99 | data = j['data']
100 | if len(data) > 1:
101 | print("Multiple sample entries returned. Assuming not lymphoma",
102 | file=sys.stderr)
103 | return False
104 | details = data[0]['details']
105 | if len(details) > 1:
106 | print("Multiple sets of details returned. Assuming not lymphoma",
107 | file=sys.stderr)
108 | return False
109 | return 'lymphoma' in details[0]['clinical_diagnosis'].lower()
110 |
111 |
112 | def is_patient(sample):
113 | if sample.endswith('PT'):
114 | return True
115 | if sample[0] != 'J':
116 | return False
117 |
118 | # Last attempt: look up a J sample.
119 | r = requests.get('http://pdx-dashboard.jax.org/elims/JSON/all',
120 | {'id': sample})
121 | if r.status_code != 200:
122 |         print("In is_patient(): Request for {0} failed with status {1}. "
123 |               "Assuming not a patient sample.".format(sample, r.status_code), file=sys.stderr)
124 | return False
125 | j = r.json()
126 | data = j['data']
127 | if len(data) > 1:
128 | print("Multiple sample entries returned. Assuming not patient",
129 | file=sys.stderr)
130 | return False
131 | summary = data[0]['summary']
132 | return summary['passage'] == 'Patient'
133 |
134 |
135 | def main():
136 | print("Starting lymphoma_classifier")
137 | print("Starting lymphoma_classifier", file=sys.stderr)
138 | args = parse_args()
139 | assume_statuses = False
140 | try:
141 | # We are processing data from outside organizations, which don't use
142 |         # our naming schemes. Handle it gracefully by assuming that this
143 |         # is not a lymphoma model and not a patient sample.
144 | model, sample = get_model_sample(args.sample_name)
145 | except:
146 | assume_statuses = True
147 |
148 | # This whole thing doesn't matter if the model is a lymphoma. Look it
149 | # up, based on the sample name.
150 | if (not assume_statuses) and is_lymphoma(sample):
151 | msg = "Sample is from lymphoma model. Not evaluating."
152 | print(msg, file=open(args.output, 'w'))
153 | print(msg)
154 | print(msg, file=sys.stderr)
155 | return 0
156 |
157 | expected = get_expected(args.expected_expression)
158 | z_total = process_counts_file(args.normalized_counts, expected)
159 | print(z_total, file=open(args.output, 'w'))
160 | if (not assume_statuses) and is_patient(sample):
161 | cutoff = args.patient
162 | else:
163 | cutoff = args.tumor
164 | print("Finished lymphoma_classifier")
165 | print("Finished lymphoma_classifier", file=sys.stderr)
166 | if z_total > cutoff:
167 | return 1
168 | return 0
169 |
170 | if __name__ == '__main__':
171 |     sys.exit(main())
172 |
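To make the scoring concrete, here is a small sketch of the per-gene z-score computed by
z_score() above, using invented numbers; the per-gene scores are summed over the counts file and
the total is compared against the tumor or patient cut-off:

    import math

    # Hypothetical values for a single gene (not real expected-expression data)
    count = 1023.0    # normalized count (column 3 of the counts file)
    updown = 1.0      # expected direction of regulation
    average = 5.0     # expected mean log2 expression
    stddev = 0.2      # expected standard deviation

    l2 = math.log(count + 1.0, 2)                  # log2(1024) == 10.0
    z = updown * (l2 - average) / (25.0 * stddev)  # (10 - 5) / 5 == 1.0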
--------------------------------------------------------------------------------
/RNA_PDX/bin/read_group_from_fastq.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | """
4 | read_group_from_fastq.py
5 |
6 | Input: the fastq file given as the first positional command line argument.
7 |     Handles compressed or uncompressed fastqs.
8 | Output: the second positional argument, if specified; otherwise sys.stdout.
9 |
10 | Notes:
11 | We will usually be handling standard Illumina Casava 1.8+ output, which
12 | has a regular file naming format and read name format. If any of the
13 | steps here fail, cause the pipeline to fail rather than producing
14 | untraceable output.
15 | """
16 |
17 | import sys
18 | import os
19 | import re
20 | import time
21 | import gzip
22 | import argparse
23 |
24 | try:
25 | import bz2file as bz2
26 | except ImportError:
27 | import bz2
28 |
29 |
30 | def parse_args():
31 | parser = argparse.ArgumentParser(version='V2.0')
32 | parser.add_argument('-p', '--picard', action='store_true',
33 | help="Use Picard format for read group line")
34 | parser.add_argument('-t', '--tumor', action='store_true',
35 | help="Sample is tumor in a tumor/normal pair")
36 | parser.add_argument('-n', '--normal', action='store_true',
37 | help="Sample is normal in a tumor/normal pair")
38 | parser.add_argument('fastq',
39 | help="Path to fastq file for sample")
40 | parser.add_argument('output', nargs='?',
41 | help="Output file name [STDOUT]")
42 |
43 | args = parser.parse_args()
44 |
45 | if args.tumor:
46 | if args.normal:
47 | # Check for a conflict.
48 | parser.error("Must not specify both --tumor and --normal.")
49 | args.sample_type = "Tumor_"
50 | elif args.normal:
51 | args.sample_type = "Normal_"
52 | else:
53 | args.sample_type = ""
54 |
55 | return args
56 |
57 |
58 | def multi_open(name):
59 | if name.endswith('.gz'):
60 | f = gzip.open(name)
61 | elif name.endswith('.bz2'):
62 | f = bz2.BZ2File(name)
63 | else:
64 | f = open(name)
65 | return f
66 |
67 |
68 | def make_fake(args):
69 | """
70 | If we can't get adequate data from the file, use timestamps.
71 | :return:
72 | """
73 | # Sleep for 2 seconds, to make sure that a previous invocation
74 | # will have a different time stamp.
75 | time.sleep(2)
76 |
77 | ts = time.strftime('%H%M%S')
78 |
79 | id = 'ID_' + ts
80 | lb = 'LIB_' + ts
81 | sm = 'SAMPLE_' + ts
82 | bc = 'RUN_' + ts
83 | output(id, lb, sm, bc, args)
84 | sys.exit(0)
85 |
86 |
87 | def main():
88 | #cga_version.parse_options()
89 |
90 | args = parse_args()
91 |
92 | # First get the info from the filename
93 | fn = os.path.split(args.fastq)[1]
94 |
95 | if 'fastq' not in fn and 'fq' not in fn:
96 | print >> sys.stderr, "Not seemingly a fastq file:", fn
97 | make_fake(args)
98 | # Does not return...
99 |
100 | # Now split the basename portion into its constituent parts.
101 | fn_parts = fn.split('_')
102 |
103 | # Scan for the "GES" starting a filename part. If found,
104 | # That separates the Sample name portion from the Library name.
105 | # If GES is not found starting a part, use the whole filename
106 | # as both the Sample name and the Library name.
107 | # Maybe redo this with regular expressions, but for now, it works.
108 | pos = -1
109 | for n in range(len(fn_parts)):
110 | if fn_parts[n].startswith("GES"):
111 | pos = n
112 | break
113 | if pos == -1:
114 |         # Didn't find the GES marker. Use the filename up to the R1/R2 marker.
115 | match = re.search('(.*)[._]R[12]_.*',fn)
116 | if match is not None:
117 | fn = match.group(1)
118 | else:
119 | # something is seriously odd here, but we'll just use the
120 | # whole filename
121 | pass
122 |
123 | cust_id = ges_id = fn
124 | else:
125 | cust_id = '_'.join(fn_parts[:pos])
126 | ges_parts = fn_parts[pos:]
127 | pos = 999 # Way bigger than the number of parts we'll see.
128 | for n in range(len(ges_parts)):
129 | if ges_parts[n] == 'R1' or ges_parts[n] == 'R2':
130 | pos = n
131 | break
132 | ges_id = '_'.join(ges_parts[:pos])
133 |
134 | # Sanity check that we have some amount of text for our fields. The
135 | # down stream tools can't tolerate empty fields in the read group
136 | # information.
137 | if not ges_id:
138 | ges_id = fn
139 |
140 | if not cust_id:
141 | cust_id = ges_id
142 |
143 | # Now the parts from the first readname--the first line of the file.
144 | # When split on ':', the readname contains
145 | # - the ID in the first four fields.
146 | # Note: the leading '@' needs to be stripped.
147 | try:
148 |         inf = multi_open(args.fastq)
149 | line = inf.readline()
150 | except IOError, e:
151 |         print >> sys.stderr, "Couldn't read the file: {0}\n {1}". \
152 | format(fn, e.message)
153 | make_fake(args)
154 | # Does not return
155 |
156 | # Example line:
157 | # @HISEQ2000:190:D19U8ACXX:5:1101:1492:1901 1:N:0:TAGCTT
158 | parts = line[1:].strip().split(' ')
159 | read_name = parts[0]
160 |
161 | # Example read_name: HISEQ2000:190:D19U8ACXX:5:1101:1492:1901
162 | rparts = read_name.split(':')
163 | if len(rparts) >= 4:
164 | rparts = rparts[:4]
165 |
166 | # Try to add the bar code in:
167 | bar_code = "no_barcode"
168 | if len(parts) >= 2:
169 | # Example comment: 1:N:0:TAGCTT
170 | comment = parts[1]
171 | cparts = comment.split(':')
172 | if len(cparts) == 4:
173 | bar_code = cparts[3]
174 | rparts.append(bar_code)
175 |
176 | id = ':'.join(rparts)
177 | # Example id: HISEQ2000:190:D19U8ACXX:5:TAGCTT
178 |
179 | output(id, ges_id, cust_id, bar_code, args)
180 |
181 | def output(id, ges_id, cust_id, bar_code, args):
182 | if args.output is not None:
183 | of = open(args.output, 'w')
184 | else:
185 | of = sys.stdout
186 |
187 | if args.picard:
188 | line = 'RGID={0}{1} RGLB={0}{2} ' \
189 | 'RGPL=ILLUMINA RGSM={3} RGPU={4}'.\
190 | format(args.sample_type, id, ges_id, cust_id, bar_code)
191 | else :
192 | line = '@RG\\tID:{0}{1}\\tLB:{0}{2}\\tSM:{3}\\tPL:ILLUMINA'.\
193 | format(args.sample_type, id, ges_id, cust_id)
194 | # This needs to be a single line file; no terminating \n
195 | print >> of, line,
196 | if of != sys.stdout:
197 | of.close()
198 |
199 | if __name__ == '__main__':
200 | main()
201 |
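As a concrete illustration of the emitted read group line, using the example read name from the
comments above and hypothetical filename-derived fields:

    sample_type = ''                              # neither --tumor nor --normal given
    rg_id = 'HISEQ2000:190:D19U8ACXX:5:TAGCTT'    # first four read-name fields plus the barcode
    ges_id = 'GES15_01234'                        # hypothetical library id parsed from the filename
    cust_id = 'SAMPLE_01'                         # hypothetical sample id parsed from the filename

    # Default (non-Picard) mode writes a single line with escaped tabs:
    line = '@RG\\tID:{0}{1}\\tLB:{0}{2}\\tSM:{3}\\tPL:ILLUMINA'.format(
        sample_type, rg_id, ges_id, cust_id)
    # -> @RG\tID:HISEQ2000:190:D19U8ACXX:5:TAGCTT\tLB:GES15_01234\tSM:SAMPLE_01\tPL:ILLUMINA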
--------------------------------------------------------------------------------
/RNA_PDX/bin/summary_QC_metrics.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | use strict;
3 | use warnings;
4 |
5 | open(FILEIN1, $ARGV[0]) || die "cannot open the file"; ####filter *_stat
6 | open(FILEIN2, $ARGV[1]) || die "cannot open the file"; ####Xenome Stats
7 | open(FILEIN3, $ARGV[2]) || die "cannot open the file"; ####rsem aln stat
8 | open(FILEIN4, $ARGV[3]) || die "cannot open the file"; ####picard stats
9 |
10 | my ($value1, $value2, $value3);
11 |
12 |
13 | while(my $readFile1 = <FILEIN1>)
14 | {
15 | if($readFile1 =~ /^\s*Percentage of HQ reads\s+(.*?)\s+(.*)/)
16 | {
17 | $value1 = $1;
18 | }
19 | elsif($readFile1 =~ /^\s*Total number of reads\s+(.*?)\s+(.*).*$/)
20 | {
21 | $value2 = $1;
22 |
23 | }
24 | elsif($readFile1 =~ /^\s*Total number of HQ filtered reads\s+(.*?)\s+(.*).*$/)
25 | {
26 | $value3 = $1;
27 | }
28 |
29 | }
30 |
31 | print "Total number of Read Pairs\t$value2\n";
32 | print "Total number of HQ filtered reads\t$value3\n";
33 | print "Percentage of HQ read Pairs\t$value1\n";
34 |
35 | my $flag = 0;
36 |
37 |
38 | while(my $readFile2 = <FILEIN2>)
39 | {
40 | if(($readFile2 =~ /^\s*count\s+percent\s+class.*$/) && ($flag == 0))
41 | {
42 | $flag = 1;
43 | }
44 | elsif(($flag == 1) && ($readFile2 =~ /^\s*(.*?)\s+(.*?)\s+(.*)$/))
45 | {
46 | print "Xenome $3\t$1\t$2\n";
47 | }
48 | elsif(($flag == 1) && ($readFile2 =~ /^\s*(.*?)\s+(.*?)\s+(.*)$/))
49 | {
50 | print "Xenome $3\t$1\t$2\n";
51 | }
52 | elsif(($flag == 1) && ($readFile2 =~ /^\s*(.*?)\s+(.*?)\s+(.*)$/))
53 | {
54 | print "Xenome $3\t$1\t$2\n";
55 | }
56 | elsif(($flag == 1) && ($readFile2 =~ /^\s*(.*?)\s+(.*?)\s+(.*)$/))
57 | {
58 | print "Xenome $3\t$1\t$2\n";
59 | }
60 | elsif(($flag == 1) && ($readFile2 =~ /^\s*(.*?)\s+(.*?)\s+(.*)$/))
61 | {
62 | print "Xenome $3\t$1\t$2\n";
63 | }
64 | }
65 |
66 | $flag = 0;
67 |
68 |
69 | while(my $readFile3 = <FILEIN3>)
70 | {
71 | if(($readFile3 =~ /^\s*(\d+)\s+.*$/) && ($flag == 0))
72 | {
73 | print "Total number of input reads for RSEM transcriptome Alignment\t$1\n";
74 | $flag = 1;
75 | }
76 | elsif(($readFile3 =~ /^\s*(.*?)\s+\((.*)\).*$/) && ($flag == 1))
77 | {
78 | print "Total number of paired reads for RSEM transcriptome Alignment\t$1\t$2\n";
79 | $flag = 2;
80 | }
81 | elsif(($readFile3 =~ /^\s*(.*?)\s+\((.*)\).*$/) && ($flag == 2))
82 | {
83 | print "Total number of reads aligned concordantly 0 times from RSEM transcriptome Alignment\t$1\t$2\n";
84 | $flag = 3;
85 | }
86 | elsif(($readFile3 =~ /^\s*(.*?)\s+\((.*)\).*$/) && ($flag == 3))
87 | {
88 | print "Total number of reads aligned concordantly exactly 1 time from RSEM transcriptome Alignment\t$1\t$2\n";
89 | $flag = 4;
90 | }
91 | elsif(($readFile3 =~ /^\s*(.*?)\s+\((.*)\).*$/) && ($flag == 4))
92 | {
93 | print "Total number of reads aligned concordantly >1 time from RSEM transcriptome Alignment\t$1\t$2\n";
94 | $flag = 5;
95 | }
96 | elsif(($readFile3 =~ /^\s*(.*?)\%\s+.*$/) && ($flag == 5))
97 | {
98 | print "Overall alignment rate\t$1\%\n";
99 | }
100 |
101 |
102 | }
103 |
104 | $flag = 0;
105 |
106 | my @splitHeader = ();
107 | my @splitValue = ();
108 |
109 | while(my $readFile4 = <FILEIN4>)
110 | {
111 |
112 | if(($readFile4 =~ /^\s*##\s+METRICS\s+CLASS\s+picard.analysis.RnaSeqMetrics.*$/) && ($flag == 0))
113 | {
114 | $flag = 1;
115 | next;
116 | }
117 | elsif(($readFile4 =~ /^\s*PF_BASES.*$/) && ($flag == 1))
118 | {
119 | $flag = 2;
120 | chomp $readFile4;
121 | @splitHeader = split("\t", $readFile4);
122 | next;
123 | }
124 | elsif($flag == 2)
125 | {
126 | $flag = 3;
127 | chomp $readFile4;
128 | @splitValue = split("\t", $readFile4);
129 | next;
130 | }
131 |
132 | }
133 |
134 | for (my $i =0; $i<=$#splitHeader-3; $i++)
135 | {
136 | print "$splitHeader[$i]\t$splitValue[$i]\n";
137 |
138 | }
139 |
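The four positional arguments must be supplied in the order of the open() calls at the top of
the script: filter/trim stats, Xenome stats, RSEM alignment stats, and Picard RnaSeqMetrics
output. A hypothetical invocation with invented file names:

    perl summary_QC_metrics.pl sample_stat sample_xenome_stats.txt sample_rsem_aln_stats.txt \
        sample_picard_metrics.txt > sample_summary_QC_metrics.txt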
--------------------------------------------------------------------------------
/RNA_PDX/classifier_and_coverage.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 | Run a classifier and compute exon-level coverage statistics from the alignment file
7 |
8 |
17 |
18 |
19 | samtools/0.1.18
20 | bedtools/2.25.0
21 | python/2.7.3
22 |
23 |
24 |
25 |
26 |
27 | -o {out_1} {in_1} {in_2} {in_3}
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/RNA_PDX/picard_alignment_metrics.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Collecting Alignment metrics.
4 |
5 |
20 |
21 | java/1.8.0
22 | samtools/0.1.18
23 | python/2.7.9
24 | R/3.1.1
25 | picard/2.8.1/picard.jar
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 | -Djava.io.tmpdir=$TMPDIR
34 | -Xmx4g -jar /opt/compsci/picard/2.8.1/picard.jar AddOrReplaceReadGroups
35 | INPUT={in_1} OUTPUT={out_1}
36 | SORT_ORDER=coordinate
37 | {rg}
38 | CREATE_INDEX=true
39 |
40 |
41 |
42 |
43 |
44 | -Djava.io.tmpdir=$TMPDIR
45 | -Xmx4g -jar /opt/compsci/picard/2.8.1/picard.jar ReorderSam
46 | INPUT={out_1}
47 | OUTPUT={out_2}
48 | REFERENCE={in_3}
49 | CREATE_INDEX=true
50 |
51 |
52 |
53 |
54 |
55 | -Djava.io.tmpdir=$TMPDIR
56 | -Xmx8g -jar /opt/compsci/picard/2.8.1/picard.jar SortSam SO=coordinate
57 | INPUT={out_2}
58 | OUTPUT={out_3}
59 | VALIDATION_STRINGENCY=SILENT
60 | CREATE_INDEX=true
61 |
62 |
63 |
64 |
65 |
66 | -Djava.io.tmpdir=$TMPDIR
67 | -Xmx4g -jar /opt/compsci/picard/2.8.1/picard.jar CollectRnaSeqMetrics
68 | I={out_3}
69 | O={out_4}
70 | REF_FLAT={in_4}
71 | RIBOSOMAL_INTERVALS={in_5} {SEQ_STRAND}
72 | CHART_OUTPUT={out_5}
73 |
74 |
75 |
76 |
--------------------------------------------------------------------------------
/RNA_PDX/qual_statistics.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Checks paired end fastq files for overall quality, and performs
5 | some statistical analysis.
6 | Terminates the run if fewer than 50% of the reads are HQ.
7 |
8 |
9 |
24 |
25 |
26 | python/2.7.3
27 |
28 |
29 |
30 |
31 |
32 |
33 | filter_trim.py --version
34 | {qual_cutoff} -d {in_3} {in_1} {in_2}
35 |
36 |
37 |
--------------------------------------------------------------------------------
/RNA_PDX/read_group.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Process a fastq file to extract read group information.
4 |
5 |
6 |
11 |
12 | python/2.7.3
13 |
14 |
15 |
16 | read_group_from_fastq.py --version
17 | {in_1} {out_1} -p
18 |
19 |
20 |
--------------------------------------------------------------------------------
/RNA_PDX/rsem_alignment.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 | Align a pair of (filtered, trimmed) fastq files. Any quality checking and trimming
7 | should be done before this step.
8 |
9 |
17 |
18 |
19 | samtools/0.1.18
20 | bowtie2/2.2.0
21 | rsem/1.2.19
22 |
23 |
24 |
25 | base=`basename {out_1} .genome.sorted.bam`
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | -p 8 {phredquals} {seed-length} {strand-specific} --time --output-genome-bam --bowtie2 --paired-end {in_1} {in_2} {in_3} $base
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/RNA_PDX/summary_metrics.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Collecting Summary metrics.
4 |
5 |
6 |
15 |
16 | perl
17 |
18 |
19 | {in_1} {in_2} {in_3} {in_4}
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/RNA_PDX/xenome_classification_RNA.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 | Classify input reads into mouse and human subsets (also both,
6 | ambiguous, and neither). Terminate the run if there are fewer than N
7 | human reads. N defaults to 1,000,000.
8 |
9 |
22 |
23 |
24 | xenome/1.0.0
25 | perl
26 | python/2.7.3
27 |
28 |
29 |
30 |
31 | {threads} -P
32 | {in_3}
33 | --pairs --host-name mouse --graft-name human -i {in_1} -i {in_2}
34 |
35 |
36 |
37 | {min_reads} *xenome_stats.txt
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------