├── .gitignore
├── Analysis
    ├── TCGA_24_manuscript_analysis.Rmd
    └── TCGA_24_manuscript_analysis.html
├── Analysis_datasets
    ├── .RData
    ├── .Rhistory
    ├── 10_14_predictions_raw
    │   ├── RSEM_q_log_200_f
    │   │   ├── REPORT.html
    │   │   ├── model.txt
    │   │   ├── parameters.txt
    │   │   ├── predictions.png
    │   │   ├── probabilities.txt
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    │   ├── TPM_q_log_200_f
    │   │   ├── REPORT.html
    │   │   ├── model.txt
    │   │   ├── parameters.txt
    │   │   ├── predictions.png
    │   │   ├── probabilities.txt
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    │   └── fpkm_q_log_200_f
    │   │   ├── REPORT.html
    │   │   ├── model.txt
    │   │   ├── parameters.txt
    │   │   ├── predictions.png
    │   │   ├── probabilities.txt
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    ├── 5_01_predictions_raw
    │   ├── fpkmlog_no
    │   │   ├── REPORT.html
    │   │   ├── predictions.png
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    │   ├── rsem
    │   │   ├── REPORT.html
    │   │   ├── predictions.png
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    │   ├── rsem_no
    │   │   ├── REPORT.html
    │   │   ├── predictions.png
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    │   └── tpmlog_no
    │   │   ├── REPORT.html
    │   │   ├── predictions.png
    │   │   ├── signature.cdt
    │   │   ├── signature.png
    │   │   └── signature_s.cdt
    ├── Classification_12_LUAD_LUSC_Predictions.txt
    ├── Classification_20_LUAD_LUSC_Predictions.txt
    ├── GFP18_HER2_Rsubread_FPKM.txt
    ├── GFP18_HER2_Rsubread_TPM.txt
    ├── GFP18_HER2_Rsubread_geneCounts.txt
    ├── GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt
    ├── GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt
    ├── PANCAN12_19583_by_3380_numZeroes.txt
    ├── PANCAN20_19583_by_3380_numZeroes.txt
    ├── Rsem_10_14.txt
    ├── TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt
    └── rsubread_10_14.txt
├── Codes
    ├── BuildMatrixFile.py
    ├── CalcAUC.R
    ├── CalcAccuracy.R
    ├── Classify_luad_vs_lusc.R
    ├── CombineScalarValues.py
    ├── FileContainsText.py
    ├── GetFileExtension.py
    ├── IdentifyDiscordantPredictions.R
    ├── IdentifyInconsistentPredictions.R
    ├── LUSC_vs_LUAD.R
    ├── ParseCgHubQueryResults.py
    ├── ParseSampleTypes.py
    ├── PeekMatrix.py
    ├── PlotDiscordant.R
    ├── PrintMatrixDimensions.py
    ├── ProcessClinicalData.R
    ├── ProcessRnaSeqFeatureCounts.R
    ├── Split.py
    ├── TransposeData.py
    ├── biological_rep.R
    ├── numZero.R
    └── utilities.py
├── LICENSE
├── README.md
├── Scripts
    ├── LUSC_LUAD_discordant_analysis
    ├── normalize_tcga_rsubread
    ├── process_tcga_level_3
    ├── process_tcga_rsubread
    └── summarize_tcga_rsubread
├── TCGA_CancerType_Abbreviations.txt
└── TCGA_CancerType_Publishable.txt


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.txt
 2 | FeatureCounts
 3 | FPKM
 4 | FPKMlog
 5 | TPM
 6 | TPMlog
 7 | DownloadSamples
 8 | Temp
 9 | CancerTypes
10 | InProgress
11 | go
12 | Query.xml
13 | *.key
14 | XmlFiles
15 | Genome
16 | *.jar
17 | temp*
18 | Scripts/*_rsubread2
19 | Codes/ProcessRnaSeqFeatureCounts2.R
20 | nohup*
21 | Stats
22 | update_git
23 | commit_git
24 | Analysis/*_cache
25 | Analysis/*_cache/*
26 | Analysis/*_files
27 | Analysis/*_files/*
28 | Codes/ForMoom
29 | Codes/ForMoom/*
30 | Analysis/*20*
31 | 


--------------------------------------------------------------------------------
/Analysis_datasets/.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/.RData


--------------------------------------------------------------------------------
/Analysis_datasets/.Rhistory:
--------------------------------------------------------------------------------
  1 | for (i in 1:4)
  2 | + {
  3 | +   hist(iris[,i], main=colnames(iris)[i])
  4 | + }
  5 | dskasd
  6 | for (i in 1:4)
  7 | hist(iris[,i], main=colnames(iris)[i])
  8 | par(mfrow=c(2,2))
  9 | for (i in 1:4)
 10 | hist(iris[,i], main=colnames(iris)[i])
 11 | library("knitr", lib.loc="~/Library/R/3.1/library")
 12 | install.packages(c("car", "colorspace", "manipulate", "Rcpp", "RcppArmadillo"))
 13 | install.packages("mgcv", lib="/Library/Frameworks/R.framework/Versions/3.1/Resources/library")
 14 | source('~/.active-rstudio-document')
 15 | date: March 9, 2015
 16 | date: 3/9/2015
 17 | date:
 18 | #date:
 19 | date: 03-19-2015
 20 | legend("topleft", legend=levels(iris$Species), col=levels(iris$Species))
 21 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
 22 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)))
 23 | print(1:length(levels(iris$Species)))
 24 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
 25 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lty=1, lwd=1)
 26 | print(1:length(levels(iris$Species)))
 27 | par(mfrow=c(2,2))
 28 | for (i in 1:(ncol(iris) - 1))
 29 | hist(iris[,i], main=colnames(iris[i]), xlab="centimeters")
 30 | par(mfrow=c(1,1))
 31 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
 32 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lty=1, lwd=1)
 33 | print(1:length(levels(iris$Species)))
 34 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
 35 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lwd=1)
 36 | print(1:length(levels(iris$Species)))
 37 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
 38 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20)
 39 | print(1:length(levels(iris$Species)))
 40 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20)
 41 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20)
 42 | print(1:length(levels(iris$Species)))
 43 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20)
 44 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.5)
 45 | print(1:length(levels(iris$Species)))
 46 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.5)
 47 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20)
 48 | print(1:length(levels(iris$Species)))
 49 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
 50 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
 51 | print(1:length(levels(iris$Species)))
 52 | library("lattice", lib.loc="/Library/Frameworks/R.framework/Versions/3.1/Resources/library")
 53 | install.packages("lattice")
 54 | print(head(irisData))
 55 | irisData = select(iris, -Species)
 56 | print(head(irisData))
 57 | irisData = select(iris, -Species)
 58 | #print(head(irisData))
 59 | library(dplyr)
 60 | irisData = select(iris, -Species)
 61 | #print(head(irisData))
 62 | library(dplyr)
 63 | install.packages("dplyr")
 64 | library(dplyr)
 65 | irisData = select(iris, -Species)
 66 | print(head(irisData))
 67 | library(dplyr)
 68 | irisData = select(iris, -Species)
 69 | head(irisData)
 70 | irisData = scale(irisData)
 71 | head(irisData)
 72 | boxplot(irisData)
 73 | irisData = scale(irisData)
 74 | head(irisData)
 75 | boxplot(irisData)
 76 | boxplot(irisData$Petal.Width~iris$Species)
 77 | for (i in 1:(ncol(iris) - 1))
 78 | boxplot(irisData[,i] ~ iris$Species, main=colnames(iris)[i], ylab="centimeters")
 79 | pcIrisData = prcomp(irisData)
 80 | pcIrisData
 81 | pcIrisData = prcomp(t(irisData))
 82 | pcIrisData
 83 | pcIrisData = prcomp(irisData)
 84 | pcIrisData
 85 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st Principal Component", ylab="2nd Principal Component", pch=20, cex=1.25)
 86 | #ggplot(pcIrisData, aes(x=PC1, y=PC2, color=Species)) + geom_point()
 87 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st principal component", ylab="2nd principal component", pch=20, cex=1.25, main="Principal Components for iris data")
 88 | legend("topleft", legend=levels(iris$Species), col=iris$Species, pch=20, cex=1.25)
 89 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
 90 | legend("topleft", legend=levels(iris$Species), col=iris$Species, pch=20, cex=1.25)
 91 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
 92 | legend("topleft", legend=levels(iris$Species), col=levels(iris$Species), pch=20, cex=1.25)
 93 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
 94 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
 95 | #ggplot(pcIrisData, aes(x=PC1, y=PC2, color=Species)) + geom_point()
 96 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st principal component", ylab="2nd principal component", pch=20, cex=1.25, main="Principal Components for iris data")
 97 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
 98 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
 99 | legend("topright", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
100 | plot(iris$Petal.Length, iris$Petal.Width)
101 | plot(iris$Petal.Length, iris$Petal.Width, main="Basic scatterplot")
102 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=10)
103 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=12)
104 | ```
105 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=111)
106 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=18)
107 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15)
108 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15, col="green")
109 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15, col="red")
110 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", xlab="Petal Length", ylab="Petal Width")
111 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=3, xlab="Petal Length", ylab="Petal Width")
112 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
113 | plot(iris$Petal.Length, iris$Petal.Width, main="Basic scatterplot of petal features")
114 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Sepal characteristics among species", pch=20, cex=1.25)
115 | legend("topright", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
116 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
117 | model <- lm(iris$Petal.Length ~ iris$Petal.Width)
118 | abline(model, lwd = 2)
119 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
120 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
121 | abline(model, lwd = 2)
122 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
123 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
124 | abline(model, lwd = 2)
125 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
126 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
127 | abline(model, lwd = 4)
128 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
129 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
130 | abline(model, lwd = 4, lty=2)
131 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=18)
132 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=18, col="red", cex=1.5, xlab="Petal Length", ylab="Petal Width")
133 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=18, col="red", cex=1.5, xlab="Petal Length", ylab="Petal Width")
134 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
135 | abline(model, lwd = 3, lty=2, col="gray")
136 | abline(model, lwd = 3, lty=2, col="darkgray")
137 | pca$rotation
138 | pcIrisData$rotation
139 | percent <- 100 * pcIrisData$sdev^2 / sum(pca$sdev^2)
140 | percent
141 | percent <- 100 * pcIrisData$sdev^2 / sum(pcIrisData$sdev^2)
142 | percent
143 | barplot(percent)
144 | barplot(percent, names.arg=1:4, xlab="Principal Component", ylab="% variance explained")
145 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
146 | xlim(-1.,1.25) +
147 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
148 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
149 | ggplot(rotation_data) +
150 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
151 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
152 | xlim(-1.,1.25) +
153 | ylim(-1.,1.) +
154 | coord_fixed() # fix aspect ratio to 1:1
155 | library(ggplot)
156 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
157 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
158 | ggplot(rotation_data) +
159 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
160 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
161 | xlim(-1.,1.25) +
162 | ylim(-1.,1.) +
163 | coord_fixed() # fix aspect ratio to 1:1
164 | library(ggplot2)
165 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
166 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
167 | ggplot(rotation_data) +
168 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
169 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
170 | xlim(-1.,1.25) +
171 | ylim(-1.,1.) +
172 | coord_fixed() # fix aspect ratio to 1:1
173 | library(graphics)
174 | library(ggplot2)
175 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
176 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
177 | ggplot(rotation_data) +
178 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
179 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
180 | xlim(-1.,1.25) +
181 | ylim(-1.,1.) +
182 | coord_fixed() # fix aspect ratio to 1:1
183 | library(graphics)
184 | library(ggplot2)
185 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
186 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
187 | #ggplot(rotation_data) +
188 | #  geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
189 | #  geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
190 | #  xlim(-1.,1.25) +
191 | #  ylim(-1.,1.) +
192 | #  coord_fixed() # fix aspect ratio to 1:1
193 | library(ggplot2)
194 | library(grid)
195 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
196 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
197 | #ggplot(rotation_data) +
198 | #  geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
199 | #  geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
200 | #  xlim(-1.,1.25) +
201 | #  ylim(-1.,1.) +
202 | #  coord_fixed() # fix aspect ratio to 1:1
203 | library(ggplot2)
204 | library(grid)
205 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
206 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
207 | ggplot(rotation_data) +
208 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
209 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
210 | xlim(-1.,1.25) +
211 | ylim(-1.,1.) +
212 | coord_fixed() # fix aspect ratio to 1:1
213 | ?prcomp
214 | iris
215 | ?kmeans
216 | install.packages("useful")
217 | plot(k1, data=iris)
218 | source('~/.active-rstudio-document')
219 | k1 <- kmeans(x=iris[, 1:4], centers=3)
220 | plot(k1)
221 | plot(k1, data=iris)
222 | k1 <- kmeans(x=iris[, 1:4], centers=3)
223 | library(useful)
224 | plot(k1)
225 | irisData = iris[,-5] # Negative sign excludes the specified column
226 | head(irisData)
227 | irisData = scale(irisData)
228 | head(irisData)
229 | boxplot(irisData)
230 | ?subset
231 | ?c
232 | x = rep(1, 1000)
233 | x
234 | hist(x)
235 | plot(density(x))
236 | ls()
237 | library(dplyr)
238 | ?inner_join
239 | setwd("~/GitRepos/TCGA_RNASeq_clinical/Analysis_datasets")
240 | setwd("TCGA_RNASeq_clinical/Analysis_datasets")
241 | setwd("~/GitRepos/TCGA_RNASeq_clinical/Analysis_datasets")
242 | getwd()
243 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
244 | # Rsubread pipeline, gene counts
245 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
246 | # TCGA pipeline, normalized expression files
247 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
248 | # Rsubread pipeline, FPKM values
249 | rsub_fpkm<-read.table("GFP18_HER2_Rsubread_FPKM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
250 | rsub_fpkmlog<-log2(rsub_fpkm+1)
251 | # Rsubread pipeline, TPM values
252 | rsub_tpm<-read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
253 | rsub_tpmlog<-log2(rsub_tpm+1)
254 | # Clinical data
255 | clinicals<-t(read.delim('TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt',sep='\t',header=1, row.names=1,check.names=F)) # This was downloaded from GEO Accession # GSE62820 and unzipped
256 | ```
257 | # TCGA pipeline, expected counts
258 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
259 | # Rsubread pipeline, gene counts
260 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
261 | # TCGA pipeline, normalized expression files
262 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
263 | # Rsubread pipeline, FPKM values
264 | rsub_fpkm<-read.table("GFP18_HER2_Rsubread_FPKM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
265 | rsub_fpkmlog<-log2(rsub_fpkm+1)
266 | # Rsubread pipeline, TPM values
267 | rsub_tpm<-read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
268 | rsub_tpmlog<-log2(rsub_tpm+1)
269 | # Clinical data
270 | clinicals<-t(read.delim('TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt',sep='\t',header=1, row.names=1,check.names=F)) # This was downloaded from GEO Accession # GSE62820 and unzipped
271 | rsub_preds<-read.table("rsubread_10_14.txt", sep='\t', header=1, row.names=1)
272 | tcga_preds<-read.table("Rsem_10_14.txt", sep='\t', header=1, row.names=1)
273 | pancan12_zero<-read.table("PANCAN12_19583_by_3380_numZeroes.txt",row.names=1,sep='\t')# File is at Analysis_datasets
274 | pancan20_tpm_zero<-read.table("PANCAN20_19583_by_3380_numZeroes.txt",sep='\t',row.names=1)# File is at Analysis_datasets
275 | data12 = read.table("Classification_12_LUAD_LUSC_Predictions.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE, row.names=1) # File is at Analysis_datasets
276 | data20 = read.table("Classification_20_LUAD_LUSC_Predictions.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE, row.names=1)# File is at Analysis_datasets
277 | # Standardized mean difference between two groups: (m.1 - m.2) divided by the pooled standard deviation. NOTE(review): originally described as Hedges' formula, but no small-sample correction factor is applied here - confirm the intended statistic.
278 | standardized_mean<-function(m.1,sd.1,n.1,m.2,sd.2,n.2){ # m.*, sd.*, n.*: mean, standard deviation and sample size of group 1 and group 2
279 | sd_pooled=sqrt(((n.1-1)*sd.1^2+(n.2-1)*sd.2^2)/(n.1+n.2-2)) # pooled SD: each group's variance is weighted by (n - 1)
280 | (m.1-m.2)/sd_pooled # value of the last expression is the function's return value
281 | }
282 | # Merges x and y (by default on row names, by=0), uses the merged key column as the row names of the result, and returns the result without that first key column.
283 | merge_drop<-function(x,y,by=0)
284 | {
285 | new_m<-merge(x,y,by=by)
286 | rownames(new_m)<-new_m$Row.names # NOTE(review): relies on the Row.names column that merge() adds when merging on row names; confirm callers never pass another `by`
287 | return(new_m[,2:length(colnames(new_m))]) # keep columns 2..last, i.e. drop the first (key) column
288 | }
289 | # Plots a ROC curve for the actual classes versus the predicted probabilities (via pROC) and writes the AUC with its confidence interval onto the plot; plotCI=TRUE additionally draws the confidence shape and bars computed by ci.se().
290 | plotROC = function(actual, probabilities, plotCI=FALSE)
291 | {
292 | # Margin order for par(mar=...): bottom, left, top, right
293 | par(mar=c(4.5, 4.7, 0.0, 0.5),lwd=4)
294 | library(pROC)
295 | roc_result = roc(actual ~ probabilities, ci=TRUE, plot=TRUE, print.auc=FALSE) # draws the curve immediately (plot=TRUE); the AUC label is added manually below
296 | lowerBoundAuc = format(roc_result$ci[1], digits=3) # ci[1], ci[2], ci[3] are used as lower bound, AUC estimate and upper bound
297 | midAuc = format(roc_result$ci[2], digits=3)
298 | upperBoundAuc = format(roc_result$ci[3], digits=3)
299 | if (plotCI)
300 | {
301 | ci(roc_result) # NOTE(review): the result is neither stored nor used here
302 | sens.ci <- ci.se(roc_result)
303 | plot(sens.ci, type="shape", col="gray95")
304 | plot(sens.ci, type="bars")
305 | plot(roc_result, add=TRUE) # redraw the curve on top of the confidence shading
306 | }
307 | text(0.5, 0.00, labels=paste("AUC: ", midAuc, " (", lowerBoundAuc, "-", upperBoundAuc, ")", sep=""))
308 | par(mar=c(5.1, 4.1, 2.1, 2.1)) # sets margins to fixed values rather than the caller's previous ones; the lwd=4 set above is not reset
309 | }
310 | ##########computing the empirical cumulative distribution per sample, overlaid on the same graph########
311 | ###using TCGA pipelined aligned data
312 | ecdf_all_ex<-apply(log2(rsem_her2_expected_counts+1),2,ecdf)
313 | par( mfrow = c( 1, 2 ) )
314 | plot(ecdf_all_ex[[1]],xlab="log2(Total mapped reads)",ylab="Cumulative proportion",col="blue",main="TCGA pipeline",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5)
315 | legend(10,10,c("GFP", "HER2"), col = c("blue","brown"))
316 | for(i in 2:12){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="blue")}
317 | for(i in 13:17){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="brown")}
318 | ###using Rsubread pipeline aligned data
319 | ecdf_all<-apply(log2(feature+1),2,ecdf)
320 | plot(ecdf_all[[1]],xlab="log2(Total mapped reads)",ylab="Cumulative proportion",col="blue",main="Rsubread pipeline",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5)
321 | for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")}
322 | for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")}
323 | ############computing total number of read counts per samples and plotting them as dot plots####
324 | expected_counts<-apply(rsem_her2_expected_counts,2,sum)
325 | feature_counts<-apply(feature,2,sum)
326 | # Creating a plot showing total mapped reads per sample
327 | par( mfrow = c( 1, 2 ),lwd=4 )
328 | x = c(rep(1, 12), rep(2, 5)) # this indicates where on the x axis to plot
329 | par(mar=c(3.1, 4.6, 2.1, 0.6)) # figure margins
330 | boxplot(log2(expected_counts[1:12]+1), log2(expected_counts[13:17]+1),range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,lwd=4,ylim=c(20,25),xlab="", ylab="log2(Total mapped reads)",main="TCGA Pipeline",col='grey75',medcol="grey75",lwd=4,border = "grey35")
331 | points(jitter(x, factor=2), c(log2(expected_counts[1:12]+1), log2(expected_counts[13:17]+1)), pch=4, cex=2, col=1,  xaxt="n",cex.lab=1.5)
332 | axis(1, at=1:2, tick=T, labels=c("Control", "HER2"), cex.axis=1.5)
333 | boxplot(log2(feature_counts[1:12]+1), log2(feature_counts[13:17]+1),range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,lwd=4,ylim=c(20,25),xlab="", ylab="log2(Total mapped reads)",col='grey75',medcol="grey75",lwd=4,main="Rsubread Pipeline",border = "grey35")
334 | points(jitter(x, factor=1.5), c(log2(feature_counts[1:12]+1), log2(feature_counts[13:17]+1)), pch=4,cex=2,cex.lab=1.5,col="black")
335 | axis(1, at=1:2, tick=T, labels=c("Control", "HER2"), cex.axis=1.5)
336 | #######Boxplotting ERBB2 gene counts in HMEC samples#####
337 | par(mfrow = c(1, 1),lwd=4)
338 | names=c('TCGA\nGFP','TCGA\nHER2','Rsubread\nGFP', 'Rsubread\nHER2')
339 | rsem_her2<-data.frame(t(rsem_her2_expected_counts["ERBB2",]))
340 | rsub_her2<-data.frame(t(feature["ERBB2",]))
341 | x = c(rep(1, 12), rep(2, 5),rep(3, 12), rep(4, 5))
342 | boxplot(log2(rsem_her2$ERBB2[1:12]+1),log2(rsem_her2$ERBB2[13:17]+1),log2(rsub_her2$ERBB2[1:12]+1),log2(rsub_her2$ERBB2[13:17]+1),ylab="",range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,col='grey75',medcol="grey75",lwd=4,main=paste('Comparing TCGA and Rsubread Pipelines','\n', 'in Differentiating HER2 Overexpression from Controls',sep=''),border = "grey35") # log2(count+1) of ERBB2 per group: columns 1:12 vs 13:17 for each pipeline; title typo "TGCA" corrected to "TCGA"
343 | points(jitter(rep(1,12),factor=2),log2(rsem_her2$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5,col="black")
344 | points(jitter(rep(2,5),factor=2),log2(rsem_her2$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5,col='black')
345 | points(jitter(rep(3,12),factor=2),log2(rsub_her2$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5,col='black')
346 | points(jitter(rep(4,5),factor=2),log2(rsub_her2$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5,col='black')
347 | axis(1, at=1:4, tick=T, labels=c("TCGA\nControl", "TCGA\nHER2","Rsubread\nControl", "Rsubread\nHER2"), cex.axis=0.8)
348 | ##using data processed by RSEM detected difference in her2 gene count in HER2 overexpressed versus GFP overexpressed samples
349 | ##t = -12.1833, df = 4.157, p-value = 0.0002081 but was worse than Rsubread
350 | t.test(log2(rsem_her2$ERBB2[1:12]+1),log2(rsem_her2$ERBB2[13:17]+1))
351 | ##using not normalized data processed by Rsubread was much better at detecting difference in her2 gene count in HER2 overexpressed versus GFP overexpressed samples
352 | ##t = -46.6747, df = 8.35, p-value = 2.152e-11
353 | t.test(log2(rsub_her2$ERBB2[1:12]+1),log2(rsub_her2$ERBB2[13:17]+1))
354 | ###########here we are computing standardized mean difference using the expected gene counts from TCGA pipeline and gene counts from Rsubread algorithm ############
355 | ####Hedge's standardized mean/effect size using TCGA pipeline
356 | standardized_mean(mean(log2(rsem_her2$ERBB2[13:17]+1)),sd(log2(rsem_her2$ERBB2[13:17]+1)),5,mean(log2(rsem_her2$ERBB2[1:12]+1)),sd(log2(rsem_her2$ERBB2[1:12]+1)),12)
357 | ####Hedge's standardized mean/effect size using Rsubread pipeline
358 | standardized_mean(m.1=mean((log2(rsub_her2$ERBB2[13:17]+1))),sd.1=sd((log2(rsub_her2$ERBB2[13:17]+1))),n.1=5,m.2=mean((log2(rsub_her2$ERBB2[1:12]+1))),sd.2=sd((log2(rsub_her2$ERBB2[1:12]+1))),n.2 = 12)
359 | #######################comparing gene counts results ############
360 | par( mfrow = c( 1,3 ) ,lwd=4)
361 | TCGA_her2_filtered<-TCGA_her2[!duplicated(TCGA_her2$Gene),]
362 | rownames(TCGA_her2_filtered)<-TCGA_her2_filtered$Gene
363 | TCGA_her2<-subset(TCGA_her2_filtered,select=-Gene)
364 | TCGA_her2_log2<-log2(subset(TCGA_her2_filtered,select=-Gene)+1)
365 | ###Coefficient of variation in GFP samples across all common genes
366 | ####Coefficient of variation in TCGA pipeline processed data
367 | com_genes_TCGA<-TCGA_her2[rownames(TCGA_her2)%in%rownames(rsub_fpkm),]
368 | hist(na.omit(apply(com_genes_TCGA,1,sd)/apply(com_genes_TCGA,1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20)
369 | hist(na.omit(apply(com_genes_TCGA[,1:12],1,sd)/apply(com_genes_TCGA[,1:12],1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20)
370 | print(paste("Coefficient of variation in TCGA Level 3 data across 19585 genes in the control samples:",median(na.omit(apply(com_genes_TCGA[,1:12],1,sd)/apply(com_genes_TCGA[,1:12],1,mean))),sep=" "))
371 | hist(na.omit(apply(com_genes_TCGA[,13:17],1,sd)/apply(com_genes_TCGA[,13:17],1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20)
372 | print(paste("Coefficient of variation in TCGA Level 3 data across 19585 genes in the HER2-overexpressed samples:",median(na.omit(apply(com_genes_TCGA[,13:17],1,sd)/apply(com_genes_TCGA[,13:17],1,mean))),sep=" "))
373 | tcga_her2_normalized<-data.frame(t(TCGA_her2["ERBB2",]))
374 | ####Coefficient of variation in Rsubread pipeline processed data
375 | com_genes_fpkm<-rsub_fpkm[rownames(rsub_fpkm)%in%rownames(com_genes_TCGA),]
376 | hist(na.omit(apply(com_genes_fpkm[,13:17],1,sd)/apply(com_genes_fpkm[,13:17],1,mean)),main = "Rsubread FPKM",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes",breaks=20)
377 | print(paste("Coefficient of variation in Rsubread FPKM normalized data across 19585 genes in the control samples:",median((na.omit(apply(com_genes_fpkm[,1:12],1,sd)/apply(com_genes_fpkm[,1:12],1,mean)))),sep=''))
378 | print(paste("Coefficient of variation in Rsubread FPKM normalized data across 19585 genes in the HER2-overexpressed samples:",median((na.omit(apply(com_genes_fpkm[,13:17],1,sd)/apply(com_genes_fpkm[,13:17],1,mean)))),sep=''))
379 | rsub_fpkmlog_her2<-data.frame(t(rsub_fpkmlog["ERBB2",]))
380 | rsub_fpkm_her2<-data.frame(t(rsub_fpkm["ERBB2",]))
381 | com_genes_tpm<-rsub_tpm[rownames(rsub_tpm)%in%rownames(com_genes_TCGA),] # TPM rows whose gene is also in com_genes_TCGA; subset rsub_tpm itself (was rsub_fpkm, so the "Rsubread TPM" statistics below were computed on FPKM values)
382 | hist(na.omit(apply(com_genes_tpm[,13:17],1,sd)/apply(com_genes_tpm[,13:17],1,mean)),main = "Rsubread TPM",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes")
383 | print(paste("Coefficient of variation in Rsubread TPM normalized data across 19585 genes in the control samples:",median((na.omit(apply(com_genes_tpm[,1:12],1,sd)/apply(com_genes_tpm[,1:12],1,mean)))),sep=''))
384 | print(paste("Coefficient of variation in Rsubread TPM normalized data across 19585 genes in the HER2-overexpressed samples:",median((na.omit(apply(com_genes_tpm[,13:17],1,sd)/apply(com_genes_tpm[,13:17],1,mean)))),sep=''))
385 | rsub_tpm_her2<-data.frame(t(rsub_tpm["ERBB2",]))
386 | rsub_tpmlog_her2<-data.frame(t(rsub_tpmlog["ERBB2",]))
#######post normalization ecdf
###One empirical CDF per sample (column). The TCGA values are log2(x+1)-transformed here;
###the Rsubread *log matrices are passed to ecdf as they are.
ecdf_all_ex<-apply(log2(TCGA_her2+1),2,ecdf)
###Three panels side by side: TCGA Level 3, Rsubread FPKM, Rsubread TPM.
###In every panel samples 1:12 are drawn in blue and samples 13:17 in brown.
par(mfrow=c(1,3))
###FIX: removed the stray trailing comma that passed an empty argument to plot()
plot(ecdf_all_ex[[1]],xlab=NA,ylab=NA,col="blue",main="TCGA Level 3",ylim=c(0,1),xlim=c(0,20),cex.axis=1.5,cex.lab=1.5)
for(i in 2:12){
  lines(ecdf_all_ex[[i]],xlab=NA,ylab=NA,col="blue")
}
for(i in 13:17){
  lines(ecdf_all_ex[[i]],xlab=NA,ylab=NA,col="brown")
}
###using Rsubread pipeline aligned data
ecdf_all<-apply(rsub_fpkmlog,2,ecdf)
plot(ecdf_all[[1]],col="blue",main="Rsubread FPKM",ylim=c(0,1),xlim=c(0,20),cex.axis=1.5,cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion")
for(i in 2:12){
  lines(ecdf_all[[i]],xlab=NA,ylab=NA,col="blue")
}
for(i in 13:17){
  lines(ecdf_all[[i]],xlab=NA,ylab=NA,col="brown")
}
ecdf_all_t<-apply(rsub_tpmlog,2,ecdf)
plot(ecdf_all_t[[1]],col="blue",main="Rsubread TPM",ylim=c(0,1),xlim=c(0,20),cex.axis=1.5,cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion")
for(i in 2:12){
  lines(ecdf_all_t[[i]],xlab=NA,ylab=NA,col="blue")
}
for(i in 13:17){
  lines(ecdf_all_t[[i]],xlab=NA,ylab=NA,col="brown")
}
###Creating boxplots of the normalized ERBB2 expression
par( mfrow = c( 1, 1 ) )
par(mar=c(5, 4.5, 3.5, 0.5))
###Six groups in plotting order: for each pipeline, samples 1:12 (labelled GFP) and then
###samples 13:17 (labelled HER2). TCGA values are log2(x+1)-transformed here; the Rsubread
###*log values are used as they are.
erbb2_groups<-list(log2(tcga_her2_normalized$ERBB2[1:12]+1),
                   log2(tcga_her2_normalized$ERBB2[13:17]+1),
                   rsub_fpkmlog_her2$ERBB2[1:12],
                   rsub_fpkmlog_her2$ERBB2[13:17],
                   rsub_tpmlog_her2$ERBB2[1:12],
                   rsub_tpmlog_her2$ERBB2[13:17])
###range=0 extends the whiskers to the data extremes; outpch=NA suppresses outlier symbols
boxplot(erbb2_groups,ylab="log2(HER2 gene expression values)",main="Comparing HER2 normalized expression between\n control and her2 samples",range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,col='grey75',medcol="grey75",lwd=4,border = "grey35")
###FIX: group labels spelled "Rsubred" corrected to "Rsubread"
names=c("TCGA\nGFP","TCGA\nHER2","Rsubread FPKM\nGFP", "Rsubread FPKM\nHER2","Rsubread TPM\nGFP", "Rsubread TPM\nHER2")
###Rotated labels drawn below the plot region (xpd=TRUE allows drawing outside it)
text(seq(1,6,by=1),par("usr")[3] - 2, labels = names, srt = 45, pos = 1, xpd = TRUE)
###Overlay the individual samples of each group, jittered horizontally around the box position
for(g in seq_along(erbb2_groups)){
  points(jitter(rep(g,length(erbb2_groups[[g]])),factor=2),erbb2_groups[[g]],pch=4,cex=2,cex.lab=1.5)
}
###t.test to see if there is significance
###Each test compares ERBB2 expression in samples 1:12 against samples 13:17 for one pipeline
t.test(log2(tcga_her2_normalized$ERBB2[1:12]+1),log2(tcga_her2_normalized$ERBB2[13:17]+1))
t.test(rsub_fpkmlog_her2$ERBB2[1:12],rsub_fpkmlog_her2$ERBB2[13:17])
t.test(rsub_tpmlog_her2$ERBB2[1:12],rsub_tpmlog_her2$ERBB2[13:17])
###Standardized mean difference: TCGA pipeline normalized ERBB2 expression values
###Group 1 is samples 13:17 (n=5), group 2 is samples 1:12 (n=12)
standardized_mean(m.1=mean((log2(tcga_her2_normalized$ERBB2[13:17]+1))),sd.1=sd((log2(tcga_her2_normalized$ERBB2[13:17]+1))),n.1=5,m.2=mean((log2(tcga_her2_normalized$ERBB2[1:12]+1))),sd.2=sd((log2(tcga_her2_normalized$ERBB2[1:12]+1))),n.2=12)
###Standardized mean difference: Rsubread pipeline FPKM normalized ERBB2 expression values
###NOTE(review): positional arguments are assumed to follow the m.1, sd.1, n.1, m.2, sd.2, n.2 order used above - confirm against standardized_mean's definition
standardized_mean(mean(rsub_fpkmlog_her2$ERBB2[13:17]),sd(rsub_fpkmlog_her2$ERBB2[13:17]),5,mean(rsub_fpkmlog_her2$ERBB2[1:12]),sd(rsub_fpkmlog_her2$ERBB2[1:12]),12)
###Standardized mean difference:Rsubread pipeline TPM normalized ERBB2 expression values
###FIX: the fifth argument now uses the TPM values; the original passed the FPKM standard deviation here
standardized_mean(mean(rsub_tpmlog_her2$ERBB2[13:17]),sd(rsub_tpmlog_her2$ERBB2[13:17]),5,mean(rsub_tpmlog_her2$ERBB2[1:12]),sd(rsub_tpmlog_her2$ERBB2[1:12]),12)
###Label the single column of each table so the two sources can be told apart after merging
colnames(pancan20_tpm_zero) <- "TPM"
colnames(pancan12_zero) <- "PANCAN12"
all_zeros <- merge_drop(pancan12_zero, pancan20_tpm_zero)
#3380 samples are common

###Two histograms side by side on identical axes, each with a dashed red line at its median
par(mfrow = c(1, 2), lwd = 4)

h1 <- hist(all_zeros$PANCAN12, xlab = '', ylab = '', main = '',
           xlim = c(0, 8000), ylim = c(0, 800), lwd = 4, breaks = 25)
pancan12_median <- median(all_zeros$PANCAN12)
abline(v = pancan12_median, col = "red", lty = 2)

h2 <- hist(all_zeros$TPM, xlab = '', ylab = '', main = '',
           xlim = c(0, 8000), ylim = c(0, 800), lwd = 4, breaks = 25)
tpm_median <- median(all_zeros$TPM)
abline(v = tpm_median, col = "red", lty = 2)

###Two-sample t-test comparing the two columns
t.test(all_zeros$PANCAN12,all_zeros$TPM)
#############Predicted HER2 pathway activity analysis#############################################
###Combine the Rsubread and TCGA predictions (by=0 merges on row names)
all_preds <- merge_drop(rsub_preds, tcga_preds, by = 0)
###Breast samples only, keeping the patient barcode and the IHC HER2 status
brca_clinical <- subset(clinicals,
                        clinicals[, 'tumor_tissue_site'] == 'Breast',
                        select = c("bcr_patient_barcode", "her2_status_by_ihc"))
common_all <- merge_drop(all_preds, brca_clinical, by = 0)
###Keep only the samples whose IHC status is Negative or Positive
all_preds_pos_neg <- subset(common_all,
                            common_all$her2_status_by_ihc %in% c("Negative", "Positive"))
###Rank each of the three prediction columns, then re-attach columns 4:5
all_ranked <- apply(all_preds_pos_neg[, 1:3], 2, rank)
all <- cbind(all_ranked, all_preds_pos_neg[, 4:5])
###Split the ranked table by IHC status
ihc_pos <- subset(all, all$her2_status_by_ihc == "Positive")
ihc_neg <- subset(all, all$her2_status_by_ihc == "Negative")
##############boxplot of ranked estimated HER2 pathway activity
##in TCGA BRCA samples####
par(mfrow = c(1, 1))
par(mar=c(5, 4.6, 2.5, 0.6)) # figure margins
###Boxes are ordered HER2(+) then HER2(-) for each of the three prediction columns.
###FIX: cex.axis, cex.lab and outpch were each passed twice with conflicting values; only the
###first occurrence of each is kept, which is the value the original call is expected to have used.
boxplot(ihc_pos$Rsem_log_q_200_f,ihc_neg$Rsem_log_q_200_f,
        ihc_pos$FPKM_log_q_200_f,ihc_neg$FPKM_log_q_200_f,
        ihc_pos$TPM_log_q_200_f,ihc_neg$TPM_log_q_200_f,
        cex.axis=1.5, cex.lab=1.5,outpch=NA,range=0,col='grey75',medcol="grey5",lwd=4,border = "grey5",
        main="Comparison of rank-based estimate \nof HER2 activation",ylab="Ranked HER2 prediction")
###FIX: group labels spelled "Rsubred" corrected to "Rsubread"
names=c("TCGA\nLevel3\nHER2(+)","TCGA\nLevel3\nHER2(-)","Rsubread\nFPKM\nHER2(+)", "Rsubread\nFPKM\nHER2(-)","Rsubread\nTPM\nHER2(+)", "Rsubread\nTPM\nHER2(-)")
###Rotated labels drawn below the plot region (xpd=TRUE allows drawing outside it)
text(seq(1,6,by=1),par("usr")[3] - 4.5, labels = names, srt = 45, pos = 1, xpd = TRUE)
###Split the merged (un-ranked) predictions by IHC HER2 status
ihc_neg_t<-subset(common_all,common_all$her2_status_by_ihc=="Negative")
ihc_pos_t<-subset(common_all,common_all$her2_status_by_ihc=="Positive")
##coefficient of variation in TCGA pipeline processed HER2 predictions
##(sd/mean of the prediction column within each IHC group; messages now use a single-space separator throughout)
print(paste("Coefficient of variation in TCGA pipeline processed HER2 predictions in HER2(-) BRCA samples",sd(ihc_neg_t$Rsem_log_q_200_f)/mean(ihc_neg_t$Rsem_log_q_200_f),sep=' '))
print(paste("Coefficient of variation in TCGA pipeline processed HER2 predictions in HER2(+) BRCA samples",sd(ihc_pos_t$Rsem_log_q_200_f)/mean(ihc_pos_t$Rsem_log_q_200_f),sep=' '))
##coefficient of variation in Rsubread FPKM pipeline processed HER2 predictions
print(paste("Coefficient of variation in Rsubread FPKM processed HER2 predictions in HER2(-) BRCA samples",sd(ihc_neg_t$FPKM_log_q_200_f)/mean(ihc_neg_t$FPKM_log_q_200_f),sep=' '))
print(paste("Coefficient of variation in Rsubread FPKM processed HER2 predictions in HER2(+) BRCA samples",sd(ihc_pos_t$FPKM_log_q_200_f)/mean(ihc_pos_t$FPKM_log_q_200_f),sep=' '))
##coefficient of variation in Rsubread TPM pipeline processed HER2 predictions
print(paste("Coefficient of variation in Rsubread TPM processed HER2 predictions in HER2(-) BRCA samples",sd(ihc_neg_t$TPM_log_q_200_f)/mean(ihc_neg_t$TPM_log_q_200_f),sep=' '))
print(paste("Coefficient of variation in Rsubread TPM processed HER2 predictions in HER2(+) BRCA samples",sd(ihc_pos_t$TPM_log_q_200_f)/mean(ihc_pos_t$TPM_log_q_200_f),sep=' '))
##Calculating standardized mean differences between the HER2(+) and HER2(-) groups
##Group 1 is HER2(+), group 2 is HER2(-); group sizes are taken from the data.
##FIX: "predicrion" corrected to "prediction"; the TPM message was mislabelled "FPKM TPM data".
print(paste("Standardized mean difference in prediction between HER2 (+) and HER2 (-) samples for TCGA Level 3 data :",standardized_mean(m.1=mean(ihc_pos_t$Rsem_log_q_200_f),sd.1=sd(ihc_pos_t$Rsem_log_q_200_f),n.1=length(ihc_pos_t$Rsem_log_q_200_f),m.2=mean(ihc_neg_t$Rsem_log_q_200_f),sd.2=sd(ihc_neg_t$Rsem_log_q_200_f),n.2=length(ihc_neg_t$Rsem_log_q_200_f)),sep=' '))
print(paste("Standardized mean difference in prediction between HER2 (+) and HER2 (-) samples for Rsubread FPKM data :",standardized_mean(m.1=mean(ihc_pos_t$FPKM_log_q_200_f),sd.1=sd(ihc_pos_t$FPKM_log_q_200_f),n.1=length(ihc_pos_t$FPKM_log_q_200_f),m.2=mean(ihc_neg_t$FPKM_log_q_200_f),sd.2=sd(ihc_neg_t$FPKM_log_q_200_f),n.2=length(ihc_neg_t$FPKM_log_q_200_f)),sep=' '))
print(paste("Standardized mean difference in prediction between HER2 (+) and HER2 (-) samples for Rsubread TPM data :",standardized_mean(m.1=mean(ihc_pos_t$TPM_log_q_200_f),sd.1=sd(ihc_pos_t$TPM_log_q_200_f),n.1=length(ihc_pos_t$TPM_log_q_200_f),m.2=mean(ihc_neg_t$TPM_log_q_200_f),sd.2=sd(ihc_neg_t$TPM_log_q_200_f),n.2=length(ihc_neg_t$TPM_log_q_200_f)),sep=' '))
## t-tests comparing HER(+) and HER(-) prediction
## (the trailing p-values are the ones recorded alongside the original calls)
t.test(ihc_pos_t$Rsem_log_q_200_f,ihc_neg_t$Rsem_log_q_200_f)# For TCGA Level 3: p-value = 2.009e-05
t.test(ihc_pos_t$FPKM_log_q_200_f,ihc_neg_t$FPKM_log_q_200_f)#For Rsubread FPKM: p-value = 1.493e-10
t.test(ihc_pos_t$TPM_log_q_200_f,ihc_neg_t$TPM_log_q_200_f)#For Rsubread TPM:p-value = 3.197e-12
###ROC curves for the LUAD vs LUSC classification, one full-size plot per data source
par(mfrow = c(1, 1), lwd = 4)

###data12: actual class and LUAD probability columns, plotted as "TCGA Level 3"
predictions12 <- data12$LUAD_Probability
actual12 <- data12$ActualClass
auc <- plotROC(actual12, predictions12, TRUE)
title("TCGA Level 3 LUAD vs LUSC")

###data20: same columns, plotted as "Rsubread TPM"; auc is overwritten by this second call
predictions20 <- data20$LUAD_Probability
actual20 <- data20$ActualClass
auc <- plotROC(actual20, predictions20, TRUE)
title("Rsubread TPM LUAD vs LUSC")
478 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>GFP_RSEM_log_10_9_filtered.txt</SPAN> and <SPAN STYLE=background-color:yellow>HER2_RSEM_log_10_9_filtered.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_RSEM_log_BRCA_10_9_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
12 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 | </UL>
14 | <P>
15 | <H3>II.  Results</H3>
16 | <TABLE BORDER=0 CELLSPACING=10>
17 | <TR>
18 | <TD COLSPAN=2 ALIGN=CENTER>
19 | <H3>200 Genes, 2 Metagenes</H3>
20 | </TD>
21 | </TR>
22 | <TR>
23 | <TD>
24 | <A HREF=signature.png>
25 | <IMG HEIGHT=480 SRC=signature.png>
26 | </A>
27 | </TD><TD>
28 | <A HREF=predictions.png>
29 | <IMG HEIGHT=480 SRC=predictions.png>
30 | </A>
31 | </TD>
32 | </TR>
33 | <TR>
34 | <TD VALIGN=TOP>
35 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 5 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | </TD><TD VALIGN=TOP>
37 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
38 | probabilities.txt
39 | </A>.
40 | </TD>
41 | </TR>
42 | </TABLE>
43 | <P>
44 | <HR>
45 | <EM>This analysis was run on Monday, 13 October 2014, 11:34 PM on adira.genetics.utah.edu.  It took 12m 2s to complete.
46 | </EM>
47 | </BODY>
48 | </HTML>
49 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/model.txt:
--------------------------------------------------------------------------------
  1 | Name	Coefficient
  2 | Intercept	4.524853
  3 | ERBB2	0.164782
  4 | HSPA7	-0.125612
  5 | GDF6	-0.111343
  6 | HSPA6	-0.097087
  7 | CCL2	-0.093873
  8 | CXCL10	-0.092074
  9 | LOC338651	0.079326
 10 | TNFSF14	-0.073710
 11 | CD248	-0.059249
 12 | IFIT1	-0.057644
 13 | DNAJA4	-0.053322
 14 | GNAO1	-0.050292
 15 | CRHR1	0.048706
 16 | EEF1A2	0.045896
 17 | HSPA1B	-0.045632
 18 | CCL20	-0.044527
 19 | TNFAIP2	-0.044330
 20 | LOC91948	0.042751
 21 | ATP6V0A4	0.038768
 22 | CFB	-0.037830
 23 | CALB2	0.036782
 24 | PADI1	0.035659
 25 | PDGFB	0.034971
 26 | LOC285629	-0.034876
 27 | CRYAB	-0.032468
 28 | GABRA2	0.030593
 29 | SOD2	-0.028653
 30 | ULBP1	-0.028346
 31 | KRT18	0.028246
 32 | GPR1	-0.027639
 33 | CXCL5	-0.027617
 34 | EPHA3	-0.026868
 35 | IL8	-0.025943
 36 | EPHA4	-0.025735
 37 | TLR3	-0.025646
 38 | HSPB8	-0.025054
 39 | RPSAP52	0.024980
 40 | RGS2	-0.024874
 41 | SLC2A12	-0.024861
 42 | KRT19	0.024626
 43 | TRANK1	-0.024277
 44 | MGP	0.023918
 45 | SAA1	-0.023534
 46 | SHC4	0.022446
 47 | KITLG	-0.022152
 48 | KRT8	0.022084
 49 | CGNL1	-0.021984
 50 | MYCL1	-0.021942
 51 | ANGPTL4	0.021650
 52 | PARP9	-0.021303
 53 | DNAJB4	-0.021262
 54 | SPON1	0.021236
 55 | PIK3C2B	-0.021143
 56 | PARP14	-0.021042
 57 | SERPINB1	0.020839
 58 | CXCL2	-0.020713
 59 | SERPINB13	-0.020613
 60 | SNX9	0.020262
 61 | TRIM22	-0.020121
 62 | DNAJB1	-0.019926
 63 | KANK4	-0.019885
 64 | GBP6	-0.019667
 65 | MLPH	0.019478
 66 | APOL6	-0.019334
 67 | OAS3	-0.019302
 68 | HSP90AA1	-0.019165
 69 | KRT81	0.019156
 70 | GM2A	-0.019126
 71 | ENGASE	-0.017973
 72 | KRT75	0.017856
 73 | CBLC	0.017765
 74 | CCNA1	0.017623
 75 | FERMT2	0.017321
 76 | CEACAM1	0.017130
 77 | SLC13A5	0.017066
 78 | MTSS1L	-0.017003
 79 | TCF4	-0.016884
 80 | PLAUR	0.016528
 81 | GPR110	0.016330
 82 | TP53AIP1	-0.016244
 83 | APAF1	0.016161
 84 | HSPH1	-0.016115
 85 | RAB6B	0.016005
 86 | LOXL4	0.015594
 87 | OSBP2	0.015384
 88 | HSPA8	-0.015298
 89 | UNC5B	-0.015048
 90 | RASA3	0.014898
 91 | KCNN4	0.014783
 92 | ANPEP	0.014734
 93 | AMACR	-0.014480
 94 | ZC3HAV1	-0.014280
 95 | COBLL1	-0.014277
 96 | ECT2	0.014259
 97 | SMURF2	0.014218
 98 | CBR1	-0.014049
 99 | TUFT1	0.013455
100 | C1R	-0.013313
101 | SESN2	-0.013303
102 | TWF2	0.013165
103 | INPP4B	0.013134
104 | SMO	-0.013129
105 | ITGB3	0.013106
106 | CAST	0.013084
107 | FBXW7	-0.013061
108 | VASP	0.012979
109 | SASH1	-0.012828
110 | MT2A	0.012725
111 | NAV3	0.012684
112 | NET1	0.012572
113 | CGN	0.012481
114 | SYTL2	-0.012440
115 | CYBASC3	-0.012341
116 | ST3GAL4	0.012295
117 | TNS3	-0.012073
118 | BCAR3	0.011678
119 | SEC24D	0.011623
120 | DTX4	-0.011553
121 | PYGB	0.011389
122 | MYO1E	0.011297
123 | PTPRE	0.011089
124 | GFPT1	0.011087
125 | ACTB	0.011033
126 | STIM2	-0.011012
127 | XPC	-0.011008
128 | MFI2	0.010950
129 | NFATC3	-0.010879
130 | C19orf66	-0.010511
131 | PDZD2	-0.010452
132 | ARHGEF2	0.010354
133 | TRIOBP	0.010316
134 | SLC34A2	-0.010288
135 | FRMD4A	-0.010219
136 | MAP3K2	-0.010081
137 | NPAS2	0.010074
138 | IGFL3	-0.009956
139 | ARHGAP12	0.009927
140 | SH2D3A	0.009911
141 | NAV2	-0.009866
142 | SMOC1	0.009764
143 | HERPUD1	0.009567
144 | WDR1	0.009562
145 | RASA1	0.009529
146 | MBD4	-0.009337
147 | PLEK2	0.009276
148 | BCAP29	0.009270
149 | ATG16L1	0.009237
150 | LDB1	-0.009222
151 | NCDN	-0.009177
152 | NEK9	-0.009083
153 | CSGALNACT2	0.009018
154 | ATP1B1	-0.008895
155 | APBB2	-0.008881
156 | CAPN2	0.008880
157 | CALM2	0.008674
158 | TRAFD1	-0.008589
159 | PGM1	0.008555
160 | FGFR2	-0.008354
161 | DOPEY1	-0.008331
162 | NISCH	-0.008191
163 | PI4KB	-0.008141
164 | TOR3A	-0.007819
165 | LRIG3	0.007766
166 | POLR2A	-0.007749
167 | NEU1	-0.007665
168 | KPNA4	0.007656
169 | PIK3CD	0.007606
170 | ANKRD13A	-0.007496
171 | TBRG1	-0.007462
172 | EPS15	0.007458
173 | TRIM5	-0.007361
174 | PCSK7	-0.007332
175 | ANKFY1	-0.007320
176 | C20orf194	0.007244
177 | C19orf42	-0.007162
178 | ITGA5	0.007095
179 | ARHGEF12	-0.006996
180 | STK40	-0.006932
181 | MLLT6	-0.006786
182 | C1orf85	-0.006767
183 | PTPN12	0.006480
184 | MAP2K4	-0.006351
185 | ZNF532	-0.006134
186 | AFAP1L2	0.006103
187 | ARID1B	-0.005924
188 | SEC14L1	0.005811
189 | PLEKHA6	-0.005776
190 | ELOVL1	0.005764
191 | CLASP1	-0.005727
192 | SMEK1	-0.005478
193 | NUMA1	-0.005168
194 | ZMYND8	0.005151
195 | PDXK	-0.005071
196 | MYO10	0.004929
197 | UBP1	-0.004780
198 | RCC2	0.004742
199 | SGK1	0.004731
200 | RFWD3	-0.004666
201 | C20orf3	-0.004354
202 | WDR91	-0.004333
203 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/parameters.txt:
--------------------------------------------------------------------------------
 1 | NAME	VALUE
 2 | Binreg Version	2
 3 | Genes	200
 4 | Metagenes	2
 5 | Strip AFFX control	0
 6 | Log Train0	0
 7 | Log Train1	0
 8 | Log Test	0
 9 | Quantile Normalize	1
10 | Shift-Scale Normalize	0
11 | DWD Normalize	0
12 | DWD Normalize (Bild)	0
13 | Burn In	1000
14 | Samples	5000
15 | Skips	1
16 | Credible Interval	95
17 | Cross Validate	1
18 | Make Plots	1
19 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>Feature_GFP_TPMlog_10_6.txt</SPAN> and <SPAN STYLE=background-color:yellow>Feature_HER2_TPMlog_10_6.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_PANCAN20_Rsubread_BRCA_TPMlog_10_9_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
12 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 | </UL>
14 | <P>
15 | <H3>II.  Results</H3>
16 | <TABLE BORDER=0 CELLSPACING=10>
17 | <TR>
18 | <TD COLSPAN=2 ALIGN=CENTER>
19 | <H3>200 Genes, 2 Metagenes</H3>
20 | </TD>
21 | </TR>
22 | <TR>
23 | <TD>
24 | <A HREF=signature.png>
25 | <IMG HEIGHT=480 SRC=signature.png>
26 | </A>
27 | </TD><TD>
28 | <A HREF=predictions.png>
29 | <IMG HEIGHT=480 SRC=predictions.png>
30 | </A>
31 | </TD>
32 | </TR>
33 | <TR>
34 | <TD VALIGN=TOP>
35 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 5 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | </TD><TD VALIGN=TOP>
37 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
38 | probabilities.txt
39 | </A>.
40 | </TD>
41 | </TR>
42 | </TABLE>
43 | <P>
44 | <HR>
45 | <EM>This analysis was run on Monday, 13 October 2014, 11:47 PM on adira.genetics.utah.edu.  It took 11m 41s to complete.
46 | </EM>
47 | </BODY>
48 | </HTML>
49 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/model.txt:
--------------------------------------------------------------------------------
  1 | Name	Coefficient
  2 | Intercept	-0.504928
  3 | ERBB2	0.305527
  4 | HSPA6	-0.158780
  5 | HSPA7	-0.151412
  6 | CCL2	-0.106984
  7 | DNAJA4	-0.093340
  8 | TNFAIP2	-0.075825
  9 | HSPA1A	-0.073306
 10 | EEF1A2	0.071440
 11 | PDGFB	0.067870
 12 | EPGN	-0.067303
 13 | HSPA1B	-0.066745
 14 | ATP6V0A4	0.062446
 15 | CFB	-0.060075
 16 | CALB2	0.058290
 17 | CRYAB	-0.054796
 18 | SAA2	-0.050794
 19 | PNMA2	0.050400
 20 | KRT80	0.050203
 21 | TNFRSF11B	0.048283
 22 | UCA1	0.046302
 23 | CXCL5	-0.045923
 24 | ANGPTL7	-0.044990
 25 | KPRP	0.044522
 26 | SOD2	-0.044234
 27 | SYTL5	0.043949
 28 | KRT19	0.043441
 29 | AKAP12	0.043351
 30 | SRMS	0.042485
 31 | PADI1	0.042177
 32 | GPR1	-0.041418
 33 | RGS2	-0.041195
 34 | MYADM	0.040819
 35 | SHC4	0.040550
 36 | BST2	-0.039644
 37 | EPHA3	-0.039500
 38 | KLK6	0.038871
 39 | KRT18	0.038599
 40 | SAA1	-0.038474
 41 | SPON1	0.038178
 42 | HSP90AA1	-0.038082
 43 | TSPAN18	0.037454
 44 | EPHA4	-0.037243
 45 | ANGPTL4	0.036491
 46 | PAQR7	-0.036256
 47 | ULBP1	-0.035505
 48 | HSPH1	-0.035296
 49 | PGM2L1	0.035069
 50 | CRHR1	0.034918
 51 | SERPINB13	-0.034840
 52 | PIK3C2B	-0.034825
 53 | PTK6	0.034722
 54 | CXCR1	0.034384
 55 | FAM198B	-0.034254
 56 | GRAMD2	-0.034033
 57 | DDAH1	0.033964
 58 | GPRC5A	0.033659
 59 | DAPK1	-0.033620
 60 | SLC1A1	0.033565
 61 | VWA1	0.033251
 62 | DNAJA1	-0.032433
 63 | SNX9	0.032379
 64 | KITLG	-0.032252
 65 | HSPB8	-0.032155
 66 | GBP6	-0.031284
 67 | C10orf10	0.030517
 68 | CCNA1	0.030310
 69 | GM2A	-0.030108
 70 | C8orf84	0.029972
 71 | ALDH1A3	0.029680
 72 | TRIM22	-0.029548
 73 | SREK1IP1	0.029351
 74 | KRT8	0.029074
 75 | NOTCH1	-0.028721
 76 | DNAJB4	-0.028676
 77 | FERMT2	0.027438
 78 | EMP1	0.027141
 79 | MAFF	0.026901
 80 | TCF4	-0.026670
 81 | DNAJB1	-0.026460
 82 | PARP14	-0.026319
 83 | PLAUR	0.026168
 84 | LOC644961	0.026082
 85 | KHDRBS3	0.025650
 86 | PLAU	0.025228
 87 | KANK4	-0.025090
 88 | ESR1	-0.024670
 89 | APOL6	-0.024617
 90 | KCNN4	0.024463
 91 | IGFL3	-0.024452
 92 | MTSS1L	-0.024210
 93 | RAPH1	0.024168
 94 | IFIT5	-0.024094
 95 | DUSP10	0.024043
 96 | PMP22	0.023801
 97 | VASP	0.023373
 98 | ARRDC4	-0.023118
 99 | SMO	-0.023104
100 | FAM176A	0.022803
101 | CBR1	-0.022764
102 | WWTR1	0.022599
103 | PGF	0.022576
104 | STX2	0.022286
105 | ZPLD1	0.022175
106 | KMO	-0.022123
107 | FAM214B	0.021843
108 | TUFT1	0.021717
109 | TNS3	-0.021558
110 | MAP6	0.021499
111 | ST3GAL4	0.021422
112 | HMGB3	0.021401
113 | HS6ST1	-0.021304
114 | DLC1	-0.021275
115 | POU2F1	0.021216
116 | APAF1	0.021057
117 | STOX2	-0.020845
118 | RASA3	0.020767
119 | HERC3	0.020487
120 | DFNB31	-0.020337
121 | FBXO22	-0.020150
122 | BRMS1	-0.020097
123 | IER3	0.020017
124 | NET1	0.019989
125 | CYBASC3	-0.019984
126 | PYGB	0.019830
127 | XPC	-0.019811
128 | BCAR3	0.019647
129 | ZXDB	0.019586
130 | CELF2	0.019402
131 | IGF2BP3	0.019325
132 | TIMP1	-0.019048
133 | ARHGAP12	0.019010
134 | NME7	0.018951
135 | ARV1	-0.018928
136 | CASP1	-0.018873
137 | MR1	-0.018826
138 | KCNJ5	-0.018762
139 | LRRC8C	0.018716
140 | TWF2	0.018592
141 | PPP3CC	0.018547
142 | ANKRD33B	-0.018542
143 | CAST	0.018294
144 | SH3KBP1	0.017947
145 | PODXL2	0.017847
146 | INPP4B	0.017676
147 | TNS4	0.017660
148 | DAB2	0.017551
149 | MFI2	0.017540
150 | RBMS2	0.017501
151 | FGFR2	-0.017469
152 | GFPT1	0.017427
153 | TP53AIP1	-0.017304
154 | NAV3	0.017121
155 | ARHGEF2	0.017063
156 | SESN1	-0.016845
157 | DNAJB9	0.016278
158 | NFE2L1	-0.016229
159 | TRIOBP	0.016197
160 | KIAA1671	-0.016057
161 | ZNFX1	-0.015835
162 | CROT	-0.015664
163 | SLC20A2	0.015334
164 | B2M	-0.015314
165 | UBB	-0.015001
166 | FBXW2	-0.014918
167 | LDB1	-0.014863
168 | SEC24D	0.014746
169 | MICALCL	0.014702
170 | MYO1E	0.014521
171 | RASSF1	0.014486
172 | TOR3A	-0.014460
173 | PIK3R1	-0.014459
174 | TRAFD1	-0.014282
175 | ANKRD13A	-0.014195
176 | SLC41A1	-0.014065
177 | MEF2D	0.013983
178 | PI4KB	-0.013683
179 | LRRFIP1	0.013638
180 | PRRC1	0.013535
181 | FRMD4A	-0.012667
182 | PNMAL1	-0.012235
183 | LPP	0.011861
184 | CAPN2	0.011646
185 | ADAR	-0.011625
186 | PRDM4	-0.011432
187 | APBB2	-0.011350
188 | SEC14L1	0.011315
189 | UBP1	-0.010824
190 | ASAP2	0.010731
191 | PRPSAP2	-0.010671
192 | PPP2R5B	0.010646
193 | NFATC3	-0.010535
194 | AFAP1	0.010482
195 | DCAF7	0.010296
196 | MYL12A	0.009901
197 | ARHGEF12	-0.009895
198 | STAT3	-0.009518
199 | ANKRD27	0.008986
200 | IFFO2	0.008553
201 | GTF2I	-0.008151
202 | CYB561	0.007650
203 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/parameters.txt:
--------------------------------------------------------------------------------
 1 | NAME	VALUE
 2 | Binreg Version	2
 3 | Genes	200
 4 | Metagenes	2
 5 | Strip AFFX control	0
 6 | Log Train0	0
 7 | Log Train1	0
 8 | Log Test	0
 9 | Quantile Normalize	1
10 | Shift-Scale Normalize	0
11 | DWD Normalize	0
12 | DWD Normalize (Bild)	0
13 | Burn In	1000
14 | Samples	5000
15 | Skips	1
16 | Credible Interval	95
17 | Cross Validate	1
18 | Make Plots	1
19 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>Feature_GFP_FPKMlog_10_6.txt</SPAN> and <SPAN STYLE=background-color:yellow>Feature_HER2_FPKMlog_10_6.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_PANCAN20_Rsubread_BRCA_RPKMlog_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I logged the expression values in Feature_GFP_FPKMlog_10_6.txt and Feature_HER2_FPKMlog_10_6.txt.
12 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
13 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
14 | </UL>
15 | <P>
16 | <H3>II.  Results</H3>
17 | <TABLE BORDER=0 CELLSPACING=10>
18 | <TR>
19 | <TD COLSPAN=2 ALIGN=CENTER>
20 | <H3>200 Genes, 2 Metagenes</H3>
21 | </TD>
22 | </TR>
23 | <TR>
24 | <TD>
25 | <A HREF=signature.png>
26 | <IMG HEIGHT=480 SRC=signature.png>
27 | </A>
28 | </TD><TD>
29 | <A HREF=predictions.png>
30 | <IMG HEIGHT=480 SRC=predictions.png>
31 | </A>
32 | </TD>
33 | </TR>
34 | <TR>
35 | <TD VALIGN=TOP>
36 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 5 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
37 | </TD><TD VALIGN=TOP>
38 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
39 | probabilities.txt
40 | </A>.
41 | </TD>
42 | </TR>
43 | </TABLE>
44 | <P>
45 | <HR>
46 | <EM>This analysis was run on Monday, 13 October 2014, 11:21 PM on adira.genetics.utah.edu.  It took 12m 57s to complete.
47 | </EM>
48 | </BODY>
49 | </HTML>
50 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/model.txt:
--------------------------------------------------------------------------------
  1 | Name	Coefficient
  2 | Intercept	0.168851
  3 | ERBB2	0.257577
  4 | HSPA7	-0.187866
  5 | HSPA6	-0.136333
  6 | GDF6	0.098740
  7 | DNAJA4	-0.080598
  8 | KPRP	0.074612
  9 | EEF1A2	0.069003
 10 | TNFAIP2	-0.067720
 11 | PDGFB	0.066514
 12 | TSPAN18	0.066512
 13 | HSPA1A	-0.062749
 14 | ATP6V0A4	0.058443
 15 | CFB	-0.058034
 16 | HSPA1B	-0.057605
 17 | EPGN	-0.057545
 18 | CALB2	0.054193
 19 | PNMA2	0.048449
 20 | SAA2	-0.047311
 21 | CRYAB	-0.046179
 22 | KRT80	0.045195
 23 | SRMS	0.043627
 24 | GPR1	-0.043320
 25 | UCA1	0.041757
 26 | TNFRSF11B	0.041583
 27 | FAM83A	0.040141
 28 | EPHA3	-0.039923
 29 | CXCL5	-0.039762
 30 | RGS2	-0.039724
 31 | DDAH1	0.039198
 32 | ULBP1	-0.038466
 33 | AKAP12	0.038418
 34 | SOD2	-0.037183
 35 | KRT19	0.036641
 36 | TLR3	-0.035985
 37 | SHC4	0.035642
 38 | PPP1R3C	-0.035295
 39 | PTK6	0.034658
 40 | SPON1	0.034473
 41 | MYADM	0.034361
 42 | BST2	-0.034136
 43 | GRAMD2	-0.034067
 44 | SAA1	-0.033523
 45 | HSP90AA1	-0.032999
 46 | KRT18	0.032801
 47 | EPHA4	-0.032767
 48 | PIK3C2B	-0.032631
 49 | KLK6	0.032407
 50 | CXCR1	0.031954
 51 | PGM2L1	0.031133
 52 | ANGPTL4	0.031075
 53 | PAQR7	-0.031038
 54 | DAPK1	-0.030705
 55 | FAM198B	-0.030230
 56 | SERPINB13	-0.030208
 57 | GBP6	-0.030003
 58 | VWA1	0.029805
 59 | SLC1A1	0.029764
 60 | HSPH1	-0.029464
 61 | KITLG	-0.028275
 62 | GPRC5A	0.027836
 63 | HSPB8	-0.027616
 64 | SNX9	0.027574
 65 | DNAJA1	-0.026591
 66 | C10orf10	0.026544
 67 | SREK1IP1	0.026213
 68 | GM2A	-0.026028
 69 | C8orf84	0.025904
 70 | CCNA1	0.025808
 71 | TRIM22	-0.025731
 72 | APOL6	-0.025483
 73 | KRT8	0.025158
 74 | DNAJB4	-0.025018
 75 | TCF4	-0.024505
 76 | NOTCH1	-0.024433
 77 | ALDH1A3	0.024322
 78 | MAFF	0.023981
 79 | PARP14	-0.023917
 80 | FERMT2	0.023615
 81 | IL7R	-0.023182
 82 | LOC644961	0.023169
 83 | KHDRBS3	0.022993
 84 | EMP1	0.022449
 85 | KMO	-0.022438
 86 | PLAUR	0.022023
 87 | DNAJB1	-0.022019
 88 | IFIT5	-0.021954
 89 | RAPH1	0.021690
 90 | KANK4	-0.021458
 91 | DUSP10	0.020861
 92 | SMO	-0.020834
 93 | DFNB31	-0.020759
 94 | MTSS1L	-0.020665
 95 | PLAU	0.020509
 96 | KCNN4	0.020505
 97 | PMP22	0.020330
 98 | STX2	0.020322
 99 | VASP	0.020230
100 | IGFL3	-0.020208
101 | POU2F1	0.020096
102 | WWTR1	0.019760
103 | FAM176A	0.019732
104 | PGF	0.019637
105 | ARRDC4	-0.019625
106 | TNS3	-0.019394
107 | CBR1	-0.019365
108 | RASA3	0.019126
109 | APAF1	0.018740
110 | HERC3	0.018697
111 | HMGB3	0.018691
112 | ZXDB	0.018650
113 | ST3GAL4	0.018588
114 | HS6ST1	-0.018541
115 | IGF2BP3	0.018523
116 | TUFT1	0.018493
117 | FAM214B	0.018467
118 | NET1	0.017866
119 | XPC	-0.017726
120 | FBXO22	-0.017678
121 | MR1	-0.017472
122 | CYBASC3	-0.017218
123 | KCNJ5	-0.017167
124 | IER3	0.017056
125 | NME7	0.016958
126 | PYGB	0.016808
127 | NAV3	0.016742
128 | BRMS1	-0.016648
129 | ARV1	-0.016434
130 | BCAR3	0.016403
131 | ARHGAP12	0.016383
132 | PPP3CC	0.016377
133 | PODXL2	0.016365
134 | PDZD2	-0.016253
135 | TWF2	0.016132
136 | RBMS2	0.016093
137 | CASP1	-0.015992
138 | TIMP1	-0.015829
139 | LRRC8C	0.015828
140 | SH3KBP1	0.015714
141 | CAST	0.015525
142 | TP53AIP1	-0.015300
143 | DAB2	0.015248
144 | FGFR2	-0.015210
145 | INPP4B	0.015146
146 | HMGN3	-0.015120
147 | SESN1	-0.014994
148 | TRIOBP	0.014970
149 | GFPT1	0.014771
150 | ARHGEF2	0.014671
151 | TNS4	0.014658
152 | MFI2	0.014631
153 | CROT	-0.014554
154 | KIAA1671	-0.013946
155 | ZNFX1	-0.013815
156 | DNAJB9	0.013602
157 | NFE2L1	-0.013277
158 | PIK3R1	-0.013264
159 | FBXW2	-0.013023
160 | RASSF1	0.012832
161 | MICALCL	0.012790
162 | SLC20A2	0.012767
163 | LDB1	-0.012706
164 | IGFBP4	-0.012603
165 | SEC24D	0.012592
166 | B2M	-0.012511
167 | CCDC50	0.012451
168 | SLC41A1	-0.012315
169 | TOR3A	-0.012280
170 | HERPUD1	0.012254
171 | TRAFD1	-0.012195
172 | MYO1E	0.012108
173 | MEF2D	0.012092
174 | FRMD4A	-0.011928
175 | LRRFIP1	0.011781
176 | ANKRD13A	-0.011763
177 | PI4KB	-0.011583
178 | PRRC1	0.011518
179 | UBB	-0.011513
180 | FAM129B	0.011441
181 | PNMAL1	-0.010498
182 | LPP	0.010416
183 | APBB2	-0.010189
184 | PRDM4	-0.010085
185 | ADAR	-0.010018
186 | SEC14L1	0.009938
187 | CAPN2	0.009793
188 | ASAP2	0.009678
189 | PPP2R5B	0.009550
190 | NFATC3	-0.009429
191 | PRPSAP2	-0.009416
192 | DCAF7	0.009216
193 | MEX3C	0.009174
194 | AFAP1	0.009148
195 | UBP1	-0.008794
196 | ARHGEF12	-0.008606
197 | SDC1	0.008466
198 | ADCY9	-0.008152
199 | STAT3	-0.008103
200 | ANKRD27	0.007958
201 | IFFO2	0.007081
202 | GTF2I	-0.006848
203 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/parameters.txt:
--------------------------------------------------------------------------------
 1 | NAME	VALUE
 2 | Binreg Version	2
 3 | Genes	200
 4 | Metagenes	2
 5 | Strip AFFX control	0
 6 | Log Train0	1
 7 | Log Train1	1
 8 | Log Test	0
 9 | Quantile Normalize	1
10 | Shift-Scale Normalize	0
11 | DWD Normalize	0
12 | DWD Normalize (Bild)	0
13 | Burn In	1000
14 | Samples	5000
15 | Skips	1
16 | Credible Interval	95
17 | Cross Validate	1
18 | Make Plots	1
19 | 


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>Feature_GFP_FPKMlog_10_6.txt</SPAN> and <SPAN STYLE=background-color:yellow>Rsub_HER2_FPKMlog.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_PANCAN20_Rsubread_BRCA_RPKMlog_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
12 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 | </UL>
14 | <P>
15 | <H3>II.  Results</H3>
16 | <TABLE BORDER=0 CELLSPACING=10>
17 | <TR>
18 | <TD COLSPAN=2 ALIGN=CENTER>
19 | <H3>200 Genes, 2 Metagenes</H3>
20 | </TD>
21 | </TR>
22 | <TR>
23 | <TD>
24 | <A HREF=signature.png>
25 | <IMG HEIGHT=480 SRC=signature.png>
26 | </A>
27 | </TD><TD>
28 | <A HREF=predictions.png>
29 | <IMG HEIGHT=480 SRC=predictions.png>
30 | </A>
31 | </TD>
32 | </TR>
33 | <TR>
34 | <TD VALIGN=TOP>
35 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 3 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | </TD><TD VALIGN=TOP>
37 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
38 | probabilities.txt
39 | </A>.
40 | </TD>
41 | </TR>
42 | </TABLE>
43 | <P>
44 | <HR>
45 | <EM>This analysis was run on Saturday, 02 May 2015, 02:43 PM on adira.genetics.utah.edu.  It took 23m 34s to complete.
46 | </EM>
47 | </BODY>
48 | </HTML>
49 | 


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>GFP_RSEM_log_10_9_filtered.txt</SPAN> and <SPAN STYLE=background-color:yellow>HER2_RSEM_3_log.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_RSEM_log_BRCA_10_9_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
12 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 | </UL>
14 | <P>
15 | <H3>II.  Results</H3>
16 | <TABLE BORDER=0 CELLSPACING=10>
17 | <TR>
18 | <TD COLSPAN=2 ALIGN=CENTER>
19 | <H3>200 Genes, 2 Metagenes</H3>
20 | </TD>
21 | </TR>
22 | <TR>
23 | <TD>
24 | <A HREF=signature.png>
25 | <IMG HEIGHT=480 SRC=signature.png>
26 | </A>
27 | </TD><TD>
28 | <A HREF=predictions.png>
29 | <IMG HEIGHT=480 SRC=predictions.png>
30 | </A>
31 | </TD>
32 | </TR>
33 | <TR>
34 | <TD VALIGN=TOP>
35 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 3 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | </TD><TD VALIGN=TOP>
37 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
38 | probabilities.txt
39 | </A>.
40 | </TD>
41 | </TR>
42 | </TABLE>
43 | <P>
44 | <HR>
45 | <EM>This analysis was run on Friday, 01 May 2015, 11:17 AM on adira.genetics.utah.edu.  It took 13m 38s to complete.
46 | </EM>
47 | </BODY>
48 | </HTML>
49 | 


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem_no/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>GFP_RSEM_log_10_9_filtered.txt</SPAN> and <SPAN STYLE=background-color:yellow>HER2_RSEM_3_log.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_RSEM_log_BRCA_10_9_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
12 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 | </UL>
14 | <P>
15 | <H3>II.  Results</H3>
16 | <TABLE BORDER=0 CELLSPACING=10>
17 | <TR>
18 | <TD COLSPAN=2 ALIGN=CENTER>
19 | <H3>200 Genes, 2 Metagenes</H3>
20 | </TD>
21 | </TR>
22 | <TR>
23 | <TD>
24 | <A HREF=signature.png>
25 | <IMG HEIGHT=480 SRC=signature.png>
26 | </A>
27 | </TD><TD>
28 | <A HREF=predictions.png>
29 | <IMG HEIGHT=480 SRC=predictions.png>
30 | </A>
31 | </TD>
32 | </TR>
33 | <TR>
34 | <TD VALIGN=TOP>
35 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 3 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | </TD><TD VALIGN=TOP>
37 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
38 | probabilities.txt
39 | </A>.
40 | </TD>
41 | </TR>
42 | </TABLE>
43 | <P>
44 | <HR>
45 | <EM>This analysis was run on Saturday, 02 May 2015, 04:04 PM on adira.genetics.utah.edu.  It took 17m 54s to complete.
46 | </EM>
47 | </BODY>
48 | </HTML>
49 | 


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem_no/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem_no/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem_no/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem_no/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/tpmlog_no/REPORT.html:
--------------------------------------------------------------------------------
 1 | <HTML>
 2 | <HEAD>
 3 | <TITLE>CreateSignatures Report</TITLE>
 4 | </HEAD>
 5 | <BODY>
 6 | <CENTER><H1><EM>CreateSignatures</EM> Report</H1></CENTER>
 7 | <H3>I.  Analysis</H3>
 8 | <UL>
 9 | <LI>I ran a signature analysis using a training set of <SPAN STYLE=background-color:yellow>Feature_GFP_TPMlog_10_6.txt</SPAN> and <SPAN STYLE=background-color:yellow>Rsub_HER2_TPMlog.txt</SPAN>.  I generated predictions on <SPAN STYLE=background-color:yellow>TCGA_PANCAN20_Rsubread_BRCA_TPMlog_10_9_filtered.txt</SPAN>.
10 | <LI>I used the <SPAN STYLE=background-color:yellow>BinReg 2</SPAN> algorithm with <SPAN STYLE=background-color:yellow>200 genes</SPAN> and <SPAN STYLE=background-color:yellow>2 metagenes</SPAN>.
11 | <LI>I applied <SPAN STYLE=background-color:yellow>quantile</SPAN> normalization.
12 | <LI>For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 | </UL>
14 | <P>
15 | <H3>II.  Results</H3>
16 | <TABLE BORDER=0 CELLSPACING=10>
17 | <TR>
18 | <TD COLSPAN=2 ALIGN=CENTER>
19 | <H3>200 Genes, 2 Metagenes</H3>
20 | </TD>
21 | </TR>
22 | <TR>
23 | <TD>
24 | <A HREF=signature.png>
25 | <IMG HEIGHT=480 SRC=signature.png>
26 | </A>
27 | </TD><TD>
28 | <A HREF=predictions.png>
29 | <IMG HEIGHT=480 SRC=predictions.png>
30 | </A>
31 | </TD>
32 | </TR>
33 | <TR>
34 | <TD VALIGN=TOP>
35 | <B>Figure 1: Signature Heatmap. </B>In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the <EM>train 0</EM> data set, and the remaining 3 columns are the samples from the <EM>train 1</EM> data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | </TD><TD VALIGN=TOP>
37 | <B>Figure 2: Predictions.</B> This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.<P>The raw values from this plot are available as a tab-delimited text file: <A HREF=probabilities.txt>
38 | probabilities.txt
39 | </A>.
40 | </TD>
41 | </TR>
42 | </TABLE>
43 | <P>
44 | <HR>
45 | <EM>This analysis was run on Saturday, 02 May 2015, 03:35 PM on adira.genetics.utah.edu.  It took 21m 33s to complete.
46 | </EM>
47 | </BODY>
48 | </HTML>
49 | 


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/tpmlog_no/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/tpmlog_no/predictions.png


--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/tpmlog_no/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/tpmlog_no/signature.png


--------------------------------------------------------------------------------
/Analysis_datasets/Classification_12_LUAD_LUSC_Predictions.txt:
--------------------------------------------------------------------------------
  1 | SampleID	ActualClass	PredictedClass	LUAD_Probability	LUSC_Probability
  2 | TCGA-05-4244-01A-01R-1107-07	LUAD	LUAD	0.948	0.052
  3 | TCGA-05-4249-01A-01R-1107-07	LUAD	LUAD	0.946	0.054
  4 | TCGA-05-4250-01A-01R-1107-07	LUAD	LUAD	0.924	0.076
  5 | TCGA-05-4382-01A-01R-1206-07	LUAD	LUAD	0.94	0.06
  6 | TCGA-05-4384-01A-01R-1755-07	LUAD	LUAD	0.994	0.006
  7 | TCGA-05-4389-01A-01R-1206-07	LUAD	LUAD	0.86	0.14
  8 | TCGA-05-4390-01A-02R-1755-07	LUAD	LUAD	0.918	0.082
  9 | TCGA-05-4395-01A-01R-1206-07	LUAD	LUAD	0.776	0.224
 10 | TCGA-05-4396-01A-21R-1858-07	LUAD	LUAD	0.94	0.06
 11 | TCGA-05-4397-01A-01R-1206-07	LUAD	LUAD	0.632	0.368
 12 | TCGA-05-4398-01A-01R-1206-07	LUAD	LUAD	0.914	0.086
 13 | TCGA-05-4402-01A-01R-1206-07	LUAD	LUAD	0.968	0.032
 14 | TCGA-05-4403-01A-01R-1206-07	LUAD	LUAD	0.934	0.066
 15 | TCGA-05-4405-01A-21R-1858-07	LUAD	LUAD	0.968	0.032
 16 | TCGA-05-4410-01A-21R-1858-07	LUAD	LUAD	0.958	0.042
 17 | TCGA-05-4415-01A-22R-1858-07	LUAD	LUAD	0.714	0.286
 18 | TCGA-05-4417-01A-22R-1858-07	LUAD	LUAD	0.976	0.024
 19 | TCGA-05-4418-01A-01R-1206-07	LUAD	LUAD	0.85	0.15
 20 | TCGA-05-4420-01A-01R-1206-07	LUAD	LUAD	0.858	0.142
 21 | TCGA-05-4422-01A-01R-1206-07	LUAD	LUAD	0.938	0.062
 22 | TCGA-05-4424-01A-22R-1858-07	LUAD	LUAD	0.9	0.1
 23 | TCGA-05-4425-01A-01R-1755-07	LUAD	LUAD	0.952	0.048
 24 | TCGA-05-4426-01A-01R-1206-07	LUAD	LUAD	0.94	0.06
 25 | TCGA-05-4427-01A-21R-1858-07	LUAD	LUAD	0.914	0.086
 26 | TCGA-05-4430-01A-02R-1206-07	LUAD	LUAD	0.938	0.062
 27 | TCGA-05-4432-01A-01R-1206-07	LUAD	LUAD	0.932	0.068
 28 | TCGA-05-4433-01A-22R-1858-07	LUAD	LUAD	0.976	0.024
 29 | TCGA-05-4434-01A-01R-1206-07	LUAD	LUAD	0.776	0.224
 30 | TCGA-05-5420-01A-01R-1628-07	LUAD	LUAD	0.78	0.22
 31 | TCGA-05-5423-01A-01R-1628-07	LUAD	LUAD	0.952	0.048
 32 | TCGA-05-5425-01A-02R-1628-07	LUAD	LUAD	0.928	0.072
 33 | TCGA-05-5428-01A-01R-1628-07	LUAD	LUAD	0.834	0.166
 34 | TCGA-05-5429-01A-01R-1628-07	LUAD	LUAD	0.854	0.146
 35 | TCGA-05-5715-01A-01R-1628-07	LUAD	LUAD	0.96	0.04
 36 | TCGA-35-3615-01A-01R-0946-07	LUAD	LUAD	0.984	0.016
 37 | TCGA-35-4122-01A-01R-1107-07	LUAD	LUAD	0.748	0.252
 38 | TCGA-35-4123-01A-01R-1107-07	LUAD	LUAD	0.818	0.182
 39 | TCGA-35-5375-01A-01R-1628-07	LUAD	LUSC	0.44	0.56
 40 | TCGA-38-4625-01A-01R-1206-07	LUAD	LUAD	0.766	0.234
 41 | TCGA-38-4626-01A-01R-1206-07	LUAD	LUAD	0.964	0.036
 42 | TCGA-38-4627-01A-01R-1206-07	LUAD	LUAD	0.89	0.11
 43 | TCGA-38-4628-01A-01R-1206-07	LUAD	LUAD	0.896	0.104
 44 | TCGA-38-4629-01A-02R-1206-07	LUAD	LUAD	0.878	0.122
 45 | TCGA-38-4630-01A-01R-1206-07	LUAD	LUAD	0.52	0.48
 46 | TCGA-38-4631-01A-01R-1755-07	LUAD	LUAD	0.744	0.256
 47 | TCGA-38-4632-01A-01R-1755-07	LUAD	LUAD	0.888	0.112
 48 | TCGA-38-6178-01A-11R-1755-07	LUAD	LUAD	0.948	0.052
 49 | TCGA-38-7271-01A-11R-2039-07	LUAD	LUAD	0.944	0.056
 50 | TCGA-44-2655-01A-01R-0946-07	LUAD	LUAD	0.992	0.008
 51 | TCGA-44-2656-01A-02R-0946-07	LUAD	LUAD	0.988	0.012
 52 | TCGA-44-2657-01A-01R-1107-07	LUAD	LUAD	0.97	0.03
 53 | TCGA-44-2659-01A-01R-0946-07	LUAD	LUAD	0.976	0.024
 54 | TCGA-44-2661-01A-01R-1107-07	LUAD	LUAD	0.972	0.028
 55 | TCGA-44-2662-01A-01R-0946-07	LUAD	LUAD	0.926	0.074
 56 | TCGA-44-2665-01A-01R-0946-07	LUAD	LUAD	0.894	0.106
 57 | TCGA-44-2666-01A-01R-0946-07	LUAD	LUAD	0.978	0.022
 58 | TCGA-44-2668-01A-01R-0946-07	LUAD	LUAD	0.83	0.17
 59 | TCGA-44-3396-01A-01R-1206-07	LUAD	LUAD	0.946	0.054
 60 | TCGA-44-3398-01A-01R-1107-07	LUAD	LUAD	0.822	0.178
 61 | TCGA-44-3918-01A-01R-1107-07	LUAD	LUAD	0.934	0.066
 62 | TCGA-44-3919-01A-02R-1107-07	LUAD	LUAD	0.978	0.022
 63 | TCGA-44-4112-01A-01R-1107-07	LUAD	LUAD	0.944	0.056
 64 | TCGA-44-5643-01A-01R-1628-07	LUAD	LUSC	0.04	0.96
 65 | TCGA-44-5644-01A-21R-2039-07	LUAD	LUAD	0.838	0.162
 66 | TCGA-44-5645-01A-01R-1628-07	LUAD	LUAD	0.96	0.04
 67 | TCGA-44-6145-01A-11R-1755-07	LUAD	LUAD	0.982	0.018
 68 | TCGA-44-6146-01A-11R-1755-07	LUAD	LUAD	0.93	0.07
 69 | TCGA-44-6147-01A-11R-1755-07	LUAD	LUAD	0.946	0.054
 70 | TCGA-44-6148-01A-11R-1755-07	LUAD	LUAD	0.864	0.136
 71 | TCGA-44-6774-01A-21R-1858-07	LUAD	LUAD	0.898	0.102
 72 | TCGA-44-6775-01A-11R-1858-07	LUAD	LUAD	0.972	0.028
 73 | TCGA-44-6776-01A-11R-1858-07	LUAD	LUAD	0.976	0.024
 74 | TCGA-44-6777-01A-11R-1858-07	LUAD	LUAD	0.946	0.054
 75 | TCGA-44-6778-01A-11R-1858-07	LUAD	LUAD	0.878	0.122
 76 | TCGA-44-6779-01A-11R-1858-07	LUAD	LUAD	0.87	0.13
 77 | TCGA-44-7659-01A-11R-2066-07	LUAD	LUAD	0.98	0.02
 78 | TCGA-44-7660-01A-11R-2066-07	LUAD	LUSC	0.46	0.54
 79 | TCGA-44-7661-01A-11R-2066-07	LUAD	LUAD	0.906	0.094
 80 | TCGA-44-7662-01A-11R-2066-07	LUAD	LUAD	0.938	0.062
 81 | TCGA-44-7667-01A-31R-2066-07	LUAD	LUAD	0.568	0.432
 82 | TCGA-44-7669-01A-21R-2066-07	LUAD	LUAD	0.816	0.184
 83 | TCGA-44-7670-01A-11R-2066-07	LUAD	LUAD	0.744	0.256
 84 | TCGA-44-7671-01A-11R-2066-07	LUAD	LUAD	0.976	0.024
 85 | TCGA-44-7672-01A-11R-2066-07	LUAD	LUAD	0.988	0.012
 86 | TCGA-44-8117-01A-11R-2241-07	LUAD	LUAD	0.918	0.082
 87 | TCGA-44-8119-01A-11R-2241-07	LUAD	LUAD	0.878	0.122
 88 | TCGA-44-8120-01A-11R-2241-07	LUAD	LUAD	0.976	0.024
 89 | TCGA-49-4486-01A-01R-1206-07	LUAD	LUAD	0.938	0.062
 90 | TCGA-49-4487-01A-21R-1858-07	LUAD	LUAD	0.904	0.096
 91 | TCGA-49-4488-01A-01R-1755-07	LUAD	LUAD	0.918	0.082
 92 | TCGA-49-4490-01A-21R-1858-07	LUAD	LUAD	0.892	0.108
 93 | TCGA-49-4494-01A-01R-1206-07	LUAD	LUAD	0.89	0.11
 94 | TCGA-49-4501-01A-01R-1206-07	LUAD	LUAD	0.976	0.024
 95 | TCGA-49-4505-01A-01R-1206-07	LUAD	LUAD	0.978	0.022
 96 | TCGA-49-4506-01A-01R-1206-07	LUAD	LUAD	0.698	0.302
 97 | TCGA-49-4507-01A-01R-1206-07	LUAD	LUAD	0.798	0.202
 98 | TCGA-49-4510-01A-01R-1206-07	LUAD	LUAD	0.966	0.034
 99 | TCGA-49-4512-01A-21R-1858-07	LUAD	LUAD	0.958	0.042
100 | TCGA-49-4514-01A-21R-1858-07	LUAD	LUAD	0.824	0.176
101 | TCGA-49-6742-01A-11R-1858-07	LUAD	LUAD	0.954	0.046
102 | TCGA-49-6743-01A-11R-1858-07	LUAD	LUAD	0.868	0.132
103 | TCGA-49-6744-01A-11R-1858-07	LUAD	LUAD	0.998	0.002
104 | TCGA-49-6745-01A-11R-1858-07	LUAD	LUAD	0.956	0.044
105 | TCGA-49-6761-01A-31R-1949-07	LUAD	LUAD	0.84	0.16
106 | TCGA-49-6767-01A-11R-1858-07	LUAD	LUAD	0.808	0.192
107 | TCGA-50-5044-01A-21R-1858-07	LUAD	LUAD	0.768	0.232
108 | TCGA-50-5049-01A-01R-1628-07	LUAD	LUAD	0.922	0.078
109 | TCGA-50-5051-01A-21R-1858-07	LUAD	LUAD	0.912	0.088
110 | TCGA-50-5055-01A-01R-1628-07	LUAD	LUAD	0.912	0.088
111 | TCGA-50-5066-01A-01R-1628-07	LUAD	LUAD	0.762	0.238
112 | TCGA-50-5066-02A-11R-2090-07	LUAD	LUAD	0.938	0.062
113 | TCGA-50-5068-01A-01R-1628-07	LUAD	LUAD	0.856	0.144
114 | TCGA-50-5072-01A-21R-1858-07	LUAD	LUAD	0.848	0.152
115 | TCGA-50-5931-01A-11R-1755-07	LUAD	LUSC	0.226	0.774
116 | TCGA-50-5932-01A-11R-1755-07	LUAD	LUAD	0.954	0.046
117 | TCGA-50-5933-01A-11R-1755-07	LUAD	LUAD	0.89	0.11
118 | TCGA-50-5935-01A-11R-1755-07	LUAD	LUAD	0.984	0.016
119 | TCGA-50-5936-01A-11R-1628-07	LUAD	LUAD	0.95	0.05
120 | TCGA-50-5939-01A-11R-1628-07	LUAD	LUAD	0.918	0.082
121 | TCGA-50-5941-01A-11R-1755-07	LUAD	LUAD	0.962	0.038
122 | TCGA-50-5942-01A-21R-1755-07	LUAD	LUAD	0.974	0.026
123 | TCGA-50-5944-01A-11R-1755-07	LUAD	LUAD	0.99	0.01
124 | TCGA-50-5946-01A-11R-1755-07	LUAD	LUAD	0.854	0.146
125 | TCGA-50-5946-02A-11R-2090-07	LUAD	LUAD	0.904	0.096
126 | TCGA-50-6590-01A-12R-1858-07	LUAD	LUAD	0.682	0.318
127 | TCGA-50-6591-01A-11R-1755-07	LUAD	LUAD	0.632	0.368
128 | TCGA-50-6592-01A-11R-1755-07	LUAD	LUAD	0.792	0.208
129 | TCGA-50-6593-01A-11R-1755-07	LUAD	LUAD	0.97	0.03
130 | TCGA-50-6594-01A-11R-1755-07	LUAD	LUAD	0.832	0.168
131 | TCGA-50-6595-01A-12R-1858-07	LUAD	LUAD	0.74	0.26
132 | TCGA-50-6597-01A-11R-1858-07	LUAD	LUAD	0.87	0.13
133 | TCGA-50-6673-01A-11R-1949-07	LUAD	LUAD	0.934	0.066
134 | TCGA-50-7109-01A-11R-2039-07	LUAD	LUAD	0.95	0.05
135 | TCGA-53-7624-01A-11R-2066-07	LUAD	LUAD	0.576	0.424
136 | TCGA-53-7626-01A-12R-2066-07	LUAD	LUAD	0.986	0.014
137 | TCGA-53-7813-01A-11R-2170-07	LUAD	LUAD	0.952	0.048
138 | TCGA-55-1592-01A-01R-0946-07	LUAD	LUAD	0.96	0.04
139 | TCGA-55-1594-01A-01R-0946-07	LUAD	LUAD	0.79	0.21
140 | TCGA-55-1595-01A-01R-0946-07	LUAD	LUAD	0.968	0.032
141 | TCGA-55-1596-01A-01R-0946-07	LUAD	LUAD	0.844	0.156
142 | TCGA-55-5899-01A-11R-1628-07	LUAD	LUAD	0.73	0.27
143 | TCGA-55-6543-01A-11R-1755-07	LUAD	LUAD	0.98	0.02
144 | TCGA-55-6642-01A-11R-1858-07	LUAD	LUAD	0.962	0.038
145 | TCGA-55-6712-01A-11R-1858-07	LUAD	LUAD	0.944	0.056
146 | TCGA-55-6968-01A-11R-1949-07	LUAD	LUAD	0.544	0.456
147 | TCGA-55-6969-01A-11R-1949-07	LUAD	LUAD	0.84	0.16
148 | TCGA-55-6970-01A-11R-1949-07	LUAD	LUAD	0.97	0.03
149 | TCGA-55-6971-01A-11R-1949-07	LUAD	LUAD	0.964	0.036
150 | TCGA-55-6972-01A-11R-1949-07	LUAD	LUAD	0.924	0.076
151 | TCGA-55-6975-01A-11R-1949-07	LUAD	LUAD	0.902	0.098
152 | TCGA-55-6978-01A-11R-1949-07	LUAD	LUAD	0.91	0.09
153 | TCGA-55-6979-01A-11R-1949-07	LUAD	LUAD	0.946	0.054
154 | TCGA-55-6980-01A-11R-1949-07	LUAD	LUAD	0.968	0.032
155 | TCGA-55-6981-01A-11R-1949-07	LUAD	LUAD	0.95	0.05
156 | TCGA-55-6982-01A-11R-1949-07	LUAD	LUAD	0.97	0.03
157 | TCGA-55-6983-01A-11R-1949-07	LUAD	LUAD	0.988	0.012
158 | TCGA-55-6984-01A-11R-1949-07	LUAD	LUAD	0.888	0.112
159 | TCGA-55-6985-01A-11R-1949-07	LUAD	LUAD	0.964	0.036
160 | TCGA-55-6986-01A-11R-1949-07	LUAD	LUAD	0.978	0.022
161 | TCGA-55-6987-01A-11R-1949-07	LUAD	LUAD	0.95	0.05
162 | TCGA-55-7227-01A-11R-2039-07	LUAD	LUAD	0.992	0.008
163 | TCGA-55-7281-01A-11R-2039-07	LUAD	LUAD	0.954	0.046
164 | TCGA-55-7283-01A-11R-2039-07	LUAD	LUAD	0.99	0.01
165 | TCGA-55-7284-01B-11R-2241-07	LUAD	LUAD	0.974	0.026
166 | TCGA-55-7570-01A-11R-2039-07	LUAD	LUAD	0.58	0.42
167 | TCGA-55-7573-01A-11R-2039-07	LUAD	LUAD	0.982	0.018
168 | TCGA-55-7574-01A-11R-2039-07	LUAD	LUAD	0.98	0.02
169 | TCGA-55-7576-01A-11R-2066-07	LUAD	LUAD	0.97	0.03
170 | TCGA-55-7724-01A-11R-2170-07	LUAD	LUSC	0.228	0.772
171 | TCGA-55-7725-01A-11R-2170-07	LUAD	LUAD	0.936	0.064
172 | TCGA-55-7726-01A-11R-2170-07	LUAD	LUSC	0.368	0.632
173 | TCGA-55-7727-01A-11R-2170-07	LUAD	LUAD	0.944	0.056
174 | TCGA-55-7728-01A-11R-2187-07	LUAD	LUAD	0.886	0.114
175 | TCGA-55-7815-01A-11R-2170-07	LUAD	LUAD	0.848	0.152
176 | TCGA-55-7903-01A-11R-2170-07	LUAD	LUAD	0.966	0.034
177 | TCGA-55-7907-01A-11R-2170-07	LUAD	LUAD	0.956	0.044
178 | TCGA-55-7910-01A-11R-2170-07	LUAD	LUAD	0.916	0.084
179 | TCGA-55-7911-01A-11R-2170-07	LUAD	LUAD	0.966	0.034
180 | TCGA-55-7913-01B-11R-2241-07	LUAD	LUAD	0.904	0.096
181 | TCGA-55-7914-01A-11R-2170-07	LUAD	LUAD	0.956	0.044
182 | TCGA-55-7994-01A-11R-2187-07	LUAD	LUAD	0.906	0.094
183 | TCGA-55-7995-01A-11R-2187-07	LUAD	LUAD	0.756	0.244
184 | TCGA-55-8085-01A-11R-2241-07	LUAD	LUAD	0.97	0.03
185 | TCGA-55-8087-01A-11R-2241-07	LUAD	LUAD	0.976	0.024
186 | TCGA-55-8089-01A-11R-2241-07	LUAD	LUAD	0.902	0.098
187 | TCGA-55-8090-01A-11R-2241-07	LUAD	LUAD	0.96	0.04
188 | TCGA-55-8091-01A-11R-2241-07	LUAD	LUAD	0.948	0.052
189 | TCGA-55-8092-01A-11R-2241-07	LUAD	LUAD	0.924	0.076
190 | TCGA-55-8094-01A-11R-2241-07	LUAD	LUAD	0.842	0.158
191 | TCGA-55-8096-01A-11R-2241-07	LUAD	LUAD	0.948	0.052
192 | TCGA-55-8097-01A-11R-2241-07	LUAD	LUAD	0.986	0.014
193 | TCGA-55-8203-01A-11R-2241-07	LUAD	LUAD	0.992	0.008
194 | TCGA-55-8204-01A-11R-2241-07	LUAD	LUSC	0.184	0.816
195 | TCGA-55-8205-01A-11R-2241-07	LUAD	LUAD	0.922	0.078
196 | TCGA-55-8206-01A-11R-2241-07	LUAD	LUAD	0.97	0.03
197 | TCGA-55-8207-01A-11R-2241-07	LUAD	LUAD	0.976	0.024
198 | TCGA-55-8208-01A-11R-2241-07	LUAD	LUAD	0.954	0.046
199 | TCGA-55-8299-01A-11R-2287-07	LUAD	LUAD	0.952	0.048
200 | TCGA-55-8301-01A-11R-2287-07	LUAD	LUAD	0.956	0.044
201 | TCGA-64-1676-01A-01R-0946-07	LUAD	LUAD	0.836	0.164
202 | TCGA-64-1677-01A-01R-0946-07	LUAD	LUAD	0.89	0.11
203 | TCGA-64-1678-01A-01R-0946-07	LUAD	LUAD	0.7	0.3
204 | TCGA-64-1679-01A-21R-2066-07	LUAD	LUAD	0.904	0.096
205 | TCGA-64-1680-01A-02R-0946-07	LUAD	LUAD	0.966	0.034
206 | TCGA-64-1681-01A-11R-2066-07	LUAD	LUAD	0.968	0.032
207 | TCGA-64-5774-01A-01R-1628-07	LUAD	LUAD	0.938	0.062
208 | TCGA-64-5775-01A-01R-1628-07	LUAD	LUAD	0.688	0.312
209 | TCGA-64-5778-01A-01R-1628-07	LUAD	LUAD	0.936	0.064
210 | TCGA-64-5779-01A-01R-1628-07	LUAD	LUAD	0.908	0.092
211 | TCGA-64-5781-01A-01R-1628-07	LUAD	LUAD	0.808	0.192
212 | TCGA-64-5815-01A-01R-1628-07	LUAD	LUAD	0.872	0.128
213 | TCGA-67-3770-01A-01R-0946-07	LUAD	LUAD	0.94	0.06
214 | TCGA-67-3771-01A-01R-0946-07	LUAD	LUAD	0.81	0.19
215 | TCGA-67-3772-01A-01R-0946-07	LUAD	LUAD	0.954	0.046
216 | TCGA-67-3773-01A-01R-0946-07	LUAD	LUAD	0.94	0.06
217 | TCGA-67-3774-01A-01R-0946-07	LUAD	LUAD	0.978	0.022
218 | TCGA-67-4679-01B-01R-1755-07	LUAD	LUAD	0.988	0.012
219 | TCGA-67-6215-01A-11R-1755-07	LUAD	LUAD	0.974	0.026
220 | TCGA-67-6216-01A-11R-1755-07	LUAD	LUAD	0.98	0.02
221 | TCGA-67-6217-01A-11R-1755-07	LUAD	LUAD	0.982	0.018
222 | TCGA-69-7760-01A-11R-2170-07	LUAD	LUAD	0.904	0.096
223 | TCGA-69-7761-01A-11R-2170-07	LUAD	LUAD	0.938	0.062
224 | TCGA-69-7763-01A-11R-2170-07	LUAD	LUAD	0.96	0.04
225 | TCGA-69-7764-01A-11R-2170-07	LUAD	LUAD	0.974	0.026
226 | TCGA-69-7765-01A-11R-2170-07	LUAD	LUAD	0.98	0.02
227 | TCGA-69-7973-01A-11R-2187-07	LUAD	LUAD	0.948	0.052
228 | TCGA-69-7974-01A-11R-2187-07	LUAD	LUAD	0.982	0.018
229 | TCGA-69-7978-01A-11R-2187-07	LUAD	LUAD	0.958	0.042
230 | TCGA-69-7979-01A-11R-2187-07	LUAD	LUAD	0.772	0.228
231 | TCGA-69-7980-01A-11R-2187-07	LUAD	LUAD	0.956	0.044
232 | TCGA-69-8253-01A-11R-2287-07	LUAD	LUAD	0.958	0.042
233 | TCGA-69-8254-01A-11R-2287-07	LUAD	LUAD	0.982	0.018
234 | TCGA-69-8255-01A-11R-2287-07	LUAD	LUAD	0.7	0.3
235 | TCGA-71-6725-01A-11R-1858-07	LUAD	LUAD	0.934	0.066
236 | TCGA-73-4658-01A-01R-1755-07	LUAD	LUAD	0.968	0.032
237 | TCGA-73-4659-01A-01R-1206-07	LUAD	LUAD	0.976	0.024
238 | TCGA-73-4662-01A-01R-1206-07	LUAD	LUAD	0.99	0.01
239 | TCGA-73-4666-01A-01R-1206-07	LUAD	LUAD	0.852	0.148
240 | TCGA-73-4668-01A-01R-1206-07	LUAD	LUAD	0.94	0.06
241 | TCGA-73-4670-01A-01R-1206-07	LUAD	LUAD	0.81	0.19
242 | TCGA-73-4675-01A-01R-1206-07	LUAD	LUAD	0.898	0.102
243 | TCGA-73-4676-01A-01R-1755-07	LUAD	LUAD	0.886	0.114
244 | TCGA-73-4677-01A-01R-1206-07	LUAD	LUAD	0.982	0.018
245 | TCGA-73-7498-01A-12R-2187-07	LUAD	LUAD	0.996	0.004
246 | TCGA-73-7499-01A-11R-2187-07	LUAD	LUAD	0.872	0.128
247 | TCGA-75-5122-01A-01R-1755-07	LUAD	LUAD	0.88	0.12
248 | TCGA-75-5125-01A-01R-1755-07	LUAD	LUAD	0.896	0.104
249 | TCGA-75-5126-01A-01R-1755-07	LUAD	LUAD	0.93	0.07
250 | TCGA-75-5146-01A-01R-1628-07	LUAD	LUAD	0.944	0.056
251 | TCGA-75-5147-01A-01R-1628-07	LUAD	LUAD	0.87	0.13
252 | TCGA-75-6203-01A-11R-1755-07	LUAD	LUAD	0.952	0.048
253 | TCGA-75-6205-01A-11R-1755-07	LUAD	LUAD	0.892	0.108
254 | TCGA-75-6206-01A-11R-1755-07	LUAD	LUAD	0.96	0.04
255 | TCGA-75-6207-01A-11R-1755-07	LUAD	LUAD	0.884	0.116
256 | TCGA-75-6211-01A-11R-1755-07	LUAD	LUAD	0.838	0.162
257 | TCGA-75-6212-01A-11R-1755-07	LUAD	LUAD	0.968	0.032
258 | TCGA-75-6214-01A-41R-1949-07	LUAD	LUSC	0.24	0.76
259 | TCGA-75-7025-01A-12R-1949-07	LUAD	LUAD	0.952	0.048
260 | TCGA-75-7027-01A-11R-1949-07	LUAD	LUAD	0.9	0.1
261 | TCGA-75-7030-01A-11R-1949-07	LUAD	LUAD	0.906	0.094
262 | TCGA-75-7031-01A-11R-1949-07	LUAD	LUAD	0.912	0.088
263 | TCGA-78-7143-01A-11R-2039-07	LUAD	LUAD	0.99	0.01
264 | TCGA-78-7145-01A-11R-2039-07	LUAD	LUAD	0.972	0.028
265 | TCGA-78-7146-01A-11R-2039-07	LUAD	LUAD	0.8	0.2
266 | TCGA-78-7147-01A-11R-2039-07	LUAD	LUAD	0.94	0.06
267 | TCGA-78-7148-01A-11R-2039-07	LUAD	LUAD	0.994	0.006
268 | TCGA-78-7149-01A-11R-2039-07	LUAD	LUAD	0.974	0.026
269 | TCGA-78-7150-01A-21R-2039-07	LUAD	LUAD	0.824	0.176
270 | TCGA-78-7152-01A-11R-2039-07	LUAD	LUAD	0.95	0.05
271 | TCGA-78-7153-01A-11R-2039-07	LUAD	LUAD	0.956	0.044
272 | TCGA-78-7154-01A-11R-2039-07	LUAD	LUAD	0.788	0.212
273 | TCGA-78-7155-01A-11R-2039-07	LUAD	LUAD	0.594	0.406
274 | TCGA-78-7156-01A-11R-2039-07	LUAD	LUAD	0.96	0.04
275 | TCGA-78-7158-01A-11R-2039-07	LUAD	LUAD	0.966	0.034
276 | TCGA-78-7159-01A-11R-2039-07	LUAD	LUAD	0.978	0.022
277 | TCGA-78-7160-01A-11R-2039-07	LUAD	LUAD	0.978	0.022
278 | TCGA-78-7161-01A-11R-2039-07	LUAD	LUAD	0.978	0.022
279 | TCGA-78-7162-01A-21R-2066-07	LUAD	LUAD	0.974	0.026
280 | TCGA-78-7163-01A-12R-2066-07	LUAD	LUAD	0.908	0.092
281 | TCGA-78-7166-01A-12R-2066-07	LUAD	LUAD	0.946	0.054
282 | TCGA-78-7167-01A-11R-2066-07	LUAD	LUAD	0.974	0.026
283 | TCGA-78-7220-01A-11R-2039-07	LUAD	LUAD	0.826	0.174
284 | TCGA-78-7535-01A-11R-2066-07	LUAD	LUAD	0.87	0.13
285 | TCGA-78-7536-01A-11R-2066-07	LUAD	LUAD	0.83	0.17
286 | TCGA-78-7537-01A-11R-2066-07	LUAD	LUAD	0.982	0.018
287 | TCGA-78-7539-01A-11R-2066-07	LUAD	LUAD	0.974	0.026
288 | TCGA-78-7540-01A-11R-2066-07	LUAD	LUAD	0.954	0.046
289 | TCGA-78-7542-01A-21R-2066-07	LUAD	LUAD	0.626	0.374
290 | TCGA-78-7633-01A-11R-2066-07	LUAD	LUAD	0.976	0.024
291 | TCGA-80-5607-01A-31R-1949-07	LUAD	LUAD	0.968	0.032
292 | TCGA-80-5608-01A-31R-1949-07	LUAD	LUAD	0.974	0.026
293 | TCGA-80-5611-01A-01R-1628-07	LUAD	LUAD	0.872	0.128
294 | TCGA-83-5908-01A-21R-2287-07	LUAD	LUAD	0.856	0.144
295 | TCGA-86-6562-01A-11R-1755-07	LUAD	LUAD	0.988	0.012
296 | TCGA-86-6851-01A-11R-1949-07	LUAD	LUAD	0.924	0.076
297 | TCGA-86-7701-01A-11R-2170-07	LUAD	LUAD	0.932	0.068
298 | TCGA-86-7711-01A-11R-2066-07	LUAD	LUAD	0.726	0.274
299 | TCGA-86-7713-01A-11R-2066-07	LUAD	LUAD	0.91	0.09
300 | TCGA-86-7714-01A-12R-2170-07	LUAD	LUAD	0.97	0.03
301 | TCGA-86-7953-01A-11R-2187-07	LUAD	LUAD	0.9	0.1
302 | TCGA-86-7954-01A-11R-2187-07	LUAD	LUAD	0.954	0.046
303 | TCGA-86-7955-01A-11R-2187-07	LUAD	LUAD	0.878	0.122
304 | TCGA-86-8054-01A-11R-2241-07	LUAD	LUAD	0.846	0.154
305 | TCGA-86-8055-01A-11R-2241-07	LUAD	LUAD	0.956	0.044
306 | TCGA-86-8056-01A-11R-2241-07	LUAD	LUAD	0.97	0.03
307 | TCGA-86-8073-01A-11R-2241-07	LUAD	LUAD	0.912	0.088
308 | TCGA-86-8074-01A-11R-2241-07	LUAD	LUAD	0.984	0.016
309 | TCGA-86-8075-01A-11R-2241-07	LUAD	LUAD	0.956	0.044
310 | TCGA-86-8076-01A-31R-2241-07	LUAD	LUAD	0.988	0.012
311 | TCGA-86-8279-01A-11R-2287-07	LUAD	LUAD	0.944	0.056
312 | TCGA-86-8280-01A-11R-2287-07	LUAD	LUAD	0.986	0.014
313 | TCGA-86-8281-01A-11R-2287-07	LUAD	LUAD	0.99	0.01
314 | TCGA-91-6828-01A-11R-1858-07	LUAD	LUAD	0.97	0.03
315 | TCGA-91-6829-01A-21R-1858-07	LUAD	LUAD	0.928	0.072
316 | TCGA-91-6830-01A-11R-1949-07	LUAD	LUAD	0.916	0.084
317 | TCGA-91-6831-01A-11R-1858-07	LUAD	LUAD	0.894	0.106
318 | TCGA-91-6835-01A-11R-1858-07	LUAD	LUAD	0.936	0.064
319 | TCGA-91-6836-01A-21R-1858-07	LUAD	LUAD	0.832	0.168
320 | TCGA-91-6840-01A-11R-1949-07	LUAD	LUAD	0.798	0.202
321 | TCGA-91-6847-01A-11R-1949-07	LUAD	LUAD	0.736	0.264
322 | TCGA-91-6848-01A-11R-1949-07	LUAD	LUAD	0.606	0.394
323 | TCGA-91-6849-01A-11R-1949-07	LUAD	LUAD	0.974	0.026
324 | TCGA-91-7771-01A-11R-2170-07	LUAD	LUAD	0.99	0.01
325 | TCGA-93-7347-01A-11R-2187-07	LUAD	LUAD	0.99	0.01
326 | TCGA-93-7348-01A-21R-2039-07	LUAD	LUAD	0.978	0.022
327 | TCGA-93-8067-01A-11R-2287-07	LUAD	LUAD	0.91	0.09
328 | TCGA-95-7039-01A-11R-1949-07	LUAD	LUAD	0.928	0.072
329 | TCGA-95-7043-01A-11R-1949-07	LUAD	LUAD	0.786	0.214
330 | TCGA-95-7562-01A-11R-2241-07	LUAD	LUAD	0.87	0.13
331 | TCGA-95-7567-01A-11R-2066-07	LUAD	LUAD	0.966	0.034
332 | TCGA-95-7944-01A-11R-2187-07	LUAD	LUAD	0.776	0.224
333 | TCGA-95-7947-01A-11R-2187-07	LUAD	LUAD	0.834	0.166
334 | TCGA-95-7948-01A-11R-2187-07	LUAD	LUAD	0.828	0.172
335 | TCGA-95-8039-01A-11R-2241-07	LUAD	LUAD	0.998	0.002
336 | TCGA-97-7546-01A-11R-2039-07	LUAD	LUAD	0.966	0.034
337 | TCGA-97-7547-01A-11R-2039-07	LUAD	LUAD	0.958	0.042
338 | TCGA-97-7552-01A-11R-2039-07	LUAD	LUAD	0.97	0.03
339 | TCGA-97-7553-01A-21R-2039-07	LUAD	LUAD	0.978	0.022
340 | TCGA-97-7554-01A-11R-2039-07	LUAD	LUAD	0.972	0.028
341 | TCGA-97-7937-01A-11R-2170-07	LUAD	LUAD	0.98	0.02
342 | TCGA-97-7938-01A-11R-2170-07	LUAD	LUAD	0.978	0.022
343 | TCGA-97-7941-01A-11R-2187-07	LUAD	LUAD	0.968	0.032
344 | TCGA-97-8171-01A-11R-2287-07	LUAD	LUAD	0.888	0.112
345 | TCGA-97-8172-01A-11R-2287-07	LUAD	LUAD	0.984	0.016
346 | TCGA-97-8174-01A-11R-2287-07	LUAD	LUAD	0.932	0.068
347 | TCGA-97-8175-01A-11R-2287-07	LUAD	LUAD	0.956	0.044
348 | TCGA-97-8177-01A-11R-2287-07	LUAD	LUAD	0.96	0.04
349 | TCGA-97-8179-01A-11R-2287-07	LUAD	LUAD	0.97	0.03
350 | TCGA-99-7458-01A-11R-2039-07	LUAD	LUAD	0.984	0.016
351 | TCGA-99-8025-01A-11R-2241-07	LUAD	LUAD	0.958	0.042
352 | TCGA-99-8028-01A-11R-2241-07	LUAD	LUAD	0.972	0.028
353 | TCGA-99-8032-01A-11R-2241-07	LUAD	LUAD	0.958	0.042
354 | TCGA-99-8033-01A-11R-2241-07	LUAD	LUAD	0.944	0.056
355 | TCGA-J2-8192-01A-11R-2241-07	LUAD	LUAD	0.962	0.038
356 | TCGA-J2-8194-01A-11R-2241-07	LUAD	LUAD	0.984	0.016
357 | TCGA-18-3406-01A-01R-0980-07	LUSC	LUSC	0.102	0.898
358 | TCGA-18-3407-01A-01R-0980-07	LUSC	LUSC	0.024	0.976
359 | TCGA-18-3408-01A-01R-0980-07	LUSC	LUSC	0.22	0.78
360 | TCGA-18-3409-01A-01R-0980-07	LUSC	LUSC	0.26	0.74
361 | TCGA-18-3410-01A-01R-0980-07	LUSC	LUSC	0.18	0.82
362 | TCGA-18-3411-01A-01R-0980-07	LUSC	LUSC	0.028	0.972
363 | TCGA-18-3412-01A-01R-0980-07	LUSC	LUSC	0.024	0.976
364 | TCGA-18-3414-01A-01R-0980-07	LUSC	LUSC	0.024	0.976
365 | TCGA-18-3415-01A-01R-0980-07	LUSC	LUSC	0.02	0.98
366 | TCGA-18-3416-01A-01R-0980-07	LUSC	LUSC	0.128	0.872
367 | TCGA-18-3417-01A-01R-1443-07	LUSC	LUSC	0.156	0.844
368 | TCGA-18-3419-01A-01R-0980-07	LUSC	LUSC	0.14	0.86
369 | TCGA-18-3421-01A-01R-0980-07	LUSC	LUSC	0.086	0.914
370 | TCGA-18-4083-01A-01R-1100-07	LUSC	LUSC	0.052	0.948
371 | TCGA-18-4086-01A-01R-1100-07	LUSC	LUSC	0.004	0.996
372 | TCGA-18-4721-01A-01R-1443-07	LUSC	LUSC	0.072	0.928
373 | TCGA-18-5592-01A-01R-1635-07	LUSC	LUSC	0.048	0.952
374 | TCGA-18-5595-01A-01R-1635-07	LUSC	LUSC	0.076	0.924
375 | TCGA-21-1070-01A-01R-0692-07	LUSC	LUSC	0.118	0.882
376 | TCGA-21-1071-01A-01R-0692-07	LUSC	LUSC	0.11	0.89
377 | TCGA-21-1072-01A-01R-0692-07	LUSC	LUSC	0.042	0.958
378 | TCGA-21-1075-01A-01R-0692-07	LUSC	LUSC	0.05	0.95
379 | TCGA-21-1076-01A-02R-0692-07	LUSC	LUSC	0.138	0.862
380 | TCGA-21-1077-01A-01R-0692-07	LUSC	LUSC	0.01	0.99
381 | TCGA-21-1078-01A-01R-0692-07	LUSC	LUSC	0.444	0.556
382 | TCGA-21-1079-01A-01R-0692-07	LUSC	LUSC	0.132	0.868
383 | TCGA-21-1080-01A-01R-0692-07	LUSC	LUSC	0.056	0.944
384 | TCGA-21-1081-01A-01R-0692-07	LUSC	LUSC	0.098	0.902
385 | TCGA-21-1082-01A-01R-0692-07	LUSC	LUSC	0.062	0.938
386 | TCGA-21-1083-01A-01R-0692-07	LUSC	LUSC	0.246	0.754
387 | TCGA-21-5782-01A-01R-1635-07	LUSC	LUSC	0.128	0.872
388 | TCGA-21-5784-01A-01R-1635-07	LUSC	LUSC	0.086	0.914
389 | TCGA-21-5786-01A-01R-1635-07	LUSC	LUSC	0.04	0.96
390 | TCGA-21-5787-01A-01R-1635-07	LUSC	LUAD	0.662	0.338
391 | TCGA-22-0940-01A-01R-0692-07	LUSC	LUSC	0.16	0.84
392 | TCGA-22-0944-01A-01R-0692-07	LUSC	LUSC	0.028	0.972
393 | TCGA-22-1002-01A-01R-0692-07	LUSC	LUSC	0.204	0.796
394 | TCGA-22-1011-01A-01R-0692-07	LUSC	LUSC	0.072	0.928
395 | TCGA-22-1012-01A-01R-0692-07	LUSC	LUSC	0.036	0.964
396 | TCGA-22-1016-01A-01R-0692-07	LUSC	LUSC	0.412	0.588
397 | TCGA-22-1017-01A-01R-0692-07	LUSC	LUAD	0.874	0.126
398 | TCGA-22-4591-01A-01R-1201-07	LUSC	LUSC	0.208	0.792
399 | TCGA-22-4593-01A-21R-1820-07	LUSC	LUSC	0.068	0.932
400 | TCGA-22-4594-01A-01R-1201-07	LUSC	LUAD	0.892	0.108
401 | TCGA-22-4595-01A-01R-1201-07	LUSC	LUSC	0.058	0.942
402 | TCGA-22-4596-01A-01R-1201-07	LUSC	LUAD	0.936	0.064
403 | TCGA-22-4599-01A-01R-1443-07	LUSC	LUSC	0.068	0.932
404 | TCGA-22-4601-01A-01R-1443-07	LUSC	LUSC	0.054	0.946
405 | TCGA-22-4604-01A-01R-1201-07	LUSC	LUSC	0.018	0.982
406 | TCGA-22-4607-01A-01R-1201-07	LUSC	LUSC	0.088	0.912
407 | TCGA-22-4613-01A-01R-1443-07	LUSC	LUSC	0.022	0.978
408 | TCGA-22-5471-01A-01R-1635-07	LUSC	LUSC	0.04	0.96
409 | TCGA-22-5472-01A-01R-1635-07	LUSC	LUSC	0.09	0.91
410 | TCGA-22-5473-01A-01R-1635-07	LUSC	LUSC	0.008	0.992
411 | TCGA-22-5474-01A-01R-1635-07	LUSC	LUSC	0.048	0.952
412 | TCGA-22-5477-01A-01R-1635-07	LUSC	LUSC	0.094	0.906
413 | TCGA-22-5478-01A-01R-1635-07	LUSC	LUSC	0.278	0.722
414 | TCGA-22-5479-01A-31R-1949-07	LUSC	LUSC	0.01	0.99
415 | TCGA-22-5480-01A-01R-1635-07	LUSC	LUSC	0.196	0.804
416 | TCGA-22-5481-01A-31R-1949-07	LUSC	LUSC	0.422	0.578
417 | TCGA-22-5482-01A-01R-1635-07	LUSC	LUSC	0.014	0.986
418 | TCGA-22-5483-01A-01R-1820-07	LUSC	LUSC	0.408	0.592
419 | TCGA-22-5485-01A-01R-1635-07	LUSC	LUSC	0.04	0.96
420 | TCGA-22-5489-01A-01R-1635-07	LUSC	LUSC	0.106	0.894
421 | TCGA-22-5491-01A-01R-1635-07	LUSC	LUSC	0.024	0.976
422 | TCGA-22-5492-01A-01R-1635-07	LUSC	LUSC	0.106	0.894
423 | TCGA-33-4532-01A-01R-1201-07	LUSC	LUSC	0.028	0.972
424 | TCGA-33-4533-01A-01R-1201-07	LUSC	LUSC	0.194	0.806
425 | TCGA-33-4538-01A-01R-1201-07	LUSC	LUSC	0.014	0.986
426 | TCGA-33-4547-01A-01R-1201-07	LUSC	LUSC	0.044	0.956
427 | TCGA-33-4566-01A-01R-1443-07	LUSC	LUSC	0.486	0.514
428 | TCGA-33-4582-01A-01R-1443-07	LUSC	LUSC	0.084	0.916
429 | TCGA-33-4583-01A-01R-1443-07	LUSC	LUSC	0.04	0.96
430 | TCGA-33-4586-01A-01R-1443-07	LUSC	LUSC	0.03	0.97
431 | TCGA-33-6737-01A-11R-1820-07	LUSC	LUAD	0.754	0.246
432 | TCGA-33-6738-01A-11R-1949-07	LUSC	LUSC	0.376	0.624
433 | TCGA-34-2596-01A-01R-0851-07	LUSC	LUSC	0.05	0.95
434 | TCGA-34-2600-01A-01R-0851-07	LUSC	LUSC	0.076	0.924
435 | TCGA-34-2608-01A-02R-0851-07	LUSC	LUSC	0.046	0.954
436 | TCGA-34-5231-01A-21R-1820-07	LUSC	LUSC	0.094	0.906
437 | TCGA-34-5232-01A-21R-1820-07	LUSC	LUSC	0.024	0.976
438 | TCGA-34-5234-01A-01R-1635-07	LUSC	LUSC	0.282	0.718
439 | TCGA-34-5236-01A-21R-1820-07	LUSC	LUSC	0.016	0.984
440 | TCGA-34-5239-01A-21R-1820-07	LUSC	LUSC	0.138	0.862
441 | TCGA-34-5240-01A-01R-1443-07	LUSC	LUSC	0.064	0.936
442 | TCGA-34-5241-01A-01R-1443-07	LUSC	LUSC	0.024	0.976
443 | TCGA-34-5927-01A-11R-1820-07	LUSC	LUSC	0.184	0.816
444 | TCGA-34-5928-01A-11R-1820-07	LUSC	LUSC	0.15	0.85
445 | TCGA-34-5929-01A-11R-1820-07	LUSC	LUSC	0.086	0.914
446 | TCGA-34-7107-01A-11R-1949-07	LUSC	LUSC	0.034	0.966
447 | TCGA-37-3783-01A-01R-1201-07	LUSC	LUSC	0.1	0.9
448 | TCGA-37-3789-01A-01R-0980-07	LUSC	LUSC	0.06	0.94
449 | TCGA-37-3792-01A-01R-0980-07	LUSC	LUAD	0.504	0.496
450 | TCGA-37-4129-01A-01R-1100-07	LUSC	LUAD	0.594	0.406
451 | TCGA-37-4130-01A-01R-1100-07	LUSC	LUAD	0.598	0.402
452 | TCGA-37-4132-01A-01R-1100-07	LUSC	LUSC	0.428	0.572
453 | TCGA-37-4133-01A-01R-1100-07	LUSC	LUSC	0.172	0.828
454 | TCGA-37-4135-01A-01R-1100-07	LUSC	LUSC	0.344	0.656
455 | TCGA-37-4141-01A-02R-1100-07	LUSC	LUSC	0.434	0.566
456 | TCGA-37-5819-01A-01R-1635-07	LUSC	LUAD	0.54	0.46
457 | TCGA-39-5011-01A-01R-1443-07	LUSC	LUAD	0.838	0.162
458 | TCGA-39-5016-01A-01R-1443-07	LUSC	LUSC	0.044	0.956
459 | TCGA-39-5019-01A-01R-1820-07	LUSC	LUSC	0.004	0.996
460 | TCGA-39-5021-01A-01R-1443-07	LUSC	LUSC	0.08	0.92
461 | TCGA-39-5024-01A-21R-1820-07	LUSC	LUSC	0.094	0.906
462 | TCGA-39-5027-01A-21R-1820-07	LUSC	LUSC	0.02	0.98
463 | TCGA-39-5028-01A-01R-1443-07	LUSC	LUSC	0.166	0.834
464 | TCGA-39-5029-01A-01R-1443-07	LUSC	LUSC	0.062	0.938
465 | TCGA-39-5030-01A-01R-1443-07	LUSC	LUSC	0.082	0.918
466 | TCGA-39-5031-01A-01R-1443-07	LUSC	LUSC	0.022	0.978
467 | TCGA-39-5034-01A-01R-1443-07	LUSC	LUAD	0.802	0.198
468 | TCGA-39-5035-01A-01R-1443-07	LUSC	LUSC	0.144	0.856
469 | TCGA-39-5036-01A-01R-1443-07	LUSC	LUSC	0.048	0.952
470 | TCGA-39-5037-01A-01R-1443-07	LUSC	LUSC	0.066	0.934
471 | TCGA-39-5039-01A-01R-1443-07	LUSC	LUSC	0.288	0.712
472 | TCGA-43-2578-01A-01R-0851-07	LUSC	LUSC	0.422	0.578
473 | TCGA-43-2581-01A-01R-0851-07	LUSC	LUAD	0.838	0.162
474 | TCGA-43-3394-01A-01R-0980-07	LUSC	LUSC	0.03	0.97
475 | TCGA-43-3920-01A-01R-0980-07	LUSC	LUSC	0.094	0.906
476 | TCGA-43-5668-01A-01R-1635-07	LUSC	LUAD	0.728	0.272
477 | TCGA-43-6143-01A-11R-1820-07	LUSC	LUSC	0.192	0.808
478 | TCGA-43-6647-01A-11R-1820-07	LUSC	LUSC	0.116	0.884
479 | TCGA-43-6770-01A-11R-1820-07	LUSC	LUSC	0.016	0.984
480 | TCGA-43-6771-01A-11R-1820-07	LUSC	LUSC	0.218	0.782
481 | TCGA-46-3765-01A-01R-0980-07	LUSC	LUSC	0.01	0.99
482 | TCGA-46-3766-01A-01R-0980-07	LUSC	LUSC	0.262	0.738
483 | TCGA-46-3767-01A-01R-0980-07	LUSC	LUSC	0.052	0.948
484 | TCGA-46-3768-01A-01R-0980-07	LUSC	LUSC	0.092	0.908
485 | TCGA-46-3769-01A-01R-0980-07	LUSC	LUAD	0.572	0.428
486 | TCGA-46-6025-01A-11R-1820-07	LUSC	LUSC	0.088	0.912
487 | TCGA-46-6026-01A-11R-1820-07	LUSC	LUSC	0.144	0.856
488 | TCGA-51-4079-01A-01R-1100-07	LUSC	LUSC	0.02	0.98
489 | TCGA-51-4080-01A-01R-1100-07	LUSC	LUSC	0.07	0.93
490 | TCGA-51-4081-01A-01R-1100-07	LUSC	LUSC	0.032	0.968
491 | TCGA-56-1622-01A-01R-0692-07	LUSC	LUSC	0.168	0.832
492 | TCGA-56-5897-01A-11R-1635-07	LUSC	LUSC	0.05	0.95
493 | TCGA-56-5898-01A-11R-1635-07	LUSC	LUSC	0.046	0.954
494 | TCGA-56-6545-01A-11R-1820-07	LUSC	LUSC	0.074	0.926
495 | TCGA-56-6546-01A-11R-1820-07	LUSC	LUSC	0.466	0.534
496 | TCGA-60-2695-01A-01R-0851-07	LUSC	LUSC	0.448	0.552
497 | TCGA-60-2696-01A-01R-0851-07	LUSC	LUSC	0.156	0.844
498 | TCGA-60-2698-01A-01R-0851-07	LUSC	LUSC	0.08	0.92
499 | TCGA-60-2706-01A-01R-0851-07	LUSC	LUAD	0.624	0.376
500 | TCGA-60-2707-01A-01R-0851-07	LUSC	LUSC	0.076	0.924
501 | TCGA-60-2708-01A-01R-0851-07	LUSC	LUSC	0.044	0.956
502 | TCGA-60-2709-01A-21R-1820-07	LUSC	LUSC	0.106	0.894
503 | TCGA-60-2710-01A-01R-0851-07	LUSC	LUSC	0.05	0.95
504 | TCGA-60-2711-01A-01R-0851-07	LUSC	LUSC	0.058	0.942
505 | TCGA-60-2712-01A-01R-0851-07	LUSC	LUSC	0.096	0.904
506 | TCGA-60-2713-01A-01R-0851-07	LUSC	LUSC	0.028	0.972
507 | TCGA-60-2714-01A-01R-0851-07	LUSC	LUAD	0.838	0.162
508 | TCGA-60-2715-01A-01R-0851-07	LUSC	LUSC	0.174	0.826
509 | TCGA-60-2716-01A-01R-0851-07	LUSC	LUSC	0.206	0.794
510 | TCGA-60-2719-01A-01R-0851-07	LUSC	LUSC	0.058	0.942
511 | TCGA-60-2720-01A-01R-0851-07	LUSC	LUSC	0.16	0.84
512 | TCGA-60-2721-01A-01R-0851-07	LUSC	LUSC	0.05	0.95
513 | TCGA-60-2722-01A-01R-0851-07	LUSC	LUSC	0.032	0.968
514 | TCGA-60-2723-01A-01R-0851-07	LUSC	LUSC	0.06	0.94
515 | TCGA-60-2724-01A-01R-0851-07	LUSC	LUSC	0.056	0.944
516 | TCGA-60-2725-01A-01R-1201-07	LUSC	LUSC	0.06	0.94
517 | TCGA-60-2726-01A-01R-0851-07	LUSC	LUSC	0.166	0.834
518 | TCGA-63-5128-01A-01R-1443-07	LUSC	LUSC	0.048	0.952
519 | TCGA-63-5131-01A-01R-1443-07	LUSC	LUSC	0.078	0.922
520 | TCGA-63-6202-01A-11R-1820-07	LUSC	LUAD	0.826	0.174
521 | TCGA-63-7020-01A-11R-1949-07	LUSC	LUSC	0.13	0.87
522 | TCGA-63-7021-01A-11R-1949-07	LUSC	LUSC	0.042	0.958
523 | TCGA-63-7022-01A-11R-1949-07	LUSC	LUSC	0.09	0.91
524 | TCGA-63-7023-01A-11R-1949-07	LUSC	LUSC	0.144	0.856
525 | TCGA-66-2727-01A-01R-0980-07	LUSC	LUSC	0.034	0.966
526 | TCGA-66-2734-01A-01R-0980-07	LUSC	LUSC	0.042	0.958
527 | TCGA-66-2737-01A-01R-0980-07	LUSC	LUSC	0.008	0.992
528 | TCGA-66-2742-01A-01R-0980-07	LUSC	LUSC	0.056	0.944
529 | TCGA-66-2744-01A-01R-0980-07	LUSC	LUSC	0.426	0.574
530 | TCGA-66-2753-01A-01R-0980-07	LUSC	LUSC	0.126	0.874
531 | TCGA-66-2754-01A-01R-0980-07	LUSC	LUSC	0.5	0.5
532 | TCGA-66-2755-01A-01R-0851-07	LUSC	LUSC	0.136	0.864
533 | TCGA-66-2756-01A-01R-0851-07	LUSC	LUAD	0.578	0.422
534 | TCGA-66-2757-01A-01R-0851-07	LUSC	LUSC	0.356	0.644
535 | TCGA-66-2758-01A-02R-0851-07	LUSC	LUSC	0.07	0.93
536 | TCGA-66-2759-01A-01R-0851-07	LUSC	LUSC	0.024	0.976
537 | TCGA-66-2763-01A-01R-0851-07	LUSC	LUSC	0.04	0.96
538 | TCGA-66-2765-01A-01R-0851-07	LUSC	LUSC	0.04	0.96
539 | TCGA-66-2766-01A-01R-0851-07	LUSC	LUSC	0.036	0.964
540 | TCGA-66-2767-01A-01R-0851-07	LUSC	LUSC	0.046	0.954
541 | TCGA-66-2768-01A-01R-0851-07	LUSC	LUSC	0.04	0.96
542 | TCGA-66-2769-01A-02R-0851-07	LUSC	LUSC	0.156	0.844
543 | TCGA-66-2770-01A-01R-0851-07	LUSC	LUSC	0.038	0.962
544 | TCGA-66-2771-01A-01R-0980-07	LUSC	LUSC	0.068	0.932
545 | TCGA-66-2773-01A-01R-1201-07	LUSC	LUSC	0.09	0.91
546 | TCGA-66-2777-01A-01R-1201-07	LUSC	LUSC	0.044	0.956
547 | TCGA-66-2778-01A-02R-0851-07	LUSC	LUSC	0.102	0.898
548 | TCGA-66-2780-01A-01R-0851-07	LUSC	LUSC	0.016	0.984
549 | TCGA-66-2781-01A-01R-0851-07	LUSC	LUSC	0.024	0.976
550 | TCGA-66-2782-01A-01R-0851-07	LUSC	LUSC	0.118	0.882
551 | TCGA-66-2783-01A-01R-1201-07	LUSC	LUSC	0.038	0.962
552 | TCGA-66-2785-01A-01R-0851-07	LUSC	LUSC	0.386	0.614
553 | TCGA-66-2786-01A-01R-0851-07	LUSC	LUSC	0.166	0.834
554 | TCGA-66-2787-01A-01R-0980-07	LUSC	LUSC	0.06	0.94
555 | TCGA-66-2788-01A-01R-0980-07	LUSC	LUSC	0.018	0.982
556 | TCGA-66-2789-01A-01R-0980-07	LUSC	LUSC	0.096	0.904
557 | TCGA-66-2790-01A-01R-0980-07	LUSC	LUSC	0.11	0.89
558 | TCGA-66-2791-01A-01R-0980-07	LUSC	LUSC	0.008	0.992
559 | TCGA-66-2792-01A-01R-0980-07	LUSC	LUSC	0.018	0.982
560 | TCGA-66-2793-01A-01R-1201-07	LUSC	LUSC	0.164	0.836
561 | TCGA-66-2794-01A-01R-1201-07	LUSC	LUSC	0.036	0.964
562 | TCGA-66-2795-01A-02R-0980-07	LUSC	LUSC	0.03	0.97
563 | TCGA-66-2800-01A-01R-1201-07	LUSC	LUSC	0.04	0.96
564 | TCGA-70-6722-01A-11R-1820-07	LUSC	LUSC	0.336	0.664
565 | TCGA-70-6723-01A-11R-1820-07	LUSC	LUSC	0.11	0.89
566 | TCGA-77-6842-01A-11R-1949-07	LUSC	LUSC	0.312	0.688
567 | TCGA-77-6843-01A-11R-1949-07	LUSC	LUSC	0.104	0.896
568 | TCGA-77-6844-01A-11R-1949-07	LUSC	LUSC	0.026	0.974
569 | TCGA-77-6845-01A-11R-1949-07	LUSC	LUSC	0.04	0.96
570 | TCGA-79-5596-01A-31R-1949-07	LUSC	LUSC	0.012	0.988
571 | TCGA-85-6175-01A-11R-1820-07	LUSC	LUSC	0.264	0.736
572 | TCGA-85-6560-01A-11R-1820-07	LUSC	LUSC	0.498	0.502
573 | TCGA-85-6561-01A-11R-1820-07	LUSC	LUSC	0.022	0.978
574 | TCGA-85-6798-01A-11R-1949-07	LUSC	LUSC	0.05	0.95
575 | TCGA-90-6837-01A-11R-1949-07	LUSC	LUSC	0.088	0.912
576 | TCGA-94-7033-01A-11R-1949-07	LUSC	LUSC	0.106	0.894
577 | 


--------------------------------------------------------------------------------
/Analysis_datasets/Classification_20_LUAD_LUSC_Predictions.txt:
--------------------------------------------------------------------------------
  1 | SampleID	ActualClass	PredictedClass	LUAD_Probability	LUSC_Probability
  2 | TCGA-05-4244-01A-01R-1107-07	LUAD	LUSC	0.44	0.56
  3 | TCGA-05-4249-01A-01R-1107-07	LUAD	LUAD	0.768	0.232
  4 | TCGA-05-4250-01A-01R-1107-07	LUAD	LUSC	0.368	0.632
  5 | TCGA-05-4382-01A-01R-1206-07	LUAD	LUAD	0.996	0.004
  6 | TCGA-05-4384-01A-01R-1755-07	LUAD	LUAD	1	0
  7 | TCGA-05-4389-01A-01R-1206-07	LUAD	LUAD	1	0
  8 | TCGA-05-4390-01A-02R-1755-07	LUAD	LUAD	0.996	0.004
  9 | TCGA-05-4395-01A-01R-1206-07	LUAD	LUAD	0.992	0.008
 10 | TCGA-05-4396-01A-21R-1858-07	LUAD	LUAD	0.986	0.014
 11 | TCGA-05-4397-01A-01R-1206-07	LUAD	LUAD	0.992	0.008
 12 | TCGA-05-4398-01A-01R-1206-07	LUAD	LUAD	0.992	0.008
 13 | TCGA-05-4402-01A-01R-1206-07	LUAD	LUAD	1	0
 14 | TCGA-05-4403-01A-01R-1206-07	LUAD	LUAD	0.994	0.006
 15 | TCGA-05-4405-01A-21R-1858-07	LUAD	LUAD	0.998	0.002
 16 | TCGA-05-4410-01A-21R-1858-07	LUAD	LUAD	1	0
 17 | TCGA-05-4415-01A-22R-1858-07	LUAD	LUAD	0.992	0.008
 18 | TCGA-05-4417-01A-22R-1858-07	LUAD	LUAD	1	0
 19 | TCGA-05-4418-01A-01R-1206-07	LUAD	LUAD	0.998	0.002
 20 | TCGA-05-4420-01A-01R-1206-07	LUAD	LUAD	0.998	0.002
 21 | TCGA-05-4422-01A-01R-1206-07	LUAD	LUAD	0.996	0.004
 22 | TCGA-05-4424-01A-22R-1858-07	LUAD	LUAD	1	0
 23 | TCGA-05-4425-01A-01R-1755-07	LUAD	LUAD	0.996	0.004
 24 | TCGA-05-4426-01A-01R-1206-07	LUAD	LUAD	0.998	0.002
 25 | TCGA-05-4427-01A-21R-1858-07	LUAD	LUAD	0.98	0.02
 26 | TCGA-05-4430-01A-02R-1206-07	LUAD	LUAD	0.996	0.004
 27 | TCGA-05-4432-01A-01R-1206-07	LUAD	LUAD	0.994	0.006
 28 | TCGA-05-4433-01A-22R-1858-07	LUAD	LUAD	0.996	0.004
 29 | TCGA-05-4434-01A-01R-1206-07	LUAD	LUAD	1	0
 30 | TCGA-05-5420-01A-01R-1628-07	LUAD	LUAD	0.986	0.014
 31 | TCGA-05-5423-01A-01R-1628-07	LUAD	LUAD	0.994	0.006
 32 | TCGA-05-5425-01A-02R-1628-07	LUAD	LUAD	1	0
 33 | TCGA-05-5428-01A-01R-1628-07	LUAD	LUAD	0.994	0.006
 34 | TCGA-05-5429-01A-01R-1628-07	LUAD	LUAD	0.984	0.016
 35 | TCGA-05-5715-01A-01R-1628-07	LUAD	LUAD	1	0
 36 | TCGA-35-3615-01A-01R-0946-07	LUAD	LUAD	1	0
 37 | TCGA-35-4122-01A-01R-1107-07	LUAD	LUSC	0.184	0.816
 38 | TCGA-35-4123-01A-01R-1107-07	LUAD	LUSC	0.25	0.75
 39 | TCGA-35-5375-01A-01R-1628-07	LUAD	LUAD	0.872	0.128
 40 | TCGA-38-4625-01A-01R-1206-07	LUAD	LUAD	0.998	0.002
 41 | TCGA-38-4626-01A-01R-1206-07	LUAD	LUAD	1	0
 42 | TCGA-38-4627-01A-01R-1206-07	LUAD	LUAD	0.996	0.004
 43 | TCGA-38-4628-01A-01R-1206-07	LUAD	LUAD	0.996	0.004
 44 | TCGA-38-4629-01A-02R-1206-07	LUAD	LUAD	0.99	0.01
 45 | TCGA-38-4630-01A-01R-1206-07	LUAD	LUAD	0.902	0.098
 46 | TCGA-38-4631-01A-01R-1755-07	LUAD	LUAD	0.962	0.038
 47 | TCGA-38-4632-01A-01R-1755-07	LUAD	LUAD	1	0
 48 | TCGA-38-6178-01A-11R-1755-07	LUAD	LUAD	0.994	0.006
 49 | TCGA-38-7271-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
 50 | TCGA-44-2655-01A-01R-0946-07	LUAD	LUAD	1	0
 51 | TCGA-44-2656-01A-02R-0946-07	LUAD	LUAD	1	0
 52 | TCGA-44-2657-01A-01R-1107-07	LUAD	LUAD	1	0
 53 | TCGA-44-2659-01A-01R-0946-07	LUAD	LUAD	0.998	0.002
 54 | TCGA-44-2661-01A-01R-1107-07	LUAD	LUAD	0.968	0.032
 55 | TCGA-44-2662-01A-01R-0946-07	LUAD	LUAD	1	0
 56 | TCGA-44-2665-01A-01R-0946-07	LUAD	LUAD	0.982	0.018
 57 | TCGA-44-2666-01A-01R-0946-07	LUAD	LUAD	1	0
 58 | TCGA-44-2668-01A-01R-0946-07	LUAD	LUAD	1	0
 59 | TCGA-44-3396-01A-01R-1206-07	LUAD	LUAD	1	0
 60 | TCGA-44-3398-01A-01R-1107-07	LUAD	LUAD	0.898	0.102
 61 | TCGA-44-3918-01A-01R-1107-07	LUAD	LUSC	0.49	0.51
 62 | TCGA-44-3919-01A-02R-1107-07	LUAD	LUAD	0.864	0.136
 63 | TCGA-44-4112-01A-01R-1107-07	LUAD	LUSC	0.434	0.566
 64 | TCGA-44-5643-01A-01R-1628-07	LUAD	LUAD	0.892	0.108
 65 | TCGA-44-5644-01A-21R-2039-07	LUAD	LUAD	0.998	0.002
 66 | TCGA-44-5645-01A-01R-1628-07	LUAD	LUAD	0.982	0.018
 67 | TCGA-44-6145-01A-11R-1755-07	LUAD	LUAD	1	0
 68 | TCGA-44-6146-01A-11R-1755-07	LUAD	LUAD	1	0
 69 | TCGA-44-6147-01A-11R-1755-07	LUAD	LUAD	1	0
 70 | TCGA-44-6148-01A-11R-1755-07	LUAD	LUAD	0.982	0.018
 71 | TCGA-44-6774-01A-21R-1858-07	LUAD	LUAD	0.998	0.002
 72 | TCGA-44-6775-01A-11R-1858-07	LUAD	LUAD	1	0
 73 | TCGA-44-6776-01A-11R-1858-07	LUAD	LUAD	1	0
 74 | TCGA-44-6777-01A-11R-1858-07	LUAD	LUAD	0.996	0.004
 75 | TCGA-44-6778-01A-11R-1858-07	LUAD	LUAD	0.97	0.03
 76 | TCGA-44-6779-01A-11R-1858-07	LUAD	LUAD	0.994	0.006
 77 | TCGA-44-7659-01A-11R-2066-07	LUAD	LUAD	0.998	0.002
 78 | TCGA-44-7660-01A-11R-2066-07	LUAD	LUAD	0.988	0.012
 79 | TCGA-44-7661-01A-11R-2066-07	LUAD	LUAD	0.998	0.002
 80 | TCGA-44-7662-01A-11R-2066-07	LUAD	LUAD	1	0
 81 | TCGA-44-7667-01A-31R-2066-07	LUAD	LUAD	0.99	0.01
 82 | TCGA-44-7669-01A-21R-2066-07	LUAD	LUAD	0.982	0.018
 83 | TCGA-44-7670-01A-11R-2066-07	LUAD	LUAD	0.992	0.008
 84 | TCGA-44-7671-01A-11R-2066-07	LUAD	LUAD	1	0
 85 | TCGA-44-7672-01A-11R-2066-07	LUAD	LUAD	1	0
 86 | TCGA-44-8117-01A-11R-2241-07	LUAD	LUAD	0.994	0.006
 87 | TCGA-44-8119-01A-11R-2241-07	LUAD	LUAD	1	0
 88 | TCGA-44-8120-01A-11R-2241-07	LUAD	LUAD	1	0
 89 | TCGA-49-4486-01A-01R-1206-07	LUAD	LUAD	0.992	0.008
 90 | TCGA-49-4487-01A-21R-1858-07	LUAD	LUAD	0.998	0.002
 91 | TCGA-49-4488-01A-01R-1755-07	LUAD	LUAD	1	0
 92 | TCGA-49-4490-01A-21R-1858-07	LUAD	LUAD	1	0
 93 | TCGA-49-4494-01A-01R-1206-07	LUAD	LUAD	0.996	0.004
 94 | TCGA-49-4501-01A-01R-1206-07	LUAD	LUAD	1	0
 95 | TCGA-49-4505-01A-01R-1206-07	LUAD	LUAD	1	0
 96 | TCGA-49-4506-01A-01R-1206-07	LUAD	LUAD	0.938	0.062
 97 | TCGA-49-4507-01A-01R-1206-07	LUAD	LUAD	1	0
 98 | TCGA-49-4510-01A-01R-1206-07	LUAD	LUAD	1	0
 99 | TCGA-49-4512-01A-21R-1858-07	LUAD	LUAD	1	0
100 | TCGA-49-4514-01A-21R-1858-07	LUAD	LUAD	0.992	0.008
101 | TCGA-49-6742-01A-11R-1858-07	LUAD	LUAD	1	0
102 | TCGA-49-6743-01A-11R-1858-07	LUAD	LUAD	0.998	0.002
103 | TCGA-49-6744-01A-11R-1858-07	LUAD	LUAD	1	0
104 | TCGA-49-6745-01A-11R-1858-07	LUAD	LUAD	1	0
105 | TCGA-49-6761-01A-31R-1949-07	LUAD	LUAD	1	0
106 | TCGA-49-6767-01A-11R-1858-07	LUAD	LUAD	0.988	0.012
107 | TCGA-50-5044-01A-21R-1858-07	LUAD	LUAD	0.986	0.014
108 | TCGA-50-5049-01A-01R-1628-07	LUAD	LUAD	0.996	0.004
109 | TCGA-50-5051-01A-21R-1858-07	LUAD	LUAD	0.984	0.016
110 | TCGA-50-5055-01A-01R-1628-07	LUAD	LUAD	0.998	0.002
111 | TCGA-50-5066-01A-01R-1628-07	LUAD	LUAD	0.974	0.026
112 | TCGA-50-5066-02A-11R-2090-07	LUAD	LUAD	0.996	0.004
113 | TCGA-50-5068-01A-01R-1628-07	LUAD	LUAD	0.98	0.02
114 | TCGA-50-5072-01A-21R-1858-07	LUAD	LUAD	0.94	0.06
115 | TCGA-50-5931-01A-11R-1755-07	LUAD	LUAD	0.926	0.074
116 | TCGA-50-5932-01A-11R-1755-07	LUAD	LUAD	0.998	0.002
117 | TCGA-50-5933-01A-11R-1755-07	LUAD	LUAD	0.988	0.012
118 | TCGA-50-5935-01A-11R-1755-07	LUAD	LUAD	1	0
119 | TCGA-50-5936-01A-11R-1628-07	LUAD	LUAD	0.994	0.006
120 | TCGA-50-5939-01A-11R-1628-07	LUAD	LUAD	0.998	0.002
121 | TCGA-50-5941-01A-11R-1755-07	LUAD	LUAD	1	0
122 | TCGA-50-5942-01A-21R-1755-07	LUAD	LUAD	0.996	0.004
123 | TCGA-50-5944-01A-11R-1755-07	LUAD	LUAD	1	0
124 | TCGA-50-5946-01A-11R-1755-07	LUAD	LUAD	0.996	0.004
125 | TCGA-50-5946-02A-11R-2090-07	LUAD	LUAD	1	0
126 | TCGA-50-6590-01A-12R-1858-07	LUAD	LUAD	0.922	0.078
127 | TCGA-50-6591-01A-11R-1755-07	LUAD	LUAD	0.9	0.1
128 | TCGA-50-6592-01A-11R-1755-07	LUAD	LUAD	0.996	0.004
129 | TCGA-50-6593-01A-11R-1755-07	LUAD	LUAD	1	0
130 | TCGA-50-6594-01A-11R-1755-07	LUAD	LUAD	0.992	0.008
131 | TCGA-50-6595-01A-12R-1858-07	LUAD	LUAD	1	0
132 | TCGA-50-6597-01A-11R-1858-07	LUAD	LUAD	0.994	0.006
133 | TCGA-50-6673-01A-11R-1949-07	LUAD	LUAD	1	0
134 | TCGA-50-7109-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
135 | TCGA-53-7624-01A-11R-2066-07	LUAD	LUAD	0.976	0.024
136 | TCGA-53-7626-01A-12R-2066-07	LUAD	LUAD	1	0
137 | TCGA-53-7813-01A-11R-2170-07	LUAD	LUAD	0.996	0.004
138 | TCGA-55-1592-01A-01R-0946-07	LUAD	LUAD	0.996	0.004
139 | TCGA-55-1594-01A-01R-0946-07	LUAD	LUAD	0.986	0.014
140 | TCGA-55-1595-01A-01R-0946-07	LUAD	LUAD	1	0
141 | TCGA-55-1596-01A-01R-0946-07	LUAD	LUAD	0.996	0.004
142 | TCGA-55-5899-01A-11R-1628-07	LUAD	LUAD	0.988	0.012
143 | TCGA-55-6543-01A-11R-1755-07	LUAD	LUAD	0.996	0.004
144 | TCGA-55-6642-01A-11R-1858-07	LUAD	LUAD	1	0
145 | TCGA-55-6712-01A-11R-1858-07	LUAD	LUAD	0.998	0.002
146 | TCGA-55-6968-01A-11R-1949-07	LUAD	LUAD	0.982	0.018
147 | TCGA-55-6969-01A-11R-1949-07	LUAD	LUAD	0.992	0.008
148 | TCGA-55-6970-01A-11R-1949-07	LUAD	LUAD	1	0
149 | TCGA-55-6971-01A-11R-1949-07	LUAD	LUAD	1	0
150 | TCGA-55-6972-01A-11R-1949-07	LUAD	LUAD	0.996	0.004
151 | TCGA-55-6975-01A-11R-1949-07	LUAD	LUAD	0.936	0.064
152 | TCGA-55-6978-01A-11R-1949-07	LUAD	LUAD	1	0
153 | TCGA-55-6979-01A-11R-1949-07	LUAD	LUAD	0.954	0.046
154 | TCGA-55-6980-01A-11R-1949-07	LUAD	LUAD	1	0
155 | TCGA-55-6981-01A-11R-1949-07	LUAD	LUAD	0.996	0.004
156 | TCGA-55-6982-01A-11R-1949-07	LUAD	LUAD	0.96	0.04
157 | TCGA-55-6983-01A-11R-1949-07	LUAD	LUAD	0.998	0.002
158 | TCGA-55-6984-01A-11R-1949-07	LUAD	LUAD	0.996	0.004
159 | TCGA-55-6985-01A-11R-1949-07	LUAD	LUAD	1	0
160 | TCGA-55-6986-01A-11R-1949-07	LUAD	LUAD	0.998	0.002
161 | TCGA-55-6987-01A-11R-1949-07	LUAD	LUAD	1	0
162 | TCGA-55-7227-01A-11R-2039-07	LUAD	LUAD	1	0
163 | TCGA-55-7281-01A-11R-2039-07	LUAD	LUAD	1	0
164 | TCGA-55-7283-01A-11R-2039-07	LUAD	LUAD	1	0
165 | TCGA-55-7284-01B-11R-2241-07	LUAD	LUAD	0.996	0.004
166 | TCGA-55-7570-01A-11R-2039-07	LUAD	LUAD	0.962	0.038
167 | TCGA-55-7573-01A-11R-2039-07	LUAD	LUAD	1	0
168 | TCGA-55-7574-01A-11R-2039-07	LUAD	LUAD	1	0
169 | TCGA-55-7576-01A-11R-2066-07	LUAD	LUAD	1	0
170 | TCGA-55-7724-01A-11R-2170-07	LUAD	LUAD	0.724	0.276
171 | TCGA-55-7725-01A-11R-2170-07	LUAD	LUAD	0.996	0.004
172 | TCGA-55-7726-01A-11R-2170-07	LUAD	LUAD	0.936	0.064
173 | TCGA-55-7727-01A-11R-2170-07	LUAD	LUAD	0.926	0.074
174 | TCGA-55-7728-01A-11R-2187-07	LUAD	LUAD	1	0
175 | TCGA-55-7815-01A-11R-2170-07	LUAD	LUAD	0.87	0.13
176 | TCGA-55-7903-01A-11R-2170-07	LUAD	LUAD	1	0
177 | TCGA-55-7907-01A-11R-2170-07	LUAD	LUAD	1	0
178 | TCGA-55-7910-01A-11R-2170-07	LUAD	LUAD	0.994	0.006
179 | TCGA-55-7911-01A-11R-2170-07	LUAD	LUAD	1	0
180 | TCGA-55-7913-01B-11R-2241-07	LUAD	LUAD	0.988	0.012
181 | TCGA-55-7914-01A-11R-2170-07	LUAD	LUAD	0.998	0.002
182 | TCGA-55-7994-01A-11R-2187-07	LUAD	LUAD	0.99	0.01
183 | TCGA-55-7995-01A-11R-2187-07	LUAD	LUAD	1	0
184 | TCGA-55-8085-01A-11R-2241-07	LUAD	LUAD	1	0
185 | TCGA-55-8087-01A-11R-2241-07	LUAD	LUAD	0.998	0.002
186 | TCGA-55-8089-01A-11R-2241-07	LUAD	LUAD	1	0
187 | TCGA-55-8090-01A-11R-2241-07	LUAD	LUAD	0.996	0.004
188 | TCGA-55-8091-01A-11R-2241-07	LUAD	LUAD	0.996	0.004
189 | TCGA-55-8092-01A-11R-2241-07	LUAD	LUAD	0.994	0.006
190 | TCGA-55-8094-01A-11R-2241-07	LUAD	LUAD	1	0
191 | TCGA-55-8096-01A-11R-2241-07	LUAD	LUAD	0.996	0.004
192 | TCGA-55-8097-01A-11R-2241-07	LUAD	LUAD	1	0
193 | TCGA-55-8203-01A-11R-2241-07	LUAD	LUAD	1	0
194 | TCGA-55-8204-01A-11R-2241-07	LUAD	LUAD	0.666	0.334
195 | TCGA-55-8205-01A-11R-2241-07	LUAD	LUAD	0.998	0.002
196 | TCGA-55-8206-01A-11R-2241-07	LUAD	LUAD	1	0
197 | TCGA-55-8207-01A-11R-2241-07	LUAD	LUAD	1	0
198 | TCGA-55-8208-01A-11R-2241-07	LUAD	LUAD	1	0
199 | TCGA-55-8299-01A-11R-2287-07	LUAD	LUAD	1	0
200 | TCGA-55-8301-01A-11R-2287-07	LUAD	LUAD	1	0
201 | TCGA-64-1676-01A-01R-0946-07	LUAD	LUAD	0.98	0.02
202 | TCGA-64-1677-01A-01R-0946-07	LUAD	LUAD	0.998	0.002
203 | TCGA-64-1678-01A-01R-0946-07	LUAD	LUAD	0.928	0.072
204 | TCGA-64-1679-01A-21R-2066-07	LUAD	LUAD	1	0
205 | TCGA-64-1680-01A-02R-0946-07	LUAD	LUAD	0.998	0.002
206 | TCGA-64-1681-01A-11R-2066-07	LUAD	LUAD	1	0
207 | TCGA-64-5774-01A-01R-1628-07	LUAD	LUAD	0.998	0.002
208 | TCGA-64-5775-01A-01R-1628-07	LUAD	LUAD	0.942	0.058
209 | TCGA-64-5778-01A-01R-1628-07	LUAD	LUAD	1	0
210 | TCGA-64-5779-01A-01R-1628-07	LUAD	LUAD	1	0
211 | TCGA-64-5781-01A-01R-1628-07	LUAD	LUAD	0.998	0.002
212 | TCGA-64-5815-01A-01R-1628-07	LUAD	LUAD	1	0
213 | TCGA-67-3770-01A-01R-0946-07	LUAD	LUAD	0.998	0.002
214 | TCGA-67-3771-01A-01R-0946-07	LUAD	LUAD	1	0
215 | TCGA-67-3772-01A-01R-0946-07	LUAD	LUAD	0.996	0.004
216 | TCGA-67-3773-01A-01R-0946-07	LUAD	LUAD	0.998	0.002
217 | TCGA-67-3774-01A-01R-0946-07	LUAD	LUAD	1	0
218 | TCGA-67-4679-01B-01R-1755-07	LUAD	LUAD	1	0
219 | TCGA-67-6215-01A-11R-1755-07	LUAD	LUAD	1	0
220 | TCGA-67-6216-01A-11R-1755-07	LUAD	LUAD	0.998	0.002
221 | TCGA-67-6217-01A-11R-1755-07	LUAD	LUAD	1	0
222 | TCGA-69-7760-01A-11R-2170-07	LUAD	LUAD	0.994	0.006
223 | TCGA-69-7761-01A-11R-2170-07	LUAD	LUAD	0.988	0.012
224 | TCGA-69-7763-01A-11R-2170-07	LUAD	LUAD	1	0
225 | TCGA-69-7764-01A-11R-2170-07	LUAD	LUAD	1	0
226 | TCGA-69-7765-01A-11R-2170-07	LUAD	LUAD	1	0
227 | TCGA-69-7973-01A-11R-2187-07	LUAD	LUAD	0.996	0.004
228 | TCGA-69-7974-01A-11R-2187-07	LUAD	LUAD	1	0
229 | TCGA-69-7978-01A-11R-2187-07	LUAD	LUAD	1	0
230 | TCGA-69-7979-01A-11R-2187-07	LUAD	LUAD	0.992	0.008
231 | TCGA-69-7980-01A-11R-2187-07	LUAD	LUAD	1	0
232 | TCGA-69-8253-01A-11R-2287-07	LUAD	LUAD	1	0
233 | TCGA-69-8254-01A-11R-2287-07	LUAD	LUAD	0.994	0.006
234 | TCGA-69-8255-01A-11R-2287-07	LUAD	LUAD	0.988	0.012
235 | TCGA-71-6725-01A-11R-1858-07	LUAD	LUAD	1	0
236 | TCGA-73-4658-01A-01R-1755-07	LUAD	LUAD	1	0
237 | TCGA-73-4659-01A-01R-1206-07	LUAD	LUAD	1	0
238 | TCGA-73-4662-01A-01R-1206-07	LUAD	LUAD	0.998	0.002
239 | TCGA-73-4666-01A-01R-1206-07	LUAD	LUAD	0.994	0.006
240 | TCGA-73-4668-01A-01R-1206-07	LUAD	LUAD	1	0
241 | TCGA-73-4670-01A-01R-1206-07	LUAD	LUAD	0.998	0.002
242 | TCGA-73-4675-01A-01R-1206-07	LUAD	LUAD	1	0
243 | TCGA-73-4676-01A-01R-1755-07	LUAD	LUAD	0.998	0.002
244 | TCGA-73-4677-01A-01R-1206-07	LUAD	LUAD	1	0
245 | TCGA-73-7498-01A-12R-2187-07	LUAD	LUAD	1	0
246 | TCGA-73-7499-01A-11R-2187-07	LUAD	LUAD	1	0
247 | TCGA-75-5122-01A-01R-1755-07	LUAD	LUAD	0.992	0.008
248 | TCGA-75-5125-01A-01R-1755-07	LUAD	LUAD	1	0
249 | TCGA-75-5126-01A-01R-1755-07	LUAD	LUAD	0.998	0.002
250 | TCGA-75-5146-01A-01R-1628-07	LUAD	LUAD	0.994	0.006
251 | TCGA-75-5147-01A-01R-1628-07	LUAD	LUAD	0.998	0.002
252 | TCGA-75-6203-01A-11R-1755-07	LUAD	LUAD	1	0
253 | TCGA-75-6205-01A-11R-1755-07	LUAD	LUAD	1	0
254 | TCGA-75-6206-01A-11R-1755-07	LUAD	LUAD	1	0
255 | TCGA-75-6207-01A-11R-1755-07	LUAD	LUAD	0.99	0.01
256 | TCGA-75-6211-01A-11R-1755-07	LUAD	LUAD	0.99	0.01
257 | TCGA-75-6212-01A-11R-1755-07	LUAD	LUAD	1	0
258 | TCGA-75-6214-01A-41R-1949-07	LUAD	LUAD	0.956	0.044
259 | TCGA-75-7025-01A-12R-1949-07	LUAD	LUAD	1	0
260 | TCGA-75-7027-01A-11R-1949-07	LUAD	LUAD	0.996	0.004
261 | TCGA-75-7030-01A-11R-1949-07	LUAD	LUAD	1	0
262 | TCGA-75-7031-01A-11R-1949-07	LUAD	LUAD	1	0
263 | TCGA-78-7143-01A-11R-2039-07	LUAD	LUAD	1	0
264 | TCGA-78-7145-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
265 | TCGA-78-7146-01A-11R-2039-07	LUAD	LUAD	0.974	0.026
266 | TCGA-78-7147-01A-11R-2039-07	LUAD	LUAD	1	0
267 | TCGA-78-7148-01A-11R-2039-07	LUAD	LUAD	1	0
268 | TCGA-78-7149-01A-11R-2039-07	LUAD	LUAD	1	0
269 | TCGA-78-7150-01A-21R-2039-07	LUAD	LUAD	0.982	0.018
270 | TCGA-78-7152-01A-11R-2039-07	LUAD	LUAD	1	0
271 | TCGA-78-7153-01A-11R-2039-07	LUAD	LUAD	1	0
272 | TCGA-78-7154-01A-11R-2039-07	LUAD	LUAD	0.974	0.026
273 | TCGA-78-7155-01A-11R-2039-07	LUAD	LUAD	0.928	0.072
274 | TCGA-78-7156-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
275 | TCGA-78-7158-01A-11R-2039-07	LUAD	LUAD	0.988	0.012
276 | TCGA-78-7159-01A-11R-2039-07	LUAD	LUAD	1	0
277 | TCGA-78-7160-01A-11R-2039-07	LUAD	LUAD	1	0
278 | TCGA-78-7161-01A-11R-2039-07	LUAD	LUAD	1	0
279 | TCGA-78-7162-01A-21R-2066-07	LUAD	LUAD	0.998	0.002
280 | TCGA-78-7163-01A-12R-2066-07	LUAD	LUAD	0.982	0.018
281 | TCGA-78-7166-01A-12R-2066-07	LUAD	LUAD	1	0
282 | TCGA-78-7167-01A-11R-2066-07	LUAD	LUAD	0.998	0.002
283 | TCGA-78-7220-01A-11R-2039-07	LUAD	LUAD	0.996	0.004
284 | TCGA-78-7535-01A-11R-2066-07	LUAD	LUAD	1	0
285 | TCGA-78-7536-01A-11R-2066-07	LUAD	LUAD	0.986	0.014
286 | TCGA-78-7537-01A-11R-2066-07	LUAD	LUAD	1	0
287 | TCGA-78-7539-01A-11R-2066-07	LUAD	LUAD	0.994	0.006
288 | TCGA-78-7540-01A-11R-2066-07	LUAD	LUAD	0.998	0.002
289 | TCGA-78-7542-01A-21R-2066-07	LUAD	LUAD	0.984	0.016
290 | TCGA-78-7633-01A-11R-2066-07	LUAD	LUAD	0.998	0.002
291 | TCGA-80-5607-01A-31R-1949-07	LUAD	LUAD	1	0
292 | TCGA-80-5608-01A-31R-1949-07	LUAD	LUAD	1	0
293 | TCGA-80-5611-01A-01R-1628-07	LUAD	LUAD	0.994	0.006
294 | TCGA-83-5908-01A-21R-2287-07	LUAD	LUAD	0.998	0.002
295 | TCGA-86-6562-01A-11R-1755-07	LUAD	LUAD	1	0
296 | TCGA-86-6851-01A-11R-1949-07	LUAD	LUAD	0.998	0.002
297 | TCGA-86-7701-01A-11R-2170-07	LUAD	LUAD	0.992	0.008
298 | TCGA-86-7711-01A-11R-2066-07	LUAD	LUAD	0.982	0.018
299 | TCGA-86-7713-01A-11R-2066-07	LUAD	LUAD	0.998	0.002
300 | TCGA-86-7714-01A-12R-2170-07	LUAD	LUAD	1	0
301 | TCGA-86-7953-01A-11R-2187-07	LUAD	LUAD	0.998	0.002
302 | TCGA-86-7954-01A-11R-2187-07	LUAD	LUAD	0.998	0.002
303 | TCGA-86-7955-01A-11R-2187-07	LUAD	LUAD	0.994	0.006
304 | TCGA-86-8054-01A-11R-2241-07	LUAD	LUAD	0.992	0.008
305 | TCGA-86-8055-01A-11R-2241-07	LUAD	LUAD	0.998	0.002
306 | TCGA-86-8056-01A-11R-2241-07	LUAD	LUAD	0.998	0.002
307 | TCGA-86-8073-01A-11R-2241-07	LUAD	LUAD	0.996	0.004
308 | TCGA-86-8074-01A-11R-2241-07	LUAD	LUAD	1	0
309 | TCGA-86-8075-01A-11R-2241-07	LUAD	LUAD	1	0
310 | TCGA-86-8076-01A-31R-2241-07	LUAD	LUAD	1	0
311 | TCGA-86-8279-01A-11R-2287-07	LUAD	LUAD	1	0
312 | TCGA-86-8280-01A-11R-2287-07	LUAD	LUAD	1	0
313 | TCGA-86-8281-01A-11R-2287-07	LUAD	LUAD	1	0
314 | TCGA-91-6828-01A-11R-1858-07	LUAD	LUAD	0.998	0.002
315 | TCGA-91-6829-01A-21R-1858-07	LUAD	LUAD	0.986	0.014
316 | TCGA-91-6830-01A-11R-1949-07	LUAD	LUAD	1	0
317 | TCGA-91-6831-01A-11R-1858-07	LUAD	LUAD	0.99	0.01
318 | TCGA-91-6835-01A-11R-1858-07	LUAD	LUAD	1	0
319 | TCGA-91-6836-01A-21R-1858-07	LUAD	LUAD	0.978	0.022
320 | TCGA-91-6840-01A-11R-1949-07	LUAD	LUAD	0.99	0.01
321 | TCGA-91-6847-01A-11R-1949-07	LUAD	LUAD	0.95	0.05
322 | TCGA-91-6848-01A-11R-1949-07	LUAD	LUAD	0.924	0.076
323 | TCGA-91-6849-01A-11R-1949-07	LUAD	LUAD	1	0
324 | TCGA-91-7771-01A-11R-2170-07	LUAD	LUAD	0.998	0.002
325 | TCGA-93-7347-01A-11R-2187-07	LUAD	LUAD	1	0
326 | TCGA-93-7348-01A-21R-2039-07	LUAD	LUAD	1	0
327 | TCGA-93-8067-01A-11R-2287-07	LUAD	LUAD	1	0
328 | TCGA-95-7039-01A-11R-1949-07	LUAD	LUAD	1	0
329 | TCGA-95-7043-01A-11R-1949-07	LUAD	LUAD	0.986	0.014
330 | TCGA-95-7562-01A-11R-2241-07	LUAD	LUAD	0.99	0.01
331 | TCGA-95-7567-01A-11R-2066-07	LUAD	LUAD	1	0
332 | TCGA-95-7944-01A-11R-2187-07	LUAD	LUAD	0.998	0.002
333 | TCGA-95-7947-01A-11R-2187-07	LUAD	LUAD	0.988	0.012
334 | TCGA-95-7948-01A-11R-2187-07	LUAD	LUAD	0.994	0.006
335 | TCGA-95-8039-01A-11R-2241-07	LUAD	LUAD	1	0
336 | TCGA-97-7546-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
337 | TCGA-97-7547-01A-11R-2039-07	LUAD	LUAD	0.968	0.032
338 | TCGA-97-7552-01A-11R-2039-07	LUAD	LUAD	0.996	0.004
339 | TCGA-97-7553-01A-21R-2039-07	LUAD	LUAD	1	0
340 | TCGA-97-7554-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
341 | TCGA-97-7937-01A-11R-2170-07	LUAD	LUAD	1	0
342 | TCGA-97-7938-01A-11R-2170-07	LUAD	LUAD	1	0
343 | TCGA-97-7941-01A-11R-2187-07	LUAD	LUAD	1	0
344 | TCGA-97-8171-01A-11R-2287-07	LUAD	LUAD	0.994	0.006
345 | TCGA-97-8172-01A-11R-2287-07	LUAD	LUAD	0.998	0.002
346 | TCGA-97-8174-01A-11R-2287-07	LUAD	LUAD	0.998	0.002
347 | TCGA-97-8175-01A-11R-2287-07	LUAD	LUAD	0.968	0.032
348 | TCGA-97-8177-01A-11R-2287-07	LUAD	LUAD	1	0
349 | TCGA-97-8179-01A-11R-2287-07	LUAD	LUAD	1	0
350 | TCGA-99-7458-01A-11R-2039-07	LUAD	LUAD	0.998	0.002
351 | TCGA-99-8025-01A-11R-2241-07	LUAD	LUAD	1	0
352 | TCGA-99-8028-01A-11R-2241-07	LUAD	LUAD	1	0
353 | TCGA-99-8032-01A-11R-2241-07	LUAD	LUAD	1	0
354 | TCGA-99-8033-01A-11R-2241-07	LUAD	LUAD	0.874	0.126
355 | TCGA-J2-8192-01A-11R-2241-07	LUAD	LUAD	0.762	0.238
356 | TCGA-J2-8194-01A-11R-2241-07	LUAD	LUAD	1	0
357 | TCGA-18-3406-01A-01R-0980-07	LUSC	LUAD	0.776	0.224
358 | TCGA-18-3407-01A-01R-0980-07	LUSC	LUSC	0	1
359 | TCGA-18-3408-01A-01R-0980-07	LUSC	LUAD	0.9	0.1
360 | TCGA-18-3409-01A-01R-0980-07	LUSC	LUAD	0.832	0.168
361 | TCGA-18-3410-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
362 | TCGA-18-3411-01A-01R-0980-07	LUSC	LUSC	0	1
363 | TCGA-18-3412-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
364 | TCGA-18-3414-01A-01R-0980-07	LUSC	LUSC	0.062	0.938
365 | TCGA-18-3415-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
366 | TCGA-18-3416-01A-01R-0980-07	LUSC	LUSC	0.006	0.994
367 | TCGA-18-3417-01A-01R-1443-07	LUSC	LUSC	0.016	0.984
368 | TCGA-18-3419-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
369 | TCGA-18-3421-01A-01R-0980-07	LUSC	LUSC	0.006	0.994
370 | TCGA-18-4083-01A-01R-1100-07	LUSC	LUSC	0.006	0.994
371 | TCGA-18-4086-01A-01R-1100-07	LUSC	LUSC	0	1
372 | TCGA-18-4721-01A-01R-1443-07	LUSC	LUSC	0	1
373 | TCGA-18-5592-01A-01R-1635-07	LUSC	LUSC	0.002	0.998
374 | TCGA-18-5595-01A-01R-1635-07	LUSC	LUSC	0	1
375 | TCGA-21-1070-01A-01R-0692-07	LUSC	LUSC	0.032	0.968
376 | TCGA-21-1071-01A-01R-0692-07	LUSC	LUSC	0.006	0.994
377 | TCGA-21-1072-01A-01R-0692-07	LUSC	LUSC	0.046	0.954
378 | TCGA-21-1075-01A-01R-0692-07	LUSC	LUSC	0.12	0.88
379 | TCGA-21-1076-01A-02R-0692-07	LUSC	LUSC	0.008	0.992
380 | TCGA-21-1077-01A-01R-0692-07	LUSC	LUSC	0.002	0.998
381 | TCGA-21-1078-01A-01R-0692-07	LUSC	LUSC	0.042	0.958
382 | TCGA-21-1079-01A-01R-0692-07	LUSC	LUSC	0.014	0.986
383 | TCGA-21-1080-01A-01R-0692-07	LUSC	LUSC	0.004	0.996
384 | TCGA-21-1081-01A-01R-0692-07	LUSC	LUSC	0.004	0.996
385 | TCGA-21-1082-01A-01R-0692-07	LUSC	LUSC	0.004	0.996
386 | TCGA-21-1083-01A-01R-0692-07	LUSC	LUSC	0.002	0.998
387 | TCGA-21-5782-01A-01R-1635-07	LUSC	LUSC	0.032	0.968
388 | TCGA-21-5784-01A-01R-1635-07	LUSC	LUSC	0.01	0.99
389 | TCGA-21-5786-01A-01R-1635-07	LUSC	LUSC	0.008	0.992
390 | TCGA-21-5787-01A-01R-1635-07	LUSC	LUSC	0.008	0.992
391 | TCGA-22-0940-01A-01R-0692-07	LUSC	LUSC	0.004	0.996
392 | TCGA-22-0944-01A-01R-0692-07	LUSC	LUSC	0.002	0.998
393 | TCGA-22-1002-01A-01R-0692-07	LUSC	LUSC	0	1
394 | TCGA-22-1011-01A-01R-0692-07	LUSC	LUSC	0.004	0.996
395 | TCGA-22-1012-01A-01R-0692-07	LUSC	LUSC	0.002	0.998
396 | TCGA-22-1016-01A-01R-0692-07	LUSC	LUSC	0.006	0.994
397 | TCGA-22-1017-01A-01R-0692-07	LUSC	LUSC	0.092	0.908
398 | TCGA-22-4591-01A-01R-1201-07	LUSC	LUSC	0.018	0.982
399 | TCGA-22-4593-01A-21R-1820-07	LUSC	LUSC	0.006	0.994
400 | TCGA-22-4594-01A-01R-1201-07	LUSC	LUSC	0.152	0.848
401 | TCGA-22-4595-01A-01R-1201-07	LUSC	LUSC	0.002	0.998
402 | TCGA-22-4596-01A-01R-1201-07	LUSC	LUSC	0.172	0.828
403 | TCGA-22-4599-01A-01R-1443-07	LUSC	LUSC	0.032	0.968
404 | TCGA-22-4601-01A-01R-1443-07	LUSC	LUSC	0.002	0.998
405 | TCGA-22-4604-01A-01R-1201-07	LUSC	LUSC	0	1
406 | TCGA-22-4607-01A-01R-1201-07	LUSC	LUSC	0.002	0.998
407 | TCGA-22-4613-01A-01R-1443-07	LUSC	LUSC	0.008	0.992
408 | TCGA-22-5471-01A-01R-1635-07	LUSC	LUSC	0.002	0.998
409 | TCGA-22-5472-01A-01R-1635-07	LUSC	LUSC	0	1
410 | TCGA-22-5473-01A-01R-1635-07	LUSC	LUSC	0	1
411 | TCGA-22-5474-01A-01R-1635-07	LUSC	LUSC	0	1
412 | TCGA-22-5477-01A-01R-1635-07	LUSC	LUSC	0	1
413 | TCGA-22-5478-01A-01R-1635-07	LUSC	LUSC	0.006	0.994
414 | TCGA-22-5479-01A-31R-1949-07	LUSC	LUSC	0	1
415 | TCGA-22-5480-01A-01R-1635-07	LUSC	LUSC	0.004	0.996
416 | TCGA-22-5481-01A-31R-1949-07	LUSC	LUSC	0.068	0.932
417 | TCGA-22-5482-01A-01R-1635-07	LUSC	LUSC	0	1
418 | TCGA-22-5483-01A-01R-1820-07	LUSC	LUSC	0.002	0.998
419 | TCGA-22-5485-01A-01R-1635-07	LUSC	LUSC	0.002	0.998
420 | TCGA-22-5489-01A-01R-1635-07	LUSC	LUSC	0.004	0.996
421 | TCGA-22-5491-01A-01R-1635-07	LUSC	LUSC	0.002	0.998
422 | TCGA-22-5492-01A-01R-1635-07	LUSC	LUSC	0.008	0.992
423 | TCGA-33-4532-01A-01R-1201-07	LUSC	LUSC	0.002	0.998
424 | TCGA-33-4533-01A-01R-1201-07	LUSC	LUSC	0.022	0.978
425 | TCGA-33-4538-01A-01R-1201-07	LUSC	LUSC	0.016	0.984
426 | TCGA-33-4547-01A-01R-1201-07	LUSC	LUSC	0	1
427 | TCGA-33-4566-01A-01R-1443-07	LUSC	LUSC	0.054	0.946
428 | TCGA-33-4582-01A-01R-1443-07	LUSC	LUSC	0	1
429 | TCGA-33-4583-01A-01R-1443-07	LUSC	LUSC	0.012	0.988
430 | TCGA-33-4586-01A-01R-1443-07	LUSC	LUSC	0.012	0.988
431 | TCGA-33-6737-01A-11R-1820-07	LUSC	LUSC	0.006	0.994
432 | TCGA-33-6738-01A-11R-1949-07	LUSC	LUSC	0.004	0.996
433 | TCGA-34-2596-01A-01R-0851-07	LUSC	LUSC	0	1
434 | TCGA-34-2600-01A-01R-0851-07	LUSC	LUSC	0.002	0.998
435 | TCGA-34-2608-01A-02R-0851-07	LUSC	LUSC	0	1
436 | TCGA-34-5231-01A-21R-1820-07	LUSC	LUSC	0.008	0.992
437 | TCGA-34-5232-01A-21R-1820-07	LUSC	LUSC	0	1
438 | TCGA-34-5234-01A-01R-1635-07	LUSC	LUSC	0.108	0.892
439 | TCGA-34-5236-01A-21R-1820-07	LUSC	LUSC	0	1
440 | TCGA-34-5239-01A-21R-1820-07	LUSC	LUSC	0	1
441 | TCGA-34-5240-01A-01R-1443-07	LUSC	LUSC	0.022	0.978
442 | TCGA-34-5241-01A-01R-1443-07	LUSC	LUSC	0.004	0.996
443 | TCGA-34-5927-01A-11R-1820-07	LUSC	LUSC	0.006	0.994
444 | TCGA-34-5928-01A-11R-1820-07	LUSC	LUSC	0.002	0.998
445 | TCGA-34-5929-01A-11R-1820-07	LUSC	LUSC	0.002	0.998
446 | TCGA-34-7107-01A-11R-1949-07	LUSC	LUSC	0	1
447 | TCGA-37-3783-01A-01R-1201-07	LUSC	LUSC	0.008	0.992
448 | TCGA-37-3789-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
449 | TCGA-37-3792-01A-01R-0980-07	LUSC	LUSC	0.12	0.88
450 | TCGA-37-4129-01A-01R-1100-07	LUSC	LUSC	0.082	0.918
451 | TCGA-37-4130-01A-01R-1100-07	LUSC	LUSC	0.116	0.884
452 | TCGA-37-4132-01A-01R-1100-07	LUSC	LUSC	0.012	0.988
453 | TCGA-37-4133-01A-01R-1100-07	LUSC	LUSC	0.034	0.966
454 | TCGA-37-4135-01A-01R-1100-07	LUSC	LUSC	0.024	0.976
455 | TCGA-37-4141-01A-02R-1100-07	LUSC	LUSC	0.008	0.992
456 | TCGA-37-5819-01A-01R-1635-07	LUSC	LUSC	0.032	0.968
457 | TCGA-39-5011-01A-01R-1443-07	LUSC	LUSC	0.032	0.968
458 | TCGA-39-5016-01A-01R-1443-07	LUSC	LUSC	0	1
459 | TCGA-39-5019-01A-01R-1820-07	LUSC	LUSC	0.122	0.878
460 | TCGA-39-5021-01A-01R-1443-07	LUSC	LUSC	0	1
461 | TCGA-39-5024-01A-21R-1820-07	LUSC	LUSC	0.05	0.95
462 | TCGA-39-5027-01A-21R-1820-07	LUSC	LUSC	0	1
463 | TCGA-39-5028-01A-01R-1443-07	LUSC	LUSC	0.002	0.998
464 | TCGA-39-5029-01A-01R-1443-07	LUSC	LUSC	0	1
465 | TCGA-39-5030-01A-01R-1443-07	LUSC	LUSC	0.002	0.998
466 | TCGA-39-5031-01A-01R-1443-07	LUSC	LUSC	0	1
467 | TCGA-39-5034-01A-01R-1443-07	LUSC	LUSC	0.026	0.974
468 | TCGA-39-5035-01A-01R-1443-07	LUSC	LUSC	0.01	0.99
469 | TCGA-39-5036-01A-01R-1443-07	LUSC	LUSC	0	1
470 | TCGA-39-5037-01A-01R-1443-07	LUSC	LUSC	0.002	0.998
471 | TCGA-39-5039-01A-01R-1443-07	LUSC	LUSC	0.006	0.994
472 | TCGA-43-2578-01A-01R-0851-07	LUSC	LUSC	0.002	0.998
473 | TCGA-43-2581-01A-01R-0851-07	LUSC	LUSC	0.104	0.896
474 | TCGA-43-3394-01A-01R-0980-07	LUSC	LUSC	0	1
475 | TCGA-43-3920-01A-01R-0980-07	LUSC	LUSC	0.026	0.974
476 | TCGA-43-5668-01A-01R-1635-07	LUSC	LUSC	0.016	0.984
477 | TCGA-43-6143-01A-11R-1820-07	LUSC	LUSC	0.01	0.99
478 | TCGA-43-6647-01A-11R-1820-07	LUSC	LUSC	0.006	0.994
479 | TCGA-43-6770-01A-11R-1820-07	LUSC	LUSC	0	1
480 | TCGA-43-6771-01A-11R-1820-07	LUSC	LUSC	0.066	0.934
481 | TCGA-46-3765-01A-01R-0980-07	LUSC	LUSC	0	1
482 | TCGA-46-3766-01A-01R-0980-07	LUSC	LUSC	0.026	0.974
483 | TCGA-46-3767-01A-01R-0980-07	LUSC	LUSC	0.006	0.994
484 | TCGA-46-3768-01A-01R-0980-07	LUSC	LUSC	0	1
485 | TCGA-46-3769-01A-01R-0980-07	LUSC	LUSC	0.034	0.966
486 | TCGA-46-6025-01A-11R-1820-07	LUSC	LUSC	0.004	0.996
487 | TCGA-46-6026-01A-11R-1820-07	LUSC	LUSC	0.052	0.948
488 | TCGA-51-4079-01A-01R-1100-07	LUSC	LUSC	0	1
489 | TCGA-51-4080-01A-01R-1100-07	LUSC	LUSC	0.002	0.998
490 | TCGA-51-4081-01A-01R-1100-07	LUSC	LUSC	0	1
491 | TCGA-56-1622-01A-01R-0692-07	LUSC	LUSC	0.008	0.992
492 | TCGA-56-5897-01A-11R-1635-07	LUSC	LUSC	0	1
493 | TCGA-56-5898-01A-11R-1635-07	LUSC	LUSC	0.008	0.992
494 | TCGA-56-6545-01A-11R-1820-07	LUSC	LUSC	0.036	0.964
495 | TCGA-56-6546-01A-11R-1820-07	LUSC	LUSC	0.004	0.996
496 | TCGA-60-2695-01A-01R-0851-07	LUSC	LUSC	0.004	0.996
497 | TCGA-60-2696-01A-01R-0851-07	LUSC	LUSC	0.006	0.994
498 | TCGA-60-2698-01A-01R-0851-07	LUSC	LUSC	0.038	0.962
499 | TCGA-60-2706-01A-01R-0851-07	LUSC	LUSC	0.032	0.968
500 | TCGA-60-2707-01A-01R-0851-07	LUSC	LUSC	0.002	0.998
501 | TCGA-60-2708-01A-01R-0851-07	LUSC	LUSC	0	1
502 | TCGA-60-2709-01A-21R-1820-07	LUSC	LUSC	0.004	0.996
503 | TCGA-60-2710-01A-01R-0851-07	LUSC	LUSC	0.004	0.996
504 | TCGA-60-2711-01A-01R-0851-07	LUSC	LUSC	0	1
505 | TCGA-60-2712-01A-01R-0851-07	LUSC	LUSC	0	1
506 | TCGA-60-2713-01A-01R-0851-07	LUSC	LUSC	0	1
507 | TCGA-60-2714-01A-01R-0851-07	LUSC	LUSC	0.258	0.742
508 | TCGA-60-2715-01A-01R-0851-07	LUSC	LUSC	0.002	0.998
509 | TCGA-60-2716-01A-01R-0851-07	LUSC	LUSC	0.026	0.974
510 | TCGA-60-2719-01A-01R-0851-07	LUSC	LUSC	0	1
511 | TCGA-60-2720-01A-01R-0851-07	LUSC	LUSC	0.016	0.984
512 | TCGA-60-2721-01A-01R-0851-07	LUSC	LUSC	0	1
513 | TCGA-60-2722-01A-01R-0851-07	LUSC	LUSC	0	1
514 | TCGA-60-2723-01A-01R-0851-07	LUSC	LUSC	0	1
515 | TCGA-60-2724-01A-01R-0851-07	LUSC	LUSC	0.008	0.992
516 | TCGA-60-2725-01A-01R-1201-07	LUSC	LUSC	0.002	0.998
517 | TCGA-60-2726-01A-01R-0851-07	LUSC	LUSC	0.008	0.992
518 | TCGA-63-5128-01A-01R-1443-07	LUSC	LUSC	0.014	0.986
519 | TCGA-63-5131-01A-01R-1443-07	LUSC	LUSC	0.012	0.988
520 | TCGA-63-6202-01A-11R-1820-07	LUSC	LUSC	0.068	0.932
521 | TCGA-63-7020-01A-11R-1949-07	LUSC	LUSC	0.006	0.994
522 | TCGA-63-7021-01A-11R-1949-07	LUSC	LUSC	0.028	0.972
523 | TCGA-63-7022-01A-11R-1949-07	LUSC	LUSC	0	1
524 | TCGA-63-7023-01A-11R-1949-07	LUSC	LUSC	0	1
525 | TCGA-66-2727-01A-01R-0980-07	LUSC	LUSC	0	1
526 | TCGA-66-2734-01A-01R-0980-07	LUSC	LUSC	0	1
527 | TCGA-66-2737-01A-01R-0980-07	LUSC	LUSC	0	1
528 | TCGA-66-2742-01A-01R-0980-07	LUSC	LUSC	0	1
529 | TCGA-66-2744-01A-01R-0980-07	LUSC	LUSC	0.004	0.996
530 | TCGA-66-2753-01A-01R-0980-07	LUSC	LUSC	0.01	0.99
531 | TCGA-66-2754-01A-01R-0980-07	LUSC	LUSC	0.02	0.98
532 | TCGA-66-2755-01A-01R-0851-07	LUSC	LUSC	0.002	0.998
533 | TCGA-66-2756-01A-01R-0851-07	LUSC	LUSC	0.088	0.912
534 | TCGA-66-2757-01A-01R-0851-07	LUSC	LUSC	0.004	0.996
535 | TCGA-66-2758-01A-02R-0851-07	LUSC	LUSC	0	1
536 | TCGA-66-2759-01A-01R-0851-07	LUSC	LUSC	0	1
537 | TCGA-66-2763-01A-01R-0851-07	LUSC	LUSC	0.006	0.994
538 | TCGA-66-2765-01A-01R-0851-07	LUSC	LUSC	0.02	0.98
539 | TCGA-66-2766-01A-01R-0851-07	LUSC	LUSC	0.01	0.99
540 | TCGA-66-2767-01A-01R-0851-07	LUSC	LUSC	0	1
541 | TCGA-66-2768-01A-01R-0851-07	LUSC	LUSC	0	1
542 | TCGA-66-2769-01A-02R-0851-07	LUSC	LUSC	0.002	0.998
543 | TCGA-66-2770-01A-01R-0851-07	LUSC	LUSC	0	1
544 | TCGA-66-2771-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
545 | TCGA-66-2773-01A-01R-1201-07	LUSC	LUSC	0.002	0.998
546 | TCGA-66-2777-01A-01R-1201-07	LUSC	LUSC	0	1
547 | TCGA-66-2778-01A-02R-0851-07	LUSC	LUSC	0.014	0.986
548 | TCGA-66-2780-01A-01R-0851-07	LUSC	LUSC	0.002	0.998
549 | TCGA-66-2781-01A-01R-0851-07	LUSC	LUSC	0	1
550 | TCGA-66-2782-01A-01R-0851-07	LUSC	LUSC	0	1
551 | TCGA-66-2783-01A-01R-1201-07	LUSC	LUSC	0	1
552 | TCGA-66-2785-01A-01R-0851-07	LUSC	LUSC	0.024	0.976
553 | TCGA-66-2786-01A-01R-0851-07	LUSC	LUSC	0	1
554 | TCGA-66-2787-01A-01R-0980-07	LUSC	LUSC	0	1
555 | TCGA-66-2788-01A-01R-0980-07	LUSC	LUSC	0.002	0.998
556 | TCGA-66-2789-01A-01R-0980-07	LUSC	LUSC	0.004	0.996
557 | TCGA-66-2790-01A-01R-0980-07	LUSC	LUSC	0.008	0.992
558 | TCGA-66-2791-01A-01R-0980-07	LUSC	LUSC	0	1
559 | TCGA-66-2792-01A-01R-0980-07	LUSC	LUSC	0	1
560 | TCGA-66-2793-01A-01R-1201-07	LUSC	LUSC	0.016	0.984
561 | TCGA-66-2794-01A-01R-1201-07	LUSC	LUSC	0	1
562 | TCGA-66-2795-01A-02R-0980-07	LUSC	LUSC	0.008	0.992
563 | TCGA-66-2800-01A-01R-1201-07	LUSC	LUSC	0.002	0.998
564 | TCGA-70-6722-01A-11R-1820-07	LUSC	LUSC	0.012	0.988
565 | TCGA-70-6723-01A-11R-1820-07	LUSC	LUSC	0.026	0.974
566 | TCGA-77-6842-01A-11R-1949-07	LUSC	LUSC	0.002	0.998
567 | TCGA-77-6843-01A-11R-1949-07	LUSC	LUSC	0.006	0.994
568 | TCGA-77-6844-01A-11R-1949-07	LUSC	LUSC	0.004	0.996
569 | TCGA-77-6845-01A-11R-1949-07	LUSC	LUSC	0	1
570 | TCGA-79-5596-01A-31R-1949-07	LUSC	LUSC	0	1
571 | TCGA-85-6175-01A-11R-1820-07	LUSC	LUSC	0.138	0.862
572 | TCGA-85-6560-01A-11R-1820-07	LUSC	LUAD	0.882	0.118
573 | TCGA-85-6561-01A-11R-1820-07	LUSC	LUSC	0.02	0.98
574 | TCGA-85-6798-01A-11R-1949-07	LUSC	LUSC	0	1
575 | TCGA-90-6837-01A-11R-1949-07	LUSC	LUSC	0	1
576 | TCGA-94-7033-01A-11R-1949-07	LUSC	LUSC	0.008	0.992
577 | 


--------------------------------------------------------------------------------
/Codes/BuildMatrixFile.py:
--------------------------------------------------------------------------------
 1 | import os, sys, glob
 2 | from utilities import *
 3 | 
 4 | inFilePattern = sys.argv[1]
 5 | outFilePath = sys.argv[2]
 6 | 
 7 | inFilePaths = sorted(glob.glob(inFilePattern))
 8 | sampleIDs = [os.path.basename(x) for x in inFilePaths]
 9 | 
10 | features = set()
11 | for inFilePath in inFilePaths:
12 |     print "Identifying features in %s" % inFilePath
13 |     for line in file(inFilePath):
14 |         features.add(line.rstrip().split("\t")[0])
15 | features = sorted(list(features))
16 | 
17 | outData = [[""] + features]
18 | for inFilePath in inFilePaths:
19 |     print "Parsing and saving values for %s" % inFilePath
20 |     sampleID = os.path.basename(inFilePath)
21 | 
22 |     valueDict = {}
23 |     for line in file(inFilePath):
24 |         lineItems = line.rstrip().split("\t")
25 |         valueDict[lineItems[0]] = lineItems[1]
26 | 
27 |     values = [valueDict[feature] for feature in features]
28 |     outData.append([sampleID] + values)
29 | 
30 | print "Transposing and saving to %s" % outFilePath
31 | writeMatrixToFile(transposeMatrix(outData), outFilePath)
32 | 


--------------------------------------------------------------------------------
/Codes/CalcAUC.R:
--------------------------------------------------------------------------------
 1 | library(pROC)
 2 | 
 3 | # Usage: Rscript CalcAUC.R <inFilePath> <actualColumnName> <probabilitiesColumnName> <outFilePath> <main>
 4 | # Draws a ROC curve with a sensitivity confidence band to a PDF and prints the
 5 | # lower bound, point estimate and upper bound of the AUC confidence interval.
 6 | 
 7 | # trailingOnly=TRUE returns just the user-supplied arguments, so the script no longer
 8 | # depends on how many options (e.g. --vanilla) precede them on the R command line.
 9 | args = commandArgs(trailingOnly=TRUE)
10 | if (length(args) < 4)
11 |   stop("Expected arguments: inFilePath actualColumnName probabilitiesColumnName outFilePath [main]")
12 | 
13 | inFilePath = args[1]
14 | actualColumnName = args[2]
15 | probabilitiesColumnName = args[3]
16 | outFilePath = args[4]
17 | main = args[5]  # NA when no title was given, as with the previous positional lookup
18 | 
19 | data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)
20 | 
21 | actual = as.factor(data[,actualColumnName])
22 | probabilities = as.numeric(data[,probabilitiesColumnName])
23 | 
24 | pdf(outFilePath)
25 | par(mar=c(4.5, 4.7, 0.0, 0.5), lwd=4)
26 | 
27 | # ci=TRUE stores the AUC confidence interval in roc_result$ci, read below as
28 | # (lower bound, estimate, upper bound).
29 | roc_result = roc(actual ~ probabilities, ci=TRUE, plot=TRUE, print.auc=FALSE)
30 | lowerBoundAuc = format(roc_result$ci[1], digits=3)
31 | midAuc = format(roc_result$ci[2], digits=3)
32 | upperBoundAuc = format(roc_result$ci[3], digits=3)
33 | 
34 | ci(roc_result)
35 | # Overlay the sensitivity confidence region, then redraw the curve on top of it.
36 | sens.ci <- ci.se(roc_result)
37 | plot(sens.ci, type="shape", col="gray95")
38 | plot(sens.ci, type="bars")
39 | plot(roc_result, add=TRUE)
40 | 
41 | text(0.5, 0.00, labels=paste("AUC: ", midAuc, " (", lowerBoundAuc, "-", upperBoundAuc, ")", sep=""))
42 | title(main)
43 | 
44 | par(mar=c(5.1, 4.1, 2.1, 2.1))
45 | graphics.off()
46 | 
47 | print(c(lowerBoundAuc, midAuc, upperBoundAuc))
48 | 


--------------------------------------------------------------------------------
/Codes/CalcAccuracy.R:
--------------------------------------------------------------------------------
 1 | library(pROC)
 2 | 
 3 | # Usage: Rscript CalcAccuracy.R <inFilePath> <actualColumnName> <predColumnName>
 4 | # Prints the fraction of rows whose predicted class equals the actual class.
 5 | 
 6 | # trailingOnly=TRUE returns just the user-supplied arguments, so the script no longer
 7 | # depends on how many options (e.g. --vanilla) precede them on the R command line.
 8 | args = commandArgs(trailingOnly=TRUE)
 9 | if (length(args) < 3)
10 |   stop("Expected arguments: inFilePath actualColumnName predColumnName")
11 | 
12 | inFilePath = args[1]
13 | actualColumnName = args[2]
14 | predColumnName = args[3]
15 | 
16 | data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)
17 | 
18 | actual = data[,actualColumnName]
19 | pred = data[,predColumnName]
20 | 
21 | # Proportion of rows where the two columns agree.
22 | accuracy = sum(actual == pred) / nrow(data)
23 | 
24 | print(accuracy)
25 | 


--------------------------------------------------------------------------------
/Codes/Classify_luad_vs_lusc.R:
--------------------------------------------------------------------------------
 1 | library(caret)
 2 | 
 3 | outFilePath12 = "Classification_12_LUAD_LUSC_Predictions.txt"
 4 | outFilePath20 = "Classification_20_LUAD_LUSC_Predictions.txt"
 5 | 
 6 | # Convert a data frame to a numeric matrix by parsing the text of each non-numeric column.
 7 | # data.matrix() on its own replaces character columns with factor level codes, and the
 8 | # columns read below are expected to be character because their last row is a class label.
 9 | toNumericMatrix = function(frame)
10 | {
11 |   frame[] = lapply(frame, function(column) if (is.numeric(column)) column else as.numeric(as.character(column)))
12 |   return(data.matrix(frame))
13 | }
14 | 
15 | # Read data from file (the relative output paths above are resolved after this setwd)
16 | setwd("Analysis_datasets")
17 | luad12 = read.table("12_LUAD_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
18 | lusc12 = read.table("12_LUSC_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
19 | lu12 = cbind(luad12,lusc12)
20 | luad20 = read.table("20_LUAD_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
21 | lusc20 = read.table("20_LUSC_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
22 | lu20 = cbind(luad20,lusc20)
23 | 
24 | # Only keep the same samples in TCGA processed versus Rsubread processed data
25 | # NOTE(review): only lu20 is filtered; lu12 keeps any sample that is absent from lu20.
26 | lu20_f = lu20[,colnames(lu20)%in%colnames(lu12)]
27 | 
28 | # The last row of each table holds the class label of every sample: split it off and
29 | # turn the remaining rows into a numeric samples-by-features matrix.
30 | classes12 = as.factor(as.character(lu12[nrow(lu12),]))
31 | data12 = t(toNumericMatrix(lu12[-nrow(lu12),]))
32 | classes20 = as.factor(as.character(lu20_f[nrow(lu20_f),]))
33 | data20 = t(toNumericMatrix(lu20_f[-nrow(lu20_f),]))
34 | 
35 | # Retain features that do not have zero variance
36 | data12 = data12[,which(apply(data12, 2, var) > 0)]
37 | data20 = data20[,which(apply(data20, 2, var) > 0)]
38 | 
39 | # Set random seed so results are same each time
40 | set.seed(0)
41 | 
42 | # Build the classification model (the seed is reset so both models start from the same state)
43 | mod12 <- train(classes12~., data=data12, method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T))
44 | set.seed(0)
45 | mod20 <- train(classes20~., data=data20, method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T))
46 | 
47 | # Determine which mtry parameter value performed best
48 | tuneResults12 = mod12$results[order(mod12$results$Accuracy, decreasing=TRUE),]
49 | bestMtry12 = tuneResults12[1,]$mtry
50 | tuneResults20 = mod20$results[order(mod20$results$Accuracy, decreasing=TRUE),]
51 | bestMtry20 = tuneResults20[1,]$mtry
52 | 
53 | # Select predictions that coincide with best mtry parameter
54 | predictions12 = mod12$pred[which(mod12$pred$mtry == bestMtry12),]
55 | predictions20 = mod20$pred[which(mod20$pred$mtry == bestMtry20),]
56 | 
57 | # Sort predictions by the original order
58 | predictions12 = predictions12[order(predictions12$rowIndex),]
59 | predictions20 = predictions20[order(predictions20$rowIndex),]
60 | 
61 | # Build output matrix: sample ID, prediction column 2 (written as ActualClass), prediction
62 | # column 1 (written as PredictedClass), then every column except the last three, each
63 | # renamed with a "_Probability" suffix.
64 | output12 = cbind(rownames(data12), predictions12[,2], predictions12[,1], predictions12[,3:(ncol(predictions12) - 3)])
65 | colnames(output12) = c("SampleID", "ActualClass", "PredictedClass", paste(colnames(output12)[4:ncol(output12)], "Probability", sep="_"))
66 | output20 = cbind(rownames(data20), predictions20[,2], predictions20[,1], predictions20[,3:(ncol(predictions20) - 3)])
67 | colnames(output20) = c("SampleID", "ActualClass", "PredictedClass", paste(colnames(output20)[4:ncol(output20)], "Probability", sep="_"))
68 | 
69 | # Save predictions to output file
70 | write.table(output12, outFilePath12, sep="\t", col.names=T, row.names=F, quote=F)
71 | write.table(output20, outFilePath20, sep="\t", col.names=T, row.names=F, quote=F)
72 | 


--------------------------------------------------------------------------------
/Codes/CombineScalarValues.py:
--------------------------------------------------------------------------------
 1 | import os, sys, glob
 2 | from utilities import *
 3 | 
 4 | inFilePattern = sys.argv[1]
 5 | outFilePath = sys.argv[2]
 6 | 
 7 | outFile = open(outFilePath, 'w')
 8 | 
 9 | for inFilePath in glob.glob(inFilePattern):
10 |     outFile.write("%s\t%s\n" % (os.path.basename(inFilePath), readScalarFromFile(inFilePath)))
11 | 
12 | outFile.close()
13 | 


--------------------------------------------------------------------------------
/Codes/FileContainsText.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 | from utilities import *
3 | 
4 | inFilePath = sys.argv[1]
5 | searchPattern = sys.argv[2].decode('string-escape')
6 | 
7 | print searchPattern in readTextFromFile(inFilePath)
8 | 


--------------------------------------------------------------------------------
/Codes/GetFileExtension.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | 
3 | inFilePath = sys.argv[1]
4 | 
5 | file, ext = os.path.splitext(inFilePath)
6 | 
7 | print ext
8 | 


--------------------------------------------------------------------------------
/Codes/IdentifyDiscordantPredictions.R:
--------------------------------------------------------------------------------
 1 | # Usage: Rscript IdentifyDiscordantPredictions.R <inFilePath> <actualColumnName> <predictedColumnName> [potentiallyDiscordantFilePath]
 2 | # Prints how many samples were predicted incorrectly and how many of those appear in
 3 | # the list of potentially discordant samples.
 4 | 
 5 | # trailingOnly=TRUE returns just the user-supplied arguments, so the script no longer
 6 | # depends on how many options (e.g. --vanilla) precede them on the R command line.
 7 | args = commandArgs(trailingOnly=TRUE)
 8 | if (length(args) < 3)
 9 |   stop("Expected arguments: inFilePath actualColumnName predictedColumnName [potentiallyDiscordantFilePath]")
10 | 
11 | inFilePath = args[1]
12 | actualColumnName = args[2]
13 | predictedColumnName = args[3]
14 | # This argument used to be read and then ignored in favour of a hard-coded file name;
15 | # it is honoured now, with the old name kept as the default when it is omitted.
16 | potentiallyDiscordantFilePath = if (length(args) >= 4) args[4] else "Potentially_Discordant_LUSC_Samples.txt"
17 | 
18 | data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)
19 | 
20 | incorrect = data[which(data[,actualColumnName]!=data[,predictedColumnName]),]
21 | 
22 | potentiallyDiscordantSamples = scan(potentiallyDiscordantFilePath, what=character(), quiet=TRUE)
23 | 
24 | print("Samples predicted incorrectly:")
25 | print(nrow(incorrect))
26 | 
27 | print("Samples predicted incorrectly that were identified previously as potentially discordant:")
28 | # Sample IDs are taken from the first column. With row.names=NULL that column is only
29 | # called "row.names" when the header is one field short; otherwise it keeps its header
30 | # name and incorrect$row.names would be NULL, silently giving a count of 0.
31 | print(nrow(incorrect[which(incorrect[,1] %in% potentiallyDiscordantSamples),]))
32 | 


--------------------------------------------------------------------------------
/Codes/IdentifyInconsistentPredictions.R:
--------------------------------------------------------------------------------
 1 | # Usage: Rscript IdentifyInconsistentPredictions.R <inFilePath1> <inFilePath2> <actualColumnName> <predictedColumnName>
 2 | # Prints the number of incorrect predictions in each file, then the merged rows of the
 3 | # samples that were predicted incorrectly in exactly one of the two files.
 4 | 
 5 | # trailingOnly=TRUE returns just the user-supplied arguments, so the script no longer
 6 | # depends on how many options (e.g. --vanilla) precede them on the R command line.
 7 | args = commandArgs(trailingOnly=TRUE)
 8 | if (length(args) < 4)
 9 |   stop("Expected arguments: inFilePath1 inFilePath2 actualColumnName predictedColumnName")
10 | 
11 | inFilePath1 = args[1]
12 | inFilePath2 = args[2]
13 | actualColumnName = args[3]
14 | predictedColumnName = args[4]
15 | 
16 | data1 = read.table(inFilePath1, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)
17 | data2 = read.table(inFilePath2, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)
18 | 
19 | incorrect1 = data1[which(data1[,actualColumnName]!=data1[,predictedColumnName]),]
20 | incorrect2 = data2[which(data2[,actualColumnName]!=data2[,predictedColumnName]),]
21 | 
22 | print(nrow(incorrect1))
23 | print(nrow(incorrect2))
24 | 
25 | # Sample IDs are taken from the first column. With row.names=NULL that column is only
26 | # called "row.names" when the header is one field short; otherwise $row.names is NULL
27 | # and no sample would ever be reported.
28 | diff12 = setdiff(incorrect1[,1], incorrect2[,1])
29 | diff21 = setdiff(incorrect2[,1], incorrect1[,1])
30 | diffs = c(diff12, diff21)
31 | 
32 | print("Samples predicted inconsistently between two data sets:")
33 | # Join on the first column of both tables.
34 | data = merge(data1, data2, by=1)
35 | print(data[which(data[,1] %in% diffs),])
36 | 


--------------------------------------------------------------------------------
/Codes/LUSC_vs_LUAD.R:
--------------------------------------------------------------------------------
  1 | library(data.table)
  2 | library(stringr)
  3 | library(heatmap3)
  4 | library(caret)
  5 | library(pROC)
  6 | 
  7 | # Read an expression table into a numeric matrix.
  8 | #   filePath:     file to read with fread(); its first column supplies the row names
  9 | #                 and its last row is discarded (presumably a class-label row).
 10 | #   logTransform: when TRUE, values are returned as log2(value + 1).
 11 | # Returns a numeric matrix with one row per remaining input row.
 12 | readData = function(filePath, logTransform=FALSE)
 13 | {
 14 |   data = fread(filePath)
 15 | 
 16 |   data = data.frame(data[-nrow(data),])
 17 |   rownames(data) = data[,1]
 18 |   data = data[,-1,drop=FALSE]
 19 | 
 20 |   # A non-numeric entry in the discarded last row makes the whole column character (or
 21 |   # factor), and data.matrix() would then return factor level codes instead of the
 22 |   # values, so parse such columns from their text first.
 23 |   data[] = lapply(data, function(column) if (is.numeric(column)) column else as.numeric(as.character(column)))
 24 |   data = data.matrix(data)
 25 | 
 26 |   if (logTransform)
 27 |     data = log2(data + 1)
 28 | 
 29 |   return(data)
 30 | }
 21 | 
 22 | # Join two tables on their row names, keeping the row order unsorted, and return the
 23 | # joined columns (invisibly) with the shared row names restored.
 24 | mergeData = function(data1, data2)
 25 | {
 26 |   combined = merge(data1, data2, by=0, sort=FALSE)
 27 |   # merge() places the row-name key in the first column: move it back into the row names.
 28 |   rownames(combined) = combined[,1]
 29 |   invisible(combined[,-1])
 30 | }
 28 | 
 29 | # Cross-validate a random forest on the columns (samples) of data and write results.
 30 | #   data:      numeric matrix with features in rows and samples in columns.
 31 | #   outPrefix: prefix of the three output files (_Dimensions.txt, _Predictions.txt,
 32 | #              _FeatureImportance.txt).
 33 | #   numCores:  number of cores registered for parallel execution (default 12, the
 34 | #              value that used to be hard-coded).
 35 | # The class labels are taken from a variable named `classes` that is not defined in
 36 | # this function and must exist in an enclosing environment.
 37 | crossValidate = function(data, outPrefix, numCores=12)
 38 | {
 39 |   # Remove any genes with no variance
 40 |   data = data[which(apply(data, 1, var) > 0),]
 41 |   write.table(dim(data), paste(outPrefix, "_Dimensions.txt", sep=""))
 42 | 
 43 |   library(doParallel)
 44 |   registerDoParallel(cores=numCores)
 45 | 
 46 |   # From http://stackoverflow.com/questions/13403427/fully-reproducible-parallel-models-using-caret
 47 |   # One vector of 3 seeds (one per tuning value) for each of the 10 folds, plus a single
 48 |   # seed for the final model.
 49 |   set.seed(0)
 50 |   seeds <- vector(mode = "list", length = 11) # length is = (n_repeats*nresampling)+1
 51 |   for(i in 1:10) seeds[[i]] <- sample.int(n=1000, 3)
 52 |   seeds[[11]] <- sample.int(1000, 1) # for the last model
 53 | 
 54 |   # seeds is an argument of trainControl(), not of train(). It used to be passed to
 55 |   # train(), where it was not applied to the resampling, which would explain why
 56 |   # repeated runs gave different results.
 57 |   control = trainControl(method = "cv", number = 10, savePredictions = TRUE, classProbs = TRUE, seeds = seeds)
 58 |   model <- train(classes~., data=t(data), method = "rf", tuneLength = 3, trControl = control)
 59 | 
 60 |   tuneResults = model$results[order(model$results$Accuracy, decreasing=TRUE),]
 61 |   bestMtry = tuneResults[1,]$mtry
 62 | 
 63 |   # Select predictions that coincide with best mtry parameter
 64 |   predictions = model$pred[which(model$pred$mtry == bestMtry),]
 65 | 
 66 |   # Sort predictions by the original order
 67 |   predictions = predictions[order(predictions$rowIndex),]
 68 | 
 69 |   # Label each prediction with its sample name, turning "." back into "-"
 70 |   rownames(predictions) = gsub("\\.", "-", colnames(data))
 71 | 
 72 |   write.table(predictions, paste(outPrefix, "_Predictions.txt", sep=""), sep="\t", quote=F, row.names=T, col.names=T)
 73 | 
 74 |   featureImportance <- varImp(model, scale = TRUE)$importance
 75 |   featureImportance <- featureImportance[order(featureImportance$Overall, decreasing=TRUE),,drop=FALSE]
 76 |   write.table(featureImportance, paste(outPrefix, "_FeatureImportance.txt", sep=""), quote=FALSE, row.names=T, col.names=NA, sep="\t")
 77 | }
 64 | 
 65 | identifyDiffExpressedGenes = function(data1, data2, n)
 66 | {
 67 |   data1Mean = apply(data1, 1, mean)
 68 |   data2Mean = apply(data2, 1, mean)
 69 |   ratios = (data1Mean + 1) / (data2Mean + 1)
 70 |   ratios = sort(ratios, decreasing=TRUE)
 71 |   genesToPlot = c(names(head(ratios, n=n)), names(tail(ratios, n=n)))
 72 | 
 73 |   return(genesToPlot)
 74 | }
 75 | 
# ---------------------------------------------------------------------------
# Main analysis: compare LUAD vs. LUSC classification using two versions of
# the expression data (files prefixed "12_" are loaded into the tcga*
# variables, files prefixed "20_" into the rsubread* variables).
# Only the "12_" files are log-transformed on load.
# NOTE(review): presumably the "20_" files are already on a log scale -- confirm.
# ---------------------------------------------------------------------------
tcgaLuad = readData("12_LUAD_t.txt", logTransform=TRUE)
tcgaLusc = readData("12_LUSC_t.txt", logTransform=TRUE)
rsubreadLuad = readData("20_LUAD_t.txt")
rsubreadLusc = readData("20_LUSC_t.txt")

# Extract gene symbols from row names
# (keeps only the text before the first "|" in each row name)
rownames(tcgaLuad) = sapply(rownames(tcgaLuad), function(x) { str_split(x, "\\|")[[1]][1] })
rownames(tcgaLusc) = sapply(rownames(tcgaLusc), function(x) { str_split(x, "\\|")[[1]][1] })

# Find genes that are common across both data sets
# commonGenes:         present in all four tables
# nonOverlappingGenes: present in both rsubread tables but not in both tcga tables
commonTcgaGenes = intersect(rownames(tcgaLuad), rownames(tcgaLusc))
commonRsubreadGenes = intersect(rownames(rsubreadLuad), rownames(rsubreadLusc))
commonGenes = intersect(commonTcgaGenes, commonRsubreadGenes)
nonOverlappingGenes = setdiff(commonRsubreadGenes, commonTcgaGenes)

# Find samples that are common across both data sets
commonLuadSamples = intersect(colnames(tcgaLuad), colnames(rsubreadLuad))
commonLuscSamples = intersect(colnames(tcgaLusc), colnames(rsubreadLusc))

# Select common genes, samples of interest
# Each source keeps its own gene list (not commonGenes) so the "AllGenes"
# runs below can use every gene available for that source.
tcgaLuad = tcgaLuad[commonTcgaGenes,commonLuadSamples]
tcgaLusc = tcgaLusc[commonTcgaGenes,commonLuscSamples]
rsubreadLuad = rsubreadLuad[commonRsubreadGenes,commonLuadSamples]
rsubreadLusc = rsubreadLusc[commonRsubreadGenes,commonLuscSamples]

# Class labels, ordered LUAD first then LUSC to match the column order that
# mergeData() produces below. `classes` is read as a global by crossValidate().
classesLuad = rep("LUAD", ncol(tcgaLuad))
classesLusc = rep("LUSC", ncol(tcgaLusc))
classes = as.factor(c(classesLuad, classesLusc))

tcga = mergeData(tcgaLuad, tcgaLusc)
rsubread = mergeData(rsubreadLuad, rsubreadLusc)

# Remove any genes with no variance
tcga = tcga[which(apply(tcga, 1, var) > 0),]
rsubread = rsubread[which(apply(rsubread, 1, var) > 0),]

# Cross-validate on five feature sets; each call writes its own output files.
# NOTE(review): the gene lists used for subsetting were computed before the
# zero-variance filter above, so they may name rows that no longer exist;
# crossValidate() applies its own variance filter to the subset it receives.
crossValidate(tcga, "TCGA_AllGenes")
crossValidate(rsubread, "RSubread_AllGenes")
crossValidate(tcga[commonGenes,], "TCGA_CommonGenes")
crossValidate(rsubread[commonGenes,], "RSubread_CommonGenes")
crossValidate(rsubread[nonOverlappingGenes,], "RSubread_NonOverlappingGenes")

# Identify top differentially expressed genes
# (these two results are not referenced again in this script)
tcgaDiffExpressedGenes = identifyDiffExpressedGenes(tcgaLuad, tcgaLusc, 100)
rsubreadNonOverlappingDiffExpressedGenes = identifyDiffExpressedGenes(rsubreadLuad[nonOverlappingGenes,], rsubreadLusc[nonOverlappingGenes,], 100)

# Get potentially discordant samples
# IDs in the file use "-"; they are rewritten with "." before being matched
# against the column names, and only samples present in both sources are kept.
luscDiscordantSamples = scan("Potentially_Discordant_LUSC_Samples.txt", what=character(), quiet=TRUE)
luscDiscordantSamples = str_replace_all(luscDiscordantSamples, "\\-", ".")
luscDiscordantSamples = intersect(luscDiscordantSamples, commonLuscSamples)

# Rebuild the merged tables with columns ordered
# LUAD, concordant LUSC, discordant LUSC.
tcgaLuscDiscordant = tcgaLusc[,luscDiscordantSamples]
tcgaLusc = tcgaLusc[,setdiff(colnames(tcgaLusc), luscDiscordantSamples)]
tcga = mergeData(tcgaLuad, tcgaLusc)
tcga = mergeData(tcga, tcgaLuscDiscordant)

rsubreadLuscDiscordant = rsubreadLusc[,luscDiscordantSamples]
rsubreadLusc = rsubreadLusc[,setdiff(colnames(rsubreadLusc), luscDiscordantSamples)]
rsubread = mergeData(rsubreadLuad, rsubreadLusc)
rsubread = mergeData(rsubread, rsubreadLuscDiscordant)

# The gene list is fixed by hand; the computed alternative is kept for reference.
#discordantDiffExpressedGenes = identifyDiffExpressedGenes(rsubreadLuad[nonOverlappingGenes,], rsubreadLuscDiscordant[nonOverlappingGenes,], 5)
discordantDiffExpressedGenes = c("MIR320A", "MIR1234", "MIR4461", "MIR186")

# Restore "-" in sample IDs before writing the expression values of the selected genes
colnames(rsubread) = str_replace_all(colnames(rsubread), "\\.", "-")
write.table(rsubread[discordantDiffExpressedGenes,], "RSubread_Discordant_DiffExpressedGenes_Data.txt", sep="\t", quote=F, col.names=NA, row.names=T)

# Write a two-column (sample ID, class) table in the same column order as `rsubread`.
# This overwrites the factor `classes` defined earlier.
classes = c(classesLuad, rep("LUSC", ncol(rsubreadLusc)), rep("Discordant LUSC", ncol(rsubreadLuscDiscordant)))
classes = cbind(colnames(rsubread), classes)
write.table(classes, "RSubread_Discordant_Classes.txt", sep="\t", quote=F, col.names=F, row.names=F)
146 | 


--------------------------------------------------------------------------------
/Codes/ParseCgHubQueryResults.py:
--------------------------------------------------------------------------------
 1 | import os, sys, glob
 2 | 
 3 | inFilePath = sys.argv[1]
 4 | sampleFilePath = sys.argv[2]
 5 | outDownloadSamplesDirPath = sys.argv[3]
 6 | outCancerTypesDirPath = sys.argv[4]
 7 | 
 8 | def parseTagValue(lines, key):
 9 |     for line in lines:
10 |         line = line.strip()
11 | 
12 |         if line.startswith("<%s>" % key):
13 |             return line.replace("/", "").replace("<%s>" % key, "")
14 | 
15 |     return None
16 | 
def saveOutput(outLines):
    """Write the analysis ID and cancer type of one query result to disk.

    Reads the module-level settings sampleFilePath, samplesToKeep,
    outDownloadSamplesDirPath and outCancerTypesDirPath.

    Args:
        outLines: the lines belonging to a single result entry.

    Returns:
        The legacy sample ID when output was written, otherwise None (the
        entry has no sample ID, is filtered out by the sample list, or has no
        analysis ID).

    Exits the program with status 1 when the entry has an analysis ID but no
    cancer type; the analysis-ID file has already been written at that point.
    """
    legacyID = parseTagValue(outLines, "legacy_sample_id")

    # Without a sample ID there is no valid output file name; previously this
    # produced a file literally called "None" when no filter was in use.
    if legacyID is None:
        return None

    if sampleFilePath != "" and legacyID not in samplesToKeep:
        return None

    analysisID = parseTagValue(outLines, "analysis_id")
    if analysisID is None:
        return None

    outFilePath = "%s/%s" % (outDownloadSamplesDirPath, legacyID)
    if os.path.exists(outFilePath):
        # Informational only: the existing file is overwritten below
        print("%s already exists" % outFilePath)
    with open(outFilePath, 'w') as outFile:
        outFile.write("%s\n" % analysisID)

    cancerType = parseTagValue(outLines, "disease_abbr")
    if cancerType is None:
        print("Cancer type was not specified for %s." % analysisID)
        sys.exit(1)
    with open("%s/%s" % (outCancerTypesDirPath, legacyID), 'w') as outFile:
        outFile.write("%s\n" % cancerType)

    return legacyID
42 | 
# Load the whole query-results file; the handle is closed as soon as the
# lines are in memory.
with open(inFilePath) as inFile:
    inFileLines = inFile.readlines()

# The first two lines are header lines and the last line is a footer; none of
# them belong to a result entry.
headerLine1 = inFileLines.pop(0)
headerLine2 = inFileLines.pop(0)

# Some files carry two additional lines describing the query
if "Query" in inFileLines[0]:
    inFileLines.pop(0)
    inFileLines.pop(0)

footerLine = inFileLines.pop()

# Optional filter: only samples listed in sampleFilePath are saved
samplesToKeep = set()
if sampleFilePath != "":
    with open(sampleFilePath) as sampleFile:
        samplesToKeep = set(line.rstrip() for line in sampleFile)

samplesSaved = set()

outLines = []

for line in inFileLines:
    # A "<Result" line starts a new entry, so flush the one collected so far
    if "<Result" in line:
        if outLines:
            sampleSaved = saveOutput(outLines)
            if sampleSaved is not None and sampleSaved not in samplesSaved:
                samplesSaved.add(sampleSaved)

                # Report progress only when the count actually reaches a new
                # multiple of 100 (not on every entry while it stays at 0/100/...)
                if len(samplesSaved) % 100 == 0:
                    print("Done processing %i samples" % len(samplesSaved))

        outLines = []

    outLines.append(line)

# Flush the final entry, which is not followed by another "<Result" line
sampleSaved = saveOutput(outLines)
if sampleSaved is not None:
    samplesSaved.add(sampleSaved)
print("Done processing %i samples" % len(samplesSaved))
78 | 


--------------------------------------------------------------------------------
/Codes/ParseSampleTypes.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 | 
# Print the set of distinct two-character codes found at character positions
# 13-14 (0-based) of every line in the input file.
# NOTE(review): presumably each line is a TCGA barcode and these characters
# are its sample-type code -- confirm against the input file.
inFilePath = sys.argv[1]

# open()/with closes the handle deterministically and, unlike the file()
# builtin, also exists in Python 3.
with open(inFilePath) as inFile:
    sampleTypes = set(line.rstrip()[13:15] for line in inFile)

print(sampleTypes)
9 | 


--------------------------------------------------------------------------------
/Codes/PeekMatrix.py:
--------------------------------------------------------------------------------
 1 | import os, sys, glob
 2 | 
 3 | inFilePath = sys.argv[1]
 4 | numRows = int(sys.argv[2])
 5 | numCols = int(sys.argv[3])
 6 | 
 7 | lineCount = 0
 8 | for line in file(inFilePath):
 9 |     lineItems = line.rstrip().split("\t")
10 | 
11 |     if lineCount <  numRows:
12 |         print lineItems[:numCols]
13 |         lineCount += 1
14 |     else:
15 |         break
16 | 


--------------------------------------------------------------------------------
/Codes/PlotDiscordant.R:
--------------------------------------------------------------------------------
 1 | plotHist = function(gene, data, classes, outFilePath)
 2 | {
 3 | #  xlimMin = min(c(min(data1[gene,]), min(data2[gene,]), min(data3[gene,])))
 4 | #  xlimMax = max(c(min(data1[gene,]), max(data2[gene,]), max(data3[gene,])))
 5 | #  xlim = c(xlimMin, xlimMax)
 6 | 
 7 |   xlim = c(min(data), max(data))
 8 |   xlab = paste(gene, " expression levels", sep="")
 9 | 
10 |   pdf(outFilePath)
11 |   par(mfrow=c(3,1),lwd=2)
12 |   hist(as.numeric(data[gene,which(classes=="LUAD")]), main="LUAD", breaks=50, xlab=xlab, xlim=xlim, cex.axis=2,cex.lab=2)
13 |   hist(as.numeric(data[gene,which(classes=="LUSC")]), main="LUSC", breaks=50, xlab=xlab, xlim=xlim,cex.axis=2,cex.lab=2)
14 |   hist(as.numeric(data[gene,which(classes=="Discordant LUSC")]), main="LUSC (potentially discordant)", breaks=12, xlab=xlab, xlim=xlim,cex.axis=2,cex.lab=2)
15 |   graphics.off()
16 | }
17 | 
18 | library(data.table)
19 | library(heatmap3)
20 | library(RColorBrewer)
21 | 
# Load the expression values: the first column holds the gene names and
# becomes the row names; the remaining columns become a numeric matrix.
data = as.data.frame(fread("RSubread_Discordant_DiffExpressedGenes_Data.txt"))
rownames(data) = data[,1]
data = data.matrix(data[,-1])

# Class label per sample; the first file column (sample ID) becomes the row
# names, so classes[,1] is the label column.
# NOTE(review): labels are matched to the columns of `data` by position only --
# both files are assumed to list samples in the same order.
classes = read.table("RSubread_Discordant_Classes.txt", sep="\t", stringsAsFactors=F, header=F, row.names=1)

# Drop genes with zero variance.
# NOTE(review): if exactly one gene survives, this indexing returns a plain
# vector and the nrow()/rownames() calls below no longer apply.
data = data[which(apply(data, 1, var) > 0),]

# One colour per class, in the order LUAD, LUSC, Discordant LUSC
accent = brewer.pal(8, "Accent")
set3 = brewer.pal(12, "Set3")
cols = c(accent[1], set3[12], accent[7])

# Translate each sample's class label into its colour for the heatmap side bar
ColSideColors = as.character(classes[,1])
ColSideColors[ColSideColors=="LUAD"] = cols[1]
ColSideColors[ColSideColors=="LUSC"] = cols[2]
ColSideColors[ColSideColors=="Discordant LUSC"] = cols[3]
par(lwd=4)
# Per-gene histograms are only produced for small gene sets (at most 10 genes)
if (nrow(data) <= 10)
  for (gene in rownames(data))
    plotHist(gene, data, classes[,1], paste("RSubread_", gene, "_Histogram.pdf", sep=""))

# Heatmap of all remaining genes; sample labels are blanked, and gene labels
# are blanked too when there are more than 20 genes.
pdf("RSubread_Discordant_Heatmap.pdf")
colnames(data) = rep("", ncol(data))
if (nrow(data) > 20)
  rownames(data) = rep("", nrow(data))
# Colv=NA keeps the samples in file order; only the genes (rows) are clustered
heatmap3(data, Colv=NA, Rowv=TRUE, showRowDendro=T, showColDendro=F, cexRow=3, margins=c(5, 12), ColSideColors=ColSideColors, ColSideLabs="", cex=1.5)
legend("top", legend=c("LUAD", "LUSC", "Discordant LUSC"), col=cols, cex=1.1, lty=1, lwd=4, inset=-0.07, xpd=TRUE, box.lwd=0, box.lty=0, horiz=F)
graphics.off()
50 | 


--------------------------------------------------------------------------------
/Codes/PrintMatrixDimensions.py:
--------------------------------------------------------------------------------
 1 | import os, sys, glob
 2 | import utilities
 3 | 
 4 | inFilePath = sys.argv[1]
 5 | 
 6 | inFile = open(inFilePath)
 7 | numCols = len(inFile.readline().rstrip().split("\t"))
 8 | numRows = 1
 9 | for line in inFile:
10 |     numRows += 1
11 | inFile.close()
12 | 
13 | print "Number Rows: %i" % numRows
14 | print "Number Columns: %i" % numCols
15 | 


--------------------------------------------------------------------------------
/Codes/ProcessClinicalData.R:
--------------------------------------------------------------------------------
 1 | if (!require("plyr")) {
 2 |    install.packages("plyr", dependencies = TRUE)
 3 |    library(plyr)
 4 |    }
 5 | 
 6 | data==identifiers=tmp_data=tmp_identifier=NULL
 7 | dirname='.'
 8 | setwd(dirname)#Set the directory where the clinical data is located for each cancer in separate folder
 9 | filenames<-system("ls */nationwidechildrens.org_clinical_patient*", intern=T)
10 | for(i in 1:length(filenames)){#####iterating through each of the clinical files to create new matrix files with ALL clinical variables
11 |   print(i)
12 |   f<-(read.delim(paste(c(dirname,filenames[i]), collapse=''))) ###reading in the filess one at a time
13 |   tmp_data<-f[3:nrow(f),]
14 |   tmp_identifier<-f[1:2,]
15 |   if(i==1){
16 |     data<-tmp_data
17 |     identifier<-tmp_identifier
18 | }else{
19 |     identifier<-list(identifier,tmp_identifier)
20 |     identifier<-rbind.fill.matrix(identifier)
21 |     for(j in 1:ncol(identifier)){
22 |       if(!is.na(identifier[3,j])){
23 |         identifier[1,j]<-identifier[3,j]
24 |         identifier[2,j]<-identifier[4,j]
25 |       }
26 |     }
27 |     identifier<-identifier[1:2,]
28 |     data<-list(data,tmp_data)
29 |     data<-rbind.fill.matrix(data)
30 |     #data<-merge(data,f)
31 |   }
32 | }
33 | rownames(data)<-data[,2]
34 | 
35 | #Now, converting short TCGA ids reported in clinical data to long TCGA ids reported in RNA-Seq dataset using R codes
36 | 
37 | sample_names<-rownames(as.matrix(read.table("PANCAN24_CancerType_Samples.txt", row.names=1, sep='\t', check.names = F))) #getting the long TCGA IDs used in RNA-Seq dataset
38 | partial_sample_names<-rownames(data)
39 | counter=0##to check how many replacement has been done
40 | for (j in 1:length(partial_sample_names)){
41 |   if(!is.na(pmatch(partial_sample_names[j],sample_names))){
42 |     partial_sample_names[j]<-sample_names[pmatch(partial_sample_names[j],sample_names, duplicates.ok=F)]  
43 |     counter=counter+1
44 |   }
45 | }
46 | 
47 | rownames(data)<-partial_sample_names
48 | clinical_data<-matrix(NA, nrow=9264,ncol=548) ###instantiating an NA matrix
49 | rownames(clinical_data)<-sample_names
50 | colnames(clinical_data)<-colnames(data)
51 | for(i in 1:length(rownames(clinical_data))){
52 |   sample_id<-rownames(clinical_data)[i]
53 |   if(sample_id%in%rownames(data)){
54 |     clinical_data[sample_id,]<-data[sample_id,]      
55 |   }
56 | }
57 | clinical_data_identifier<-cbind(t(identifier),t(clinical_data))
58 | write.table(clinical_data_identifier,file="TCGA_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt", sep='\t',col.names=NA, quote=F)
59 | 
60 | 


--------------------------------------------------------------------------------
/Codes/ProcessRnaSeqFeatureCounts.R:
--------------------------------------------------------------------------------
 1 | library(Rsubread)
 2 | library(limma)
 3 | library(edgeR)
 4 | library(tools)
# Align one RNA-Seq sample with Rsubread, count reads per gene with
# featureCounts, and write raw counts, FPKM, TPM and the featureCounts
# summary statistics to the given output paths.
options(digits=2)

# Positional arguments are read from the FULL argument vector, so the indices
# depend on how R was started.
# NOTE(review): index 7 being the first user argument matches an invocation
# such as `Rscript --vanilla <script> <args...>` -- confirm against the caller.
referenceGenomeFastaFilePath = commandArgs()[7]
inFilePath1 = commandArgs()[8]
inFilePath2 = commandArgs()[9] # NULL for single-end analyses or when a BAM file has been specified
gtfFilePath = commandArgs()[10]
tempFilePrefix = commandArgs()[11]
outFpkmFilePath = commandArgs()[12]
outTpmFilePath = commandArgs()[13]
outCountsFilePath = commandArgs()[14]
outStatsFilePath = commandArgs()[15]

# memory is passed to buildindex(); nthreads to align() and featureCounts()
memory = 4000
nthreads = 1

# The input format is inferred from the file extension; anything that is not
# .bam, .fastq or .fq is treated as gzipped FASTQ.
input_format = "gzFASTQ"
if (file_ext(inFilePath1) == "bam")
  input_format = "BAM"
if (file_ext(inFilePath1) %in% c("fastq", "fq"))
  input_format = "FASTQ"

# Temporary alignment output; deleted at the end of the script
outBamFilePath = paste(tempFilePrefix, "bam", sep=".")

# The index is stored next to the reference FASTA and reused by later runs
referenceGenomeIndexFilePrefix = paste(referenceGenomeFastaFilePath, "__reference_index", sep="")

# Build the index only if its ".reads" file does not exist yet
if (!file.exists(paste(referenceGenomeIndexFilePrefix, ".reads", sep="")))
  buildindex(basename=referenceGenomeIndexFilePrefix, reference=referenceGenomeFastaFilePath, memory=memory)

# The literal string "NULL" on the command line means "no second read file"
if (inFilePath2 == "NULL")
  inFilePath2 = NULL

# Skip alignment when the temporary BAM is already present
if (!file.exists(outBamFilePath))
  align(index=referenceGenomeIndexFilePrefix, readfile1=inFilePath1, readfile2=inFilePath2, output_file=outBamFilePath, nthreads=nthreads, input_format=input_format, tieBreakHamming=TRUE, unique=TRUE, indels=5)

# Count reads per gene; paired-end mode is enabled whenever a second read file was given
fCountsList = featureCounts(outBamFilePath, annot.ext=gtfFilePath, isGTFAnnotationFile=TRUE, nthreads=nthreads, isPairedEnd=!is.null(inFilePath2))
dgeList = DGEList(counts=fCountsList$counts, genes=fCountsList$annotation)
fpkm = rpkm(dgeList, dgeList$genes$Length)
# TPM = FPKM rescaled to sum to one million, computed in log space
tpm = exp(log(fpkm) - log(sum(fpkm)) + log(1e6))

write.table(fCountsList$stat, outStatsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)

# Every expression file is written as two columns without a header:
# the first annotation column (gene identifier) and the value.
featureCounts = cbind(fCountsList$annotation[,1], fCountsList$counts)
write.table(featureCounts, outCountsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)

write.table(cbind(fCountsList$annotation[,1], fpkm), outFpkmFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
#write.table(cbind(fCountsList$annotation[,1], log2(fpkm + 1)), outFpkmLogFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
write.table(cbind(fCountsList$annotation[,1], tpm), outTpmFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
#write.table(cbind(fCountsList$annotation[,1], log2(tpm + 1)), outTpmLogFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)

# Remove the temporary BAM and its companion ".indel" file
unlink(outBamFilePath)
unlink(paste(outBamFilePath, ".indel", sep=""))
56 | 


--------------------------------------------------------------------------------
/Codes/Split.py:
--------------------------------------------------------------------------------
 1 | import os, sys, glob
 2 | 
 3 | inFilePath = sys.argv[1]
 4 | outDirPath = sys.argv[2]
 5 | 
 6 | inFile = open(inFilePath)
 7 | sampleIDs = inFile.readline().rstrip().split("\t")[1:]
 8 | 
 9 | lineCount = 0
10 | 
11 | for line in inFile:
12 |     lineItems = line.rstrip().split("\t")
13 |     gene = lineItems.pop(0)
14 | 
15 |     for sampleID in sampleIDs:
16 |         outFile = open(outDirPath + "/" + sampleID, 'a')
17 |         outFile.write("%s\t%s\n" % (gene, lineItems.pop(0)))
18 |         outFile.close()
19 | 
20 |     lineCount += 1
21 |     if lineCount % 1000 == 0:
22 |         print lineCount
23 | 
24 | inFile.close()
25 | 


--------------------------------------------------------------------------------
/Codes/TransposeData.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | import utilities
 3 | 
 4 | inFilePath = sys.argv[1]
 5 | outFilePath = sys.argv[2]
 6 | 
 7 | data = utilities.readMatrixFromFile(inFilePath)
 8 | 
 9 | if len(data) > 1 and len(data[0]) == len(data[1]) - 1:
10 |     data[0].insert(0, " ")
11 | 
12 | utilities.writeMatrixToFile(utilities.transposeMatrix(data), outFilePath)
13 | 


--------------------------------------------------------------------------------
/Codes/biological_rep.R:
--------------------------------------------------------------------------------
  1 | find_biological_replicate<-function(matrix){
  2 |   s=NULL
  3 |   samples=colnames(matrix)
  4 |   for(i in 1:ncol(matrix)){
  5 |     s[i]=paste(strsplit(samples,'-')[[i]][1:3],sep='',collapse = '-')
  6 |   }
  7 |   sum(duplicated(s))
  8 |   s=s[duplicated(s)]
  9 |   dupsamples=NULL
 10 |   counter=0
 11 |   for(i in 1:length(samples)){ 
 12 |     tmp=paste(strsplit(samples[i],'-')[[1]][1:3],sep='',collapse="-")
 13 |     print(tmp)
 14 |     if(tmp%in%s){
 15 |       print("biological replicate found!!")
 16 |       print(rownames(samples))[i]
 17 |       dupsamples=c(dupsamples,samples[i])
 18 |       counter=counter+1
 19 |     }
 20 |   }
 21 | 
 22 |   print(paste(counter,"samples are duplicated for biological replicates"))
 23 |   return (matrix[,colnames(matrix)%in%dupsamples])
 24 |   
 25 | }
 26 | 
 27 | 
 28 | 
 29 | library(data.table)
 30 | 
 31 | samples<-read.table("~/Downloads/GSE62944_TCGA_20_CancerType_Samples.txt",row.names=1)
 32 | # tcga20<-data.frame(fread("~/Desktop/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_FeatureCounts.txt"),row.names=1,check.names = F)
 33 | # dim(tcga20)
 34 | # filt_20<-find_biological_replicate(rownames(samples),tcga20)
 35 | # dim(filt_20)
 36 | # tcga20_zero<-apply(tcga20==0,2,sum)
 37 | tcga24_tpm<-data.frame(fread("~/Desktop/PANCAN24/PANCAN24.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_TPM.txt"),row.names=1,check.names = F)
 38 | dim(tcga20_tpm)
 39 | 
 40 | normal_tpm<-data.frame(fread("~/Desktop/PANCAN24/TCGA24.IlluminaHiSeq_RNASeqV2.normal_Rsubread_TPM.txt"),row.names=1,check.names = F)
 41 | dim(normal_tpm)
 42 | colnames(normal_tpm)
 43 | s=NULL
 44 | samples=colnames(normal_tpm)
 45 | for(i in 1:ncol(normal_tpm)){
 46 |   s[i]=paste(strsplit(samples,'-')[[i]][1:3],sep='',collapse = '-')
 47 | }
 48 | tumor_s=NULL
 49 | samples=colnames(tcga24_tpm)
 50 | for(i in 1:ncol(tcga24_tpm)){
 51 |   tumor_s[i]=paste(strsplit(samples,'-')[[i]][1:3],sep='',collapse = '-')
 52 | }
 53 | 
 54 | 
 55 | 
 56 | tcga20_tpm<-data.frame(fread("~/Desktop/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_TPM_10_9.txt"),row.names=1,check.names = F)
 57 | dim(tcga20_tpm)
 58 | filt_20_tpm<-find_biological_replicate(tcga20_tpm)
 59 | dim(filt_20_tpm)
 60 | tcga20_zero<-apply(tcga20_tpm==0,2,sum)
 61 | 
 62 | 
 63 | rsem<-data.frame(fread("~/Desktop/PANCAN12.IlluminaHiSeq_RNASeqV2.geneExp.tumor_whitelist"),row.names=1,check.names = F)
 64 | filt_12<-find_biological_replicate(rsem)
 65 | dim(filt_12)##16 samples with 2 replicates
 66 | rsem_zero<-apply(rsem==0,2,sum)
 67 | filt_12<-filt_12[30:nrow(filt_12),]
 68 | biological_rep_12<-subset(filt_12,select=colnames(filt_12)%in%colnames(filt_20_tpm))
 69 | rownames_biological_rep_12<-gsub("[|].*","",rownames(biological_rep_12))
 70 | 
 71 | biological_rep_12_o<-biological_rep_12[rownames_biological_rep_12%in%rownames(biological_rep_20),order(colnames(biological_rep_12))]
 72 | dim(biological_rep_12_o)
 73 | 
 74 | #biological_rep_20<-subset(filt_20,select=colnames(filt_20)%in%colnames(filt_12))
 75 | #biological_rep_20_o<-biological_rep_20[,order(colnames(biological_rep_20))]
 76 | biological_rep_20<-subset(filt_20_tpm,select=colnames(filt_20_tpm)%in%colnames(filt_12))
 77 | biological_rep_20_o<-biological_rep_20[rownames(biological_rep_20)%in%rownames_biological_rep_12,order(colnames(biological_rep_20))]
 78 | dim(biological_rep_20_o)
 79 | 
 80 | 
 81 | total_20<-log2(apply(biological_rep_20_o,2,sum))
 82 | total_12<-log2(apply(biological_rep_12_o,2,sum))
 83 | 
 84 | 
 85 | #plot(total_12,ylim=c(22.75,26.5),main="PANCAN12 Level 3",ylab="log2(Gene Counts)")
 86 | cor_res_12=cor_res_20=NULL
 87 | pdf("~/Dropbox/Bioinformatics submission/Resubmission/scatter_plot.pdf")
 88 | 
 89 | cor_res_12=cor_res_20=NULL
 90 | for(i in 1:13){
 91 |   #points((i*2-1):(i*2),total_12[(i*2-1):(i*2)],col=i,lwd = 4,pch=i)
 92 |   plot(log2(biological_rep_12_o[,(i*2-1)]+1),log2(biological_rep_12_o[,(i*2)]+1),xlim=c(0,20),ylim=c(0,20),xlab=paste(colnames(biological_rep_12_o)[(i*2-1)],"log2(Normalized gene counts)",sep='\n'),ylab=paste(colnames(biological_rep_12_o)[(i*2)],"log2(Normalized gene counts)",sep=' '))
 93 |   c=cor.test(biological_rep_12_o[,(i*2-1)],biological_rep_12_o[,(i*2)])#,method="spearman")
 94 |   cor_res_12=rbind(cor_res_12,c(colnames(biological_rep_12_o)[(i*2-1)],round(total_12[(i*2-1)],digits = 3),colnames(biological_rep_12_o)[(i*2)],round(total_12[(i*2)],digits = 3),round(c$estimate,digits = 3)))
 95 |   #title(paste(paste(strsplit(colnames(biological_rep_12_o)[(i*2)],"-")[[1]][1:3],sep="",collapse = "-")," \nrho=",round(c$estimate,digits = 3),sep=""))
 96 |   title(paste("TCGA Level 3 \nPearson's correlation=",round(c$estimate,digits = 2),sep=""))
 97 |   print(i)
 98 |   print(colnames(biological_rep_12_o)[i])
 99 |   plot(log2(biological_rep_20_o[,(i*2-1)]+1),log2(biological_rep_20_o[,(i*2)]+1),xlim=c(0,20),ylim=c(0,20),xlab=paste(colnames(biological_rep_20_o)[(i*2-1)],"log2(TPM)",sep='\n'),ylab=paste(colnames(biological_rep_20_o)[(i*2)],"log2(TPM)",sep=''))
100 |   c<-cor.test(biological_rep_20_o[,(i*2-1)],biological_rep_20_o[,(i*2)])#,method="spearman")
101 |   cor_res_20=rbind(cor_res_20,c(colnames(biological_rep_20_o)[(i*2-1)],round(total_20[(i*2-1)],digits = 3),colnames(biological_rep_20_o)[(i*2)],round(total_20[(i*2)],digits=3),round(c$estimate,digits = 3)))
102 |   #title(paste(paste(strsplit(colnames(biological_rep_20_o)[(i*2)],"-")[[1]][1:3],sep="",collapse = "-")," \nrho=",round(c$estimate,digits = 3),sep=""))
103 |   title(paste("Rsubread TPM \nPearson's correlation=",round(c$estimate,digits = 2),sep=""))
104 |   
105 | }
106 | colnames(cor_res_12)=c("Replicate_1","log2 Level 3 gene counts","Replicate_2","log2 Level 3 gene counts","Pearson's correlation between replicates(Level 3)")
107 | colnames(cor_res_20)=c("Replicate_1","log2 Rsubread gene counts","Replicate_2","log2 Rsubread gene counts","Pearson's correlation between replicates(Rsubread)")
108 | 
109 | par( mfrow = c(2, 1 ) ,lwd=4)
110 | hist(as.numeric(cor_res_12[,5]),main = "TCGA Level 3 Two Replicates\n Each for 13 Samples",xlab = "Pearson's Correlation ", xlim=c(0.88,1),breaks = 5)
111 | abline(v=mean(as.numeric(cor_res_12[,5])),col="red")
112 | abline(v=median(as.numeric(cor_res_12[,5])),col="blue")
113 | hist(as.numeric(cor_res_20[,5]),main = "Rsubread Replicates Two Replicates\n Each for 13 Samples",xlab = "Pearson's Correlation ", xlim=c(0.88,1),breaks = 5)
114 | abline(v=mean(as.numeric(cor_res_20[,5])),col="red")
115 | abline(v=median(as.numeric(cor_res_20[,5])),col="blue")
116 | 
117 | write.table(cbind(cor_res_12,cor_res_20),"~/Desktop/correlations.txt",sep='\t',col.names=NA,quote=1)
118 | #************************************************************************
119 | 
120 | 
121 | 
122 | ####
123 | ecdf_all_ex<-apply(log2(biological_rep_12_o[,c("TCGA-50-5066-01A-01R-1628-07","TCGA-50-5066-02A-11R-2090-07")]+1),2,ecdf)
124 | plot(ecdf_all_ex[[1]],xlab="log2 Level 3 reads", ylab = NA,xlim=c(0,20),col="blue",main="TCGA Level 3",ylim=c(0,1),cex.axis=1.5, cex.lab=1.5)
125 | lines(ecdf_all_ex[[2]],xlab=NA, ylab = NA,col="brown")
126 | 
127 | ###using Rsubread pipeline aligned data
128 | # ecdf_all<-apply(rsub_fpkmlog,2,ecdf)
129 | # plot(ecdf_all[[1]],col="blue",main="Rsubread FPKM",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion")
130 | # for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")}
131 | # for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")}
132 | 
133 | ecdf_all_ex<-apply(log2(biological_rep_20_o[,c("TCGA-50-5066-01A-01R-1628-07","TCGA-50-5066-02A-11R-2090-07")]+1),2,ecdf)
134 | plot(ecdf_all_ex[[1]],xlab="log2TPM reads", ylab = NA,xlim=c(0,20),col="blue",main="Rsubread",ylim=c(0,1),cex.axis=1.5, cex.lab=1.5,)
135 | lines(ecdf_all_ex[[2]],xlab=NA, ylab = NA,col="brown")
136 | 
137 | 
138 | 
139 | 
140 | ############zero
141 | setwd("~/Dropbox/TCGA_RNASeq_Clinical/Analysis_datasets/")
142 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) 
143 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) 
144 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) 
145 | rsub_tpm<-log2(read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F)+1) 
146 | TCGA_her2_filtered<-TCGA_her2[!duplicated(TCGA_her2$Gene),]
147 | rownames(TCGA_her2_filtered)<-TCGA_her2_filtered$Gene
148 | TCGA_her2<-subset(TCGA_her2_filtered,select=-Gene)
149 | TCGA_her2_log2<-log2(subset(TCGA_her2_filtered,select=-Gene)+1)
150 | 
151 | com_genes_TCGA<-TCGA_her2[rownames(TCGA_her2)%in%rownames(rsub_tpm),]
152 | com_genes_TCGA<-com_genes_TCGA[order(rownames(com_genes_TCGA)),]
153 | com_genes_tpm<-rsub_tpm[rownames(rsub_tpm)%in%rownames(com_genes_TCGA),]
154 | com_genes_tpm<-com_genes_tpm[order(rownames(com_genes_tpm)),]
155 | zero_genes_rsem<-com_genes_TCGA[apply(com_genes_TCGA[,1:12]==0,1,mean)!=0,1:12]#atleast one zero in 12 GFP replicates
156 | sum_zero_genes_rsem<-mean(apply(zero_genes_rsem==0,1,sum))##average of how many replicates have same zero expression
157 | 
158 | 
159 | nrow(zero_genes_rsem)
160 | nrow(zero_genes_rsem)/nrow(com_genes_TCGA)
161 | mean(apply(zero_genes_rsem,1,mean))#228.859 if TCGA counts are used
162 | zero_genes_f<-com_genes_tpm[apply(com_genes_tpm[,1:12]==0,1,mean)!=0,1:12]##at least one zero in 12 GFP replicates
163 | sum_zero_genes_feature<-mean(apply(zero_genes_f==0,1,sum))##average of how many replicates have same zero expression
164 | 
165 | nrow(zero_genes_f)
166 | nrow(zero_genes_f)/nrow(com_genes_tpm)
167 | mean(apply(zero_genes_f,1,mean))#0.55 if Rsubread counts are used.
168 | par( mfrow = c(1, 2 ) ,lwd=4)
169 | hist(apply(zero_genes_rsem[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngene counts per sample",main="TCGA Level 3",breaks=12)
170 | abline(v=median(apply(zero_genes_rsem[,1:12]==0,2,sum)),col="red",lty=2)
171 | 
172 | hist(apply(zero_genes_f[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngene counts per sample",main="Rsubread TPM",breaks=2)
173 | abline(v=median(apply(zero_genes_f[,1:12]==0,2,sum)),col="red",lty=2)
174 | pro_t<-zero_sum/nrow(rsem_f)
175 | prop<-cbind(pro_t,pro_r)
176 | colnames(prop)<-c("TCGA Level 3","Rsubread TPM")
177 | write.table(prop,"~/Dropbox/Bioinformatics submission/Resubmission/zero_prop.txt",sep='\t',col.names = NA,quote=F)
178 | 
# Count genes that are zero in all 12 leading columns of one table while being
# expressed (within a given range) in all 12 leading columns of the other.
# Everything runs inside local() so no helper names leak into the workspace.
local({
  cols <- 1:12
  # TRUE for rows whose logical flags are TRUE in every column.
  every_sample <- function(flags) apply(flags, 1, mean) == 1
  # Print "<label> <number of rows of tab selected by keep>".
  report <- function(label, tab, keep) print(paste(label, nrow(tab[keep, ]), sep = " "))

  tpm <- com_genes_tpm[, cols]
  tcga <- com_genes_TCGA[, cols]

  # Zero everywhere in com_genes_TCGA, binned by the com_genes_tpm values.
  tcga_zero <- every_sample(tcga == 0)
  report("Total number of nonzero rsubread but zero expressing TCGA genes:", com_genes_tpm,
         tcga_zero & every_sample(tpm > 0))
  report("Total number of 1-100 reads in rsubread but zero expressing TCGA genes:", com_genes_tpm,
         tcga_zero & every_sample(tpm > 0) & every_sample(tpm <= 100))
  report("Total number of 101-1000 rsubreads but zero expressing TCGA genes:", com_genes_tpm,
         tcga_zero & every_sample(tpm > 100) & every_sample(tpm <= 1000))
  report("Total number of 1001-10000 rsubreads but zero expressing TCGA genes:", com_genes_tpm,
         tcga_zero & every_sample(tpm > 1000) & every_sample(tpm <= 10000))
  report("Total number of 10000+ rsubreads but zero expressing TCGA genes:", com_genes_tpm,
         tcga_zero & every_sample(tpm > 10000))

  # Zero everywhere in com_genes_tpm, binned by the com_genes_TCGA values.
  tpm_zero <- every_sample(tpm == 0)
  report("Total number of nonzero TCGA reads but zero expressing Rsubread genes:", com_genes_TCGA,
         tpm_zero & every_sample(tcga > 0))
  report("Total number of 1-100 reads in TCGA but zero expressing Rsubread genes:", com_genes_TCGA,
         tpm_zero & every_sample(tcga > 0) & every_sample(tcga <= 100))
  report("Total number of 101-1000 TCGA but zero expressing Rsubread genes:", com_genes_TCGA,
         tpm_zero & every_sample(tcga > 100) & every_sample(tcga <= 1000))
  report("Total number of 1001-10000 TCGA but zero expressing Rsubread genes:", com_genes_TCGA,
         tpm_zero & every_sample(tcga > 1000) & every_sample(tcga <= 10000))
  report("Total number of 10000+ TCGA but zero expressing Rsubread genes:", com_genes_TCGA,
         tpm_zero & every_sample(tcga > 10000))

  invisible(NULL)
})
190 | 
191 | 
192 | 
193 | 
194 | 
195 | # feature_f<-feature[rownames(feature)%in%rownames(rsem_her2_expected_counts),]
196 | # rsem_f<-rsem_her2_expected_counts[rownames(rsem_her2_expected_counts)%in%rownames(feature),]
197 | # zero_sum_r<-apply(feature_f==0,2,sum)
198 | # pro_r<-zero_sum_r/nrow(feature_f)
199 | # pro_r<-pro_r[order(names(pro_r))]
200 | # zero_sum_tcga<-apply(rsem_f==0,2,sum)/nrow(rsem_f)
201 | # print(paste("Total number of zero expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]==0,1,mean)==1),]),sep=" "))
202 | # print(paste("Total number of zero expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]==0,1,mean)==1),]),sep=" "))
203 | # # print(paste("Total number of 1-100 expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]<100&feature_f[,1:17]<0,1,mean)==1),]),sep=" "))
204 | # # print(paste("Total number of 1-100 expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]<100&rsem_f[,1:17]!=0,1,mean)==1),]),sep=" "))
205 | # # print(paste("Total number of 1-100 expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]<100&feature_f[,1:17]<0,1,mean)==1),]),sep=" "))
206 | # # print(paste("Total number of 1-100 expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]<100&rsem_f[,1:17]!=0,1,mean)==1),]),sep=" "))
207 | # 
208 | # dim(rsem_f)
209 | # dim(feature_f)
210 | # rsem_f_o<-rsem_f[order(rownames(rsem_f)),]
211 | # feature_f_o<-feature_f[order(rownames(feature_f)),]
212 | # head(rownames(rsem_f_o))
213 | # head(rownames(feature_f_o))
214 | # zero_genes_rsem<-rsem_f_o[apply(rsem_f_o[,1:17]==0,1,mean)==1&apply(feature_f_o[,1:17]==0,1,mean)==1,]#common zero expressing genes
215 | # nrow(zero_genes_rsem)
216 | # nrow(feature_f_o[apply(feature_f_o[,1:17]==0,1,mean)==1&apply(rsem_f_o[,1:17]==0,1,mean)==1,])
217 | # zero_genes_f<-feature_f_o[apply(rsem_f_o[,1:17]==0,1,mean)!=1&apply(feature_f_o[,1:17]==0,1,mean)==1,]##gene that are zero expressing in feature counts but nonzero in TCGA
218 | # nrow(zero_genes_f)
219 | # zero_genes_r<-rsem_f_o[apply(rsem_f_o[,1:17]==0,1,mean)==1&apply(feature_f_o[,1:17]==0,1,mean)!=1,]##gene that are zero expressing in Level 3 but nonzero in feature
220 | # nrow(zero_genes_r)
221 | # -------
222 | # zero_genes_rsem<-rsem_f_o[apply(rsem_f_o[,1:12]==0,1,mean)!=0,1:12]#atleast one zero in 12 GFP replicates
223 | # nrow(zero_genes_rsem)
224 | # nrow(zero_genes_rsem)/nrow(rsem_f_o)
225 | # mean(apply(zero_genes_rsem,1,mean))
226 | # zero_genes_f<-feature_f_o[apply(feature_f_o[,1:12]==0,1,mean)!=0,1:12]##at least one zero in 12 GFP replicates
227 | # nrow(zero_genes_f)
228 | # nrow(zero_genes_f)/nrow(feature_f_o)
229 | # mean(apply(zero_genes_f,1,mean))
230 | 
# Same cross-tabulation as for the TPM tables, here on feature_f_o versus
# rsem_f_o: genes zero in all 12 leading columns of one table and expressed
# (within a range) in all 12 leading columns of the other.
# Wrapped in local() so the helpers stay out of the workspace.
local({
  cols <- 1:12
  # TRUE for rows whose logical flags are TRUE in every column.
  in_all_columns <- function(flags) apply(flags, 1, mean) == 1
  # Print "<label> <number of rows of tab selected by keep>".
  say <- function(label, tab, keep) print(paste(label, nrow(tab[keep, ]), sep = " "))

  rsub <- feature_f_o[, cols]
  tcga <- rsem_f_o[, cols]

  # Zero everywhere in rsem_f_o, binned by feature_f_o values.
  tcga_silent <- in_all_columns(tcga == 0)
  say("Total number of nonzero rsubread but zero expressing TCGA genes:", feature_f_o,
      tcga_silent & in_all_columns(rsub > 0))
  say("Total number of 1-100 reads in rsubread but zero expressing TCGA genes:", feature_f_o,
      tcga_silent & in_all_columns(rsub > 0) & in_all_columns(rsub <= 100))
  say("Total number of 101-1000 rsubreads but zero expressing TCGA genes:", feature_f_o,
      tcga_silent & in_all_columns(rsub > 100) & in_all_columns(rsub <= 1000))
  say("Total number of 1001-10000 rsubreads but zero expressing TCGA genes:", feature_f_o,
      tcga_silent & in_all_columns(rsub > 1000) & in_all_columns(rsub <= 10000))
  say("Total number of 10000+ rsubreads but zero expressing TCGA genes:", feature_f_o,
      tcga_silent & in_all_columns(rsub > 10000))

  # Zero everywhere in feature_f_o, binned by rsem_f_o values.
  rsub_silent <- in_all_columns(rsub == 0)
  say("Total number of nonzero TCGA reads but zero expressing Rsubread genes:", rsem_f_o,
      rsub_silent & in_all_columns(tcga > 0))
  say("Total number of 1-100 reads in TCGA but zero expressing Rsubread genes:", rsem_f_o,
      rsub_silent & in_all_columns(tcga > 0) & in_all_columns(tcga <= 100))
  say("Total number of 101-1000 TCGA but zero expressing Rsubread genes:", rsem_f_o,
      rsub_silent & in_all_columns(tcga > 100) & in_all_columns(tcga <= 1000))
  say("Total number of 1001-10000 TCGA but zero expressing Rsubread genes:", rsem_f_o,
      rsub_silent & in_all_columns(tcga > 1000) & in_all_columns(tcga <= 10000))
  say("Total number of 10000+ TCGA but zero expressing Rsubread genes:", rsem_f_o,
      rsub_silent & in_all_columns(tcga > 10000))

  invisible(NULL)
})
242 | 
243 | 
244 | 
245 | 
246 | 
247 | #interesting_genes<-feature_f[rownames(feature_f)%in%rownames(zero_genes),]
248 | #zero_genes_feature<-interesting_genes[apply(interesting_genes[,1:17]==0,1,mean)==1,]#
# Two stacked panels (2 rows x 1 column) drawn with line width 4.
par( mfrow = c(2, 1 ) ,lwd=4)
# Top panel: per-sample number of zero entries in the first 12 columns of
# zero_genes_rsem, with a red dashed line at the median count.
hist(apply(zero_genes_rsem[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngenes counts per sample",main="TCGA",breaks=12)
abline(v=median(apply(zero_genes_rsem[,1:12]==0,2,sum)),col="red",lty=2)

# Bottom panel: the same summary for zero_genes_f. Note the different
# `breaks` value (2 here versus 12 above) on otherwise identical axes.
hist(apply(zero_genes_f[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngenes counts per sample",main="Rsubread",breaks=2)
abline(v=median(apply(zero_genes_f[,1:12]==0,2,sum)),col="red",lty=2)
# Proportion table: zero_sum scaled by the number of rows of rsem_f, next to
# pro_r. NOTE(review): zero_sum and pro_r are only assigned in code outside
# this excerpt (nearby assignments are commented out) -- confirm they exist.
pro_t<-zero_sum/nrow(rsem_f)
prop<-cbind(pro_t,pro_r)
colnames(prop)<-c("TCGA","Rsubread")
# Tab-separated export; col.names = NA writes a blank header cell above the
# row names. This overwrites the zero_prop.txt written earlier in the script.
write.table(prop,"~/Dropbox/Bioinformatics submission/Resubmission/zero_prop.txt",sep='\t',col.names = NA,quote=F)
259 | #########LUSC but LUAD-like analysis##
# Load the two LUAD/LUSC classification result tables (row names = sample IDs).
# The column labels assigned below treat class_12 as the TCGA results and
# class_20 as the Rsubread results.
# NOTE(review): these two files are read from ~/Desktop while the
# LUSC_but_LUAD_like file below is read from ~/Dropbox -- confirm the paths.
class_12<-read.table("~/Desktop/TCGA_RNASeq_Clinical/Analysis_datasets/Classification_12_LUAD_LUSC_Predictions.txt", header=1,row.names=1)
class_20<-read.table("~/Desktop/TCGA_RNASeq_Clinical/Analysis_datasets/Classification_20_LUAD_LUSC_Predictions.txt", header=1,row.names=1)
# Samples whose second column differs from the first (predicted vs. actual class).
mismatch12<-class_12[class_12[,1]!=class_12[,2],]
mismatch20<-class_20[class_20[,1]!=class_20[,2],]
# Samples misclassified in either table.
mismatch_ids<-union(rownames(mismatch12),rownames(mismatch20))
# Join the class_12 rows with the class_20 rows of those samples.
# (Previously class_12 was merged with itself, so the ".Rsubread" columns were
# just a copy of the ".TCGA" columns.)
mismatches_all<-merge(class_12[rownames(class_12)%in%mismatch_ids,],class_20[rownames(class_20)%in%mismatch_ids,],by=0)
# Shorten the sample IDs ("01A-...-07" -> "01") and use them as row names,
# then drop the Row.names column that merge(by=0) created.
rownames(mismatches_all)<-gsub("01A-.*-07","01",mismatches_all$Row.names)
mismatches_all<-mismatches_all[,2:ncol(mismatches_all)]
# Assumes each classification table has exactly four columns -- TODO confirm.
colnames(mismatches_all)<-c("ActualClass.TCGA","PredictedClass.TCGA","LUAD_Probability.TCGA", "LUSC_Probability.TCGA", "ActualClass.Rsubread","PredictedClass.Rsubread",  
                           "LUAD_Probability.Rsubread","LUSC_Probability.Rsubread")
lusc_but_luad<-read.table("~/Dropbox/TCGA_RNASeq_Clinical/Analysis_datasets/LUSC_but_LUAD_like.txt",sep='\t', header=1)
# Keep every LUSC-but-LUAD-like sample, attaching mismatch details where present.
discord<-merge(mismatches_all,lusc_but_luad,by.x=0,by.y=1,all.y=T)
# The Row.names column no longer exists at this point, so the comparisons below
# use the (already shortened) row names; `mismatches_all$Row.names` was NULL
# and made the first and third lookups match nothing and the second match
# every sample.
mismatches_all[rownames(mismatches_all)%in%lusc_but_luad$sample,]#misclassified samples that are also listed as LUSC but LUAD-like
lusc_but_luad[!lusc_but_luad$sample%in%rownames(mismatches_all),]
lusc_but_luad[lusc_but_luad$sample%in%rownames(mismatches_all),]
lusc_but_luad[lusc_but_luad$sample%in%gsub("01A-.*-07","01",rownames(mismatch20)),]
275 | 
276 | 


--------------------------------------------------------------------------------
/Codes/numZero.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ##Manually download Pancan12 RNA_Seq dataset from https://www.synapse.org/#!Synapse:syn1695324 and filtered for gene symbols.Additionally, download Rsubread TPM RNA_Seq data from GEO accession number GSM1536837.
 3 | pan12<-read.table("PANCAN12.IlluminaHiSeq_RNASeqV2.geneExp.tumor_whitelist", header=1,row.names=1)
 4 | pan20<-read.table("GSM1536837_TCGA_20.Illumina.tumor_Rsubread_TPM.txt",header=1,row.names=1)
 5 | 
 6 | pan12_f<-pan12[rownames(pan12)%in%rownames(pan20),colnames(pan12)%in%colnames(pan20)]
 7 | pan20_f<-pan20[rownames(pan20)%in%rownames(pan12),colnames(pan20)%in%colnames(pan12)]
 8 | 
 9 | 
10 | 
11 | write.table(apply((pan12_f==0),2,sum),"PANCAN12_19583_by_3380_numZeroes.txt",sep='\t',col.names=F,quote=F)
12 | write.table(apply((pan20_f==0),2,sum),"PANCAN20_19583_by_3380_numZeroes.txt",sep='\t',col.names=F,quote=F)
13 | 


--------------------------------------------------------------------------------
/Codes/utilities.py:
--------------------------------------------------------------------------------
  1 | import glob, os, posix, sys, math, collections, json, difflib
  2 | #import scipy
  3 | #from scipy.stats import *
  4 | from operator import itemgetter, attrgetter
  5 | import itertools
  6 | from random import uniform, sample
  7 | #import numpy
  8 | from collections import defaultdict
  9 | #from fisher import *
 10 | #from transcendental import stdtr
 11 | 
 12 | def printFlush(text, outFilePath=None):
 13 |     print text
 14 |     sys.stdout.flush()
 15 | 
 16 |     if outFilePath != None:
 17 |         outFile = open(outFilePath, 'a')
 18 |         outFile.write(text + "\n")
 19 |         outFile.close()
 20 | 
 21 | def printMatrix(data):
 22 |     for x in data:
 23 |         print x
 24 |     print ""
 25 | 
 26 | def smartDivide(numerator, denominator):
 27 |     if float(denominator) == 0.0:
 28 |         return float('nan')
 29 | 
 30 |     return float(numerator) / float(denominator)
 31 | 
 32 | def getProbes(probeTabFilePath):
 33 |     probes = []
 34 | 
 35 |     probeTabFile = open(probeTabFilePath)
 36 |     headerItems = [x.lower() for x in probeTabFile.readline().rstrip().split("\t")]
 37 | 
 38 |     for line in probeTabFile:
 39 |         lineItems = line.rstrip().split("\t")
 40 |         if headerItems.count("probe set name") > 0:
 41 |             probeset = lineItems[headerItems.index("probe set name")]
 42 |         else:
 43 |             if headerItems.count("probe set id") > 0:
 44 |                 probeset = lineItems[headerItems.index("probe set id")]
 45 |             else:
 46 |                 print "No probe set name or probe set id column in %s" % probeTabFilePath
 47 | 
 48 |         probeX = lineItems[headerItems.index("probe x")]
 49 |         probeY = lineItems[headerItems.index("probe y")]
 50 |         probe = probeset + "#" + probeX + "_" + probeY
 51 |         probes.append(probe)
 52 | 
 53 |     return probes
 54 | 
 55 | def getProbesetProbesDict(probes):
 56 |     probesetProbesDict = {}
 57 | 
 58 |     for probe in probes:
 59 |         probeset = probe[:probe.find("#")]
 60 |         probesetProbesDict[probeset] = probesetProbesDict.setdefault(probeset, []) + [probe]
 61 | 
 62 |     return probesetProbesDict
 63 | 
 64 | def getPatientIDs(normDirPath, normFileSuffix):
 65 |     ids = []
 66 | 
 67 |     #print normDirPath + "*" + normFileSuffix
 68 |     #sys.exit(0)
 69 |     for filePath in glob.glob(normDirPath + "*" + normFileSuffix):
 70 |         ids.append(filePath.replace(normDirPath, "").replace(normFileSuffix, ""))
 71 | 
 72 |     ids.sort()
 73 |     return ids
 74 | 
 75 | def readScalarFromFile(filePath):
 76 |     return readMatrixFromFile(filePath)[0][0]
 77 | 
 78 | def writeScalarToFile(x, filePath):
 79 |     outFile = open(filePath, 'w')
 80 |     outFile.write(x)
 81 |     outFile.close()
 82 | 
 83 | def readVectorFromFile(filePath):
 84 |     return [line.rstrip() for line in file(filePath)]
 85 | 
 86 | def writeVectorToFile(data, filePath):
 87 |     outFile = open(filePath, 'w')
 88 |     for x in data:
 89 |         outFile.write(str(x) + "\n")
 90 |     outFile.close()
 91 | 
 92 | def readMatrixFromFile(filePath, numLines=None):
 93 |     matrix = []
 94 |     for line in file(filePath):
 95 |         if numLines != None and len(matrix) >= numLines:
 96 |             break
 97 | 
 98 |         matrix.append(line.rstrip().split("\t"))
 99 | 
100 |         if len(matrix) % 100000 == 0:
101 |             print len(matrix)
102 | 
103 |     return matrix
104 | 
def writeMatrixToFile(x, filePath, writeMode='w'):
    """Write the rows of ``x`` to ``filePath`` as tab-delimited lines.

    Args:
        x: Iterable of rows, passed through to writeMatrixToOpenFile.
        filePath: Destination path.
        writeMode: Mode passed to ``open`` ('w' to replace, 'a' to append).
    """
    # The context manager closes the file even if writing a row fails.
    with open(filePath, writeMode) as outFile:
        writeMatrixToOpenFile(x, outFile)
109 | 
def writeMatrixToOpenFile(x, outFile):
    """Write each row of ``x`` to the open file as one tab-joined, newline-ended line."""
    for row in x:
        cells = map(str, row)
        outFile.write("\t".join(cells) + "\n")
113 | 
def appendMatrixToFile(x, filePath):
    """Append the rows of ``x`` to ``filePath`` by calling writeMatrixToFile in append mode."""
    writeMatrixToFile(x, filePath, 'a')
116 | 
def readTextFromFile(filePath):
    """Return the entire content of ``filePath`` as one string."""
    # One read() replaces per-line string concatenation, and the context
    # manager closes the handle.
    with open(filePath) as inFile:
        return inFile.read()
124 | 
def writeDictToFile(dictionary, filePath):
    """Serialize ``dictionary`` to JSON and store it at ``filePath`` via writeScalarToFile."""
    serialized = json.dumps(dictionary)
    writeScalarToFile(serialized, filePath)
127 | 
def readDictFromFile(filePath):
    """Load a JSON object from ``filePath``, restoring integer keys.

    JSON object keys are always strings, so keys that look like integers
    (optional leading "-" followed by digits) are converted back to ``int``.
    Any other key -- including decimal-looking keys such as "1.5", which used
    to make this function raise -- is kept as a string.

    Returns:
        A new dict with the converted keys and the original values.
    """
    with open(filePath) as inFile:
        dictionary = json.load(inFile)

    dictionary2 = {}

    for key, value in dictionary.items():
        unsigned = key[1:] if key.startswith("-") else key

        if unsigned.isdigit():
            try:
                key = int(key)
            except ValueError:
                # Digit-like characters that int() cannot parse: keep the text.
                pass

        dictionary2[key] = value

    return dictionary2
143 | 
def calculateMean(values):
    """Return the arithmetic mean of ``values`` as a float (NaN when empty)."""
    if len(values) == 0:
        return float('nan')

    # Convert before dividing: on Python 2, int / int truncates, so the mean
    # of [1, 2] came out as 1.
    return float(sum(values)) / len(values)
149 | 
def calculateVarianceMean(values):
    """Return the mean squared deviation from the mean, divided by (n - 1)."""
    center = calculateMean(values)

    squaredDeviations = []
    for value in values:
        squaredDeviations.append((value - center) ** 2)

    return calculateMean(squaredDeviations) / (len(squaredDeviations) - 1)
154 | 
def calculateWeightedMean(values, weights):
    """Return the weighted mean ``sum(v * w) / sum(w)``.

    Args:
        values: Sequence of numbers.
        weights: Sequence of numeric weights, same length as ``values``.

    Returns:
        The weighted mean as a float, or NaN when the weights sum to zero
        (which includes empty input).

    Raises:
        ValueError: If ``values`` and ``weights`` differ in length.
    """
    if len(values) != len(weights):
        raise ValueError("When calculating a weighted mean, the values must be the same length as the weights.")

    totalWeight = float(sum(weights))
    if totalWeight == 0.0:
        return float('nan')

    return sum(v * w for v, w in zip(values, weights)) / totalWeight
159 | 
def calculateStandardDeviation(values):
    """Return the sample standard deviation (n - 1 denominator) of ``values``.

    Returns NaN when fewer than two values are given, since the sample
    standard deviation is undefined there.
    """
    count = len(values)
    if count < 2:
        return float('nan')

    # The mean is computed here in floating point so the result does not
    # depend on integer-division behaviour (Python 2 truncates int / int).
    xbar = float(sum(values)) / count
    sumSquaredResiduals = sum((x - xbar) ** 2 for x in values)

    return math.sqrt(sumSquaredResiduals / (count - 1))
165 | 
def calculateZscore(x):
    """Return each element of ``x`` minus the mean, divided by the standard deviation."""
    center = calculateMean(x)
    spread = calculateStandardDeviation(x)

    zScores = []
    for y in x:
        zScores.append((y - center) / spread)

    return zScores
170 | 
def calculateTrimmedMean(values, trimProportion=0.10):
    """Return the mean of ``values`` after discarding extremes at both ends.

    Args:
        values: Sequence of numbers (or numeric strings); may be None.
        trimProportion: Fraction of the sorted values to drop from EACH end.
            Only applied for six or more values.

    Returns:
        None for None/empty input. With fewer than three values, the plain
        mean. With three to five values, the mean after dropping only the
        single smallest and largest value. Otherwise the mean after dropping
        ``int(trimProportion * n)`` values from each end.

    Raises:
        ValueError: If ``trimProportion`` would remove every value.
    """
    if values is None or len(values) == 0:
        return None

    values = sorted(float(x) for x in values)
    count = len(values)

    if count >= 6:
        # Pure-Python slice of the sorted list; the scipy call used before
        # relied on an import that is commented out in this file.
        cut = int(trimProportion * count)
        if 2 * cut >= count:
            raise ValueError("Proportion too big.")
        values = values[cut:count - cut]
    elif count >= 3:
        values = values[1:-1]

    # All entries are floats, so this is a true division.
    return sum(values) / len(values)
189 | 
def calculateEuclideanDistance(xList, yList):
    """Return the Euclidean distance between two equal-length coordinate lists.

    Iterates over the indices of ``xList`` and reads the same positions from
    ``yList``.
    """
    squaredGaps = (math.pow(xList[i] - yList[i], 2) for i in range(len(xList)))
    return math.sqrt(sum(squaredGaps, 0.0))
200 | 
def calculateCorrelationCoefficient(xList, yList):
    """Return the off-diagonal entry of ``numpy.corrcoef(xList, yList)``."""
    # Imported locally because the module-level numpy import is commented out.
    import numpy

    return numpy.corrcoef(xList, yList)[0,1]
203 | 
def calculatePearsonCoefficient(xList, yList):
    """Return the first element (the coefficient) of ``scipy.stats.pearsonr``."""
    # Imported locally: no name `stats` is defined at module level.
    from scipy import stats

    return stats.pearsonr(xList, yList)[0]
206 | 
def calculateSpearmanCoefficient(xList, yList):
    """Return the first element (the coefficient) of ``scipy.stats.spearmanr``."""
    # Imported locally: no name `stats` is defined at module level.
    from scipy import stats

    return stats.spearmanr(xList, yList)[0]
209 | 
def calculateTTest(xList, yList):
    """Return the two-sided p-value of a t-test between two samples.

    NaN entries are removed from both samples first. If exactly one sample
    is left with a single value while the other has several, a one-sample
    t-test of the larger sample against that value is used; otherwise an
    independent two-sample t-test is run.
    """
    # Imported locally: the module-level numpy/scipy imports are commented out.
    import numpy
    from scipy import stats

    xList = numpy.array([x for x in xList if not math.isnan(x)])
    yList = numpy.array([y for y in yList if not math.isnan(y)])

    # The one-sample tests are done inline so this function does not depend
    # on any other helper having scipy available.
    if len(xList) == 1 and len(yList) > 1:
        return stats.ttest_1samp(yList, xList[0])[1]
    if len(xList) > 1 and len(yList) == 1:
        return stats.ttest_1samp(xList, yList[0])[1]

    return stats.ttest_ind(xList, yList, axis=0)[1]
220 | 
221 | # From http://stackoverflow.com/questions/10038543/tracking-down-the-assumptions-made-by-scipys-ttest-ind-function
def calculateWelchTTest(pop1, pop2):
    """Return the two-sided p-value of Welch's unequal-variance t-test.

    Delegates to ``scipy.stats.ttest_ind(..., equal_var=False)``. The previous
    hand-written formula used ``numpy.var`` with its default ``ddof=0``
    (population variance), whereas Welch's statistic is defined with sample
    variances, so p-values can differ slightly from the old ones.
    """
    # Imported locally: the module-level scipy import is commented out.
    from scipy import stats

    return stats.ttest_ind(pop1, pop2, equal_var=False)[1]
233 | 
def calculateOneSampleTTest(x, yList):
    """Return the p-value of a one-sample t-test of ``yList`` against the value ``x``."""
    # Imported locally: no name `stats` is defined at module level.
    from scipy import stats

    return stats.ttest_1samp(yList, x)[1]
236 | 
def isValueAberrant(x, yList, numStandardDeviations):
    """Tell whether ``x`` lies more than the given number of standard deviations from the mean of ``yList``."""
    spread = calculateStandardDeviation(yList)
    center = calculateMean(yList)
    halfWidth = float(numStandardDeviations) * spread

    tooLow = center - halfWidth
    tooHigh = center + halfWidth

    return x < tooLow or x > tooHigh
243 | 
def calculateMedian(values):
    """Return the median of ``values``.

    For an odd count the middle element itself is returned; for an even count
    the float average of the two middle elements. Empty input yields NaN.
    """
    sortedValues = sorted(values)
    count = len(sortedValues)

    if count == 0:
        return float('nan')

    # Floor division keeps the index an int on Python 3 as well.
    middle = count // 2

    if count % 2 == 1:
        return sortedValues[middle]

    return float(sortedValues[middle - 1] + sortedValues[middle]) / 2
253 | 
def calculateFoldChange(values1, values2):
    """Return the ratio of the two group means after shifting both groups.

    Both groups are shifted by the same amount so that the overall minimum
    becomes 1, which keeps both means positive. The ratio is mean(values1)
    over mean(values2), always computed in floating point.
    """
    overallMin = min(min(values1), min(values2))

    shifted1 = [x - overallMin + 1 for x in values1]
    shifted2 = [x - overallMin + 1 for x in values2]

    # float() avoids Python 2 integer truncation for integer inputs.
    mean1 = float(sum(shifted1)) / len(shifted1)
    mean2 = float(sum(shifted2)) / len(shifted2)

    return mean1 / mean2
264 | 
def calculateAbsoluteFoldChange(values1, values2):
    """Return the smaller of the two group-mean ratios after shifting both groups.

    Both groups are shifted by the same amount so that the overall minimum
    becomes 1. The result is min(mean1 / mean2, mean2 / mean1), so it does
    not depend on argument order and is at most 1.
    """
    overallMin = min(min(values1), min(values2))

    shifted1 = [x - overallMin + 1 for x in values1]
    shifted2 = [x - overallMin + 1 for x in values2]

    # float() avoids Python 2 integer truncation for integer inputs.
    mean1 = float(sum(shifted1)) / len(shifted1)
    mean2 = float(sum(shifted2)) / len(shifted2)

    return min(mean1 / mean2, mean2 / mean1)
278 | 
def getNormalizedProbes(normFilePath):
    """Return the first space-delimited field of every line in ``normFilePath``.

    Lines are not stripped, so a line containing no space is returned whole,
    including its newline.
    """
    print("Getting normalized probes")

    # open() replaces the Python-2-only file() builtin and is closed reliably.
    with open(normFilePath) as normFile:
        return [line.split(" ")[0] for line in normFile]
282 | 
def getKeyProbeDict(filePath, probesToKeep=None, minProbesPerKey=1):
    """Read a ``key<TAB>probe1,probe2,...`` file into a dict of probe lists.

    Args:
        filePath: Tab-delimited file; column 1 is the key, column 2 a
            comma-separated probe list. Lines with a single column are skipped.
            Probes of a key that appears on several lines are concatenated.
        probesToKeep: Optional iterable of probe names; when given, only these
            probes are retained. ``None`` (the default) keeps every probe.
            Previously the default crashed in ``set(None)`` and a supplied
            list was built into a set but never applied.
        minProbesPerKey: Keys ending up with fewer probes than this are
            omitted. Previously unused (the check was ``>= 0``).

    Returns:
        Dict mapping key -> list of probes in file order.
    """
    probesToKeepSet = None if probesToKeep is None else set(probesToKeep)
    keyProbeDict = {}

    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            if len(lineItems) < 2:
                continue

            fileProbes = [x for x in lineItems[1].split(",") if x != ""]
            if probesToKeepSet is not None:
                fileProbes = [x for x in fileProbes if x in probesToKeepSet]

            keyProbeDict.setdefault(lineItems[0], []).extend(fileProbes)

    return dict((key, probes) for key, probes in keyProbeDict.items() if len(probes) >= minProbesPerKey)
298 | 
def getTranscriptProbeDict(filePath, normFilePath):
    """Map each transcript to its probes that also occur in the normalized file.

    Args:
        filePath: Tab-delimited file; column 1 is the transcript, column 2 a
            comma-separated probe list.
        normFilePath: File passed to getNormalizedProbes to obtain the set of
            probes to intersect with.

    Returns:
        Dict mapping transcript -> list of shared probes (order unspecified,
        since the list comes from a set intersection). A transcript appearing
        on several lines keeps only its last line.
    """
    normalizedProbes = set(getNormalizedProbes(normFilePath))

    print("Getting transcript-probe dictionary")
    transcriptProbeDict = {}

    # open() replaces the Python-2-only file() builtin and is closed reliably.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            transcript = lineItems[0]
            probes = lineItems[1].split(",")

            transcriptProbeDict[transcript] = list(set(probes) & normalizedProbes)

    return transcriptProbeDict
313 | 
def getPatientsKeyValuesDict(sourceDir, patientIDs, fileSuffix, dataValueIndex, keys=None):
    """Collect one column of values per key from every patient's file.

    The first patient's file determines on which line each key (first
    tab-delimited column) lives. Every patient file is then read line by
    line and only those line positions are parsed, so all files are expected
    to share the first file's line layout.

    Args:
        sourceDir: Directory holding the patient files (normalized through
            checkDirPath).
        patientIDs: Patient IDs; file name is ``<id><fileSuffix>``.
        fileSuffix: Suffix appended to each patient ID.
        dataValueIndex: Column index of the value to extract.
        keys: Optional iterable of keys to extract; ``None`` extracts all keys
            found in the first file. Keys absent from that file are ignored.

    Returns:
        ``defaultdict(dict)`` mapping patientID -> {key: value string}, where
        the key is the first column of the line actually read.
    """
    patientsKeyValuesDict = collections.defaultdict(dict)

    if len(patientIDs) == 0:
        return patientsKeyValuesDict

    # Build every path the same way (the index pass used to skip checkDirPath).
    sourceDir = checkDirPath(sourceDir)

    # Line index of each key in the first patient's file (last one wins).
    keyLineIndicesDict = {}
    with open(sourceDir + patientIDs[0] + fileSuffix) as firstFile:
        for lineCount, line in enumerate(firstFile):
            keyLineIndicesDict[line.rstrip().split("\t")[0]] = lineCount

    if keys is None:
        wantedLineIndices = keyLineIndicesDict.values()
    else:
        wantedLineIndices = [keyLineIndicesDict[key] for key in keys if key in keyLineIndicesDict]

    # Unique and ascending: a repeated index would make the sequential reader
    # below consume the following line instead.
    wantedLineIndices = sorted(set(wantedLineIndices))

    # One file is open at a time; previously an extra handle per patient was
    # opened up front and never used or closed.
    for patientID in patientIDs:
        with open(sourceDir + patientID + fileSuffix) as patientFile:
            previousLineIndex = 0
            for lineIndex in wantedLineIndices:
                # Skip the lines between the previous hit and this one.
                for _ in range(previousLineIndex, lineIndex):
                    patientFile.readline()
                previousLineIndex = lineIndex + 1

                lineItems = patientFile.readline().rstrip().split("\t")
                patientsKeyValuesDict[patientID][lineItems[0]] = lineItems[dataValueIndex]

    return patientsKeyValuesDict
360 | 
def getPatientKeyValuesDict(filePath, dataColumnIndex, probes=None):
    """Read one value column of a tab-delimited file into a dict keyed by column 1.

    Args:
        filePath: Tab-delimited file; the first column is the probe/key.
        dataColumnIndex: Index of the column holding the value.
        probes: Optional iterable of keys; when non-empty, only these keys are
            returned.

    Returns:
        Dict mapping key -> value string.

    Raises:
        KeyError: If a requested probe is not present in the file.
    """
    probeValues = {}

    # open() replaces the Python-2-only file() builtin and is closed reliably.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            probeValues[lineItems[0]] = lineItems[dataColumnIndex]

    if not probes:
        return probeValues

    return dict((probe, probeValues[probe]) for probe in probes)
378 | 
def savePatientKeyValuesDict(patientDict, outFilePath):
    """Write ``patientDict`` as ``key<TAB>value`` lines, ordered by key."""
    outFile = open(outFilePath, 'w')
    outFile.writelines("%s\t%s\n" % (key, patientDict[key]) for key in sorted(patientDict.keys()))
    outFile.close()
386 | 
def checkDirPath(dirPath):
    """Ensure ``dirPath`` exists and return it with a trailing "/".

    Missing directories, including missing parents, are created.
    """
    if not os.path.exists(dirPath):
        try:
            # os.makedirs is portable (the posix module is Unix-only) and
            # also creates intermediate directories.
            os.makedirs(dirPath)
        except OSError:
            # Another process may have created it in the meantime.
            if not os.path.isdir(dirPath):
                raise

    if not dirPath.endswith("/"):
        dirPath = dirPath + "/"

    return dirPath
395 | 
def lastIndexOf(theList, value):
    """Return the index of the last occurrence of ``value`` (ValueError if absent)."""
    flipped = theList[::-1]
    offsetFromEnd = flipped.index(value)
    return len(theList) - 1 - offsetFromEnd
398 | 
def getTranscriptGeneDict(filePath):
    """Map transcript -> gene from a tab-delimited file.

    Column 1 is the transcript. The gene is taken from column 3 when a line
    has exactly three columns, otherwise from column 2. Later lines overwrite
    earlier ones for the same transcript.
    """
    transcriptGeneDict = {}

    # open() replaces the Python-2-only file() builtin and is closed reliably.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")

            gene = lineItems[1]
            if len(lineItems) == 3:
                gene = lineItems[2]

            transcriptGeneDict[lineItems[0]] = gene

    return transcriptGeneDict
413 | 
def getGeneTranscriptDict(filePath):
    """Map gene -> list of transcripts from a tab-delimited file.

    Column 1 is the transcript. The gene is taken from column 3 when a line
    has exactly three columns, otherwise from column 2. Transcripts are
    listed in file order.
    """
    geneTranscriptDict = {}

    # open() replaces the Python-2-only file() builtin and is closed reliably.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")

            gene = lineItems[1]
            if len(lineItems) == 3:
                gene = lineItems[2]

            # Append in place rather than rebuilding the list for each line.
            geneTranscriptDict.setdefault(gene, []).append(lineItems[0])

    return geneTranscriptDict
428 | 
def transposeMatrix(x):
    """Return the transpose of ``x`` as a list of lists.

    Rows are truncated to the shortest row, as with ``zip``.
    """
    # Building the list directly works whether zip() returns a list
    # (Python 2) or an iterator (Python 3, where len() on it fails).
    return [list(column) for column in zip(*x)]
436 | 
437 | # Copied from: http://code.activestate.com/recipes/491268-ordering-and-ranking-for-lists/
def order(x, NoneIsLast = True, decreasing = False):
    """
    Returns the ordering of the elements of x. The list
    [ x[j] for j in order(x) ] is a sorted version of x.

    Missing values in x are indicated by None. If NoneIsLast is true,
    then missing values are ordered to be at the end.
    Otherwise, they are ordered at the beginning.
    If NoneIsLast is None, missing values are left out of the result.
    """
    omitNone = False
    if NoneIsLast is None:
        NoneIsLast = True
        omitNone = True

    n  = len(x)
    # A real list is required: range objects cannot be sorted in place on Python 3.
    ix = list(range(n))
    if None not in x:
        ix.sort(reverse = decreasing, key = lambda j : x[j])
    else:
        # Sort on (flag, value) so None entries group at the chosen end and
        # are never compared against real values.
        def key(i, x = x):
            elem = x[i]
            # Valid values are True or False only.
            if decreasing == NoneIsLast:
                return not(elem is None), elem
            else:
                return elem is None, elem
        ix.sort(key=key, reverse=decreasing)

    if omitNone:
        # None entries sit at the end here; drop that tail.
        n = len(x)
        for i in range(n-1, -1, -1):
            if x[ix[i]] is None:
                n -= 1
        return ix[:n]
    return ix
475 | 
476 | # Copied from: http://code.activestate.com/recipes/491268-ordering-and-ranking-for-lists/
def rankSmart(x, NoneIsLast=True, decreasing = False, ties = "first"):
    """
    Returns the ranking of the elements of x. The position of the first
    element in the original vector is rank[0] in the sorted vector.

    Missing values are indicated by None.  Calls the order() function.
    Ties are NOT averaged by default. Choices are:
                 "first" "average" "min" "max" "random"
    Any other value behaves like "first". If NoneIsLast is None, the ranks
    of missing values are left out of the result.
    """
    omitNone = False
    if NoneIsLast is None:
        NoneIsLast = True
        omitNone = True
    O = order(x, NoneIsLast = NoneIsLast, decreasing = decreasing)
    R = O[:]
    n = len(O)
    for i in range(n):
        R[O[i]] = i
    if ties == "first" or ties not in ["first", "average", "min", "max", "random"]:
        return R

    # Collect runs of sorted positions whose values are equal.
    blocks     = []
    newblock   = []
    for i in range(1,n) :
        if x[O[i]] == x[O[i-1]]:
            if i-1 not in newblock:
                newblock.append(i-1)
            newblock.append(i)
        else:
            if len(newblock) > 0:
                blocks.append(newblock)
                newblock = []
    if len(newblock) > 0:
        blocks.append(newblock)

    for block in blocks:
        # Don't process blocks of None values.
        if x[O[block[0]]] is None:
            continue
        if ties == "average":
            s = 0.0
            for j in block:
                s += j
            s /= float(len(block))
            for j in block:
                R[O[j]] = s
        elif ties == "min":
            s = min(block)
            for j in block:
                R[O[j]] = s
        elif ties == "max":
            s = max(block)
            for j in block:
                R[O[j]] = s
        elif ties == "random":
            # Shuffle the tied RANKS among the tied elements. The earlier
            # code sampled element indices (O[i]) and stored those as ranks.
            shuffled = sample(block, len(block))
            for k, j in enumerate(block):
                R[O[j]] = shuffled[k]
    if omitNone:
        R = [ R[j] for j in range(n) if x[j] is not None]
    return R
542 | 
543 | # The following function came from http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python
def rank2(a):
    """Return 1-based ranks of ``a`` in which tied values share their average rank.

    Relies on ``rank_simple``, which is not defined in the visible part of
    this file -- presumably it returns the indices that sort ``a``; verify.
    """
    n = len(a)
    ivec=rank_simple(a)
    svec=[a[rank] for rank in ivec]
    sumranks = 0
    dupcount = 0
    newarray = [0]*n
    # range() instead of the Python-2-only xrange(); same iteration.
    for i in range(n):
        sumranks += i
        dupcount += 1
        # End of a run of equal values: give every member the run's mean rank.
        if i==n-1 or svec[i] != svec[i+1]:
            averank = sumranks / float(dupcount) + 1
            for j in range(i-dupcount+1,i+1):
                newarray[ivec[j]] = averank
            sumranks = 0
            dupcount = 0

    return newarray
562 | 
def globFilesSortedByModTime(pattern):
    """Return the paths matching ``pattern``, oldest modification time first."""
    matches = glob.glob(pattern)
    matches.sort(key=os.path.getmtime)
    return matches
568 | 
569 | ## From http://stackoverflow.com/questions/34518/natural-sorting-algorithm
def naturalSort(x, reverse=False):
    """Sort strings so that embedded digit runs compare numerically.

    For example "a2" sorts before "a10". Where a digit run meets a text run
    at the same position, the number sorts first.
    """
    def natural_key(s):
        # Tag every chunk so numbers and text are never compared directly
        # (a TypeError on Python 3); the 0/1 tags keep numbers ahead of text,
        # which is the order Python 2 produced for int-vs-str comparisons.
        return tuple(
            (0, int(''.join(chars))) if isdigit else (1, ''.join(chars))
            for isdigit, chars in itertools.groupby(s, lambda c: c.isdigit())
        )

    return sorted(x, key=natural_key, reverse=reverse)
578 | 
def getItemFrequencyMap(x):
    """Count how often each item occurs in ``x``; returns a ``defaultdict(int)``."""
    frequencies = defaultdict(int)
    # iter(x) makes Counter tally the elements even if x is itself a mapping.
    frequencies.update(collections.Counter(iter(x)))
    return frequencies
585 | 
586 | from math import modf, floor
587 | 
def quantile(x, q,  qtype = 7, issorted = False):
    """
    Args:
       x - input data
       q - quantile
       qtype - algorithm (1 through 9, numbered like the R quantile types)
       issorted- True if x already sorted.

    Compute quantiles from input array x given q. For median,
    specify q=0.5. Returns None for an invalid qtype.

    The result is clamped to the smallest/largest value when the computed
    position falls before the first or in/after the last data point.

    References:
       http://reference.wolfram.com/mathematica/ref/Quantile.html
       http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile

    Author:
    Ernesto P.Adorio Ph.D.
    UP Extension Program in Pampanga, Clark Field.
    """
    if not issorted:
        y = sorted(x)
    else:
        y = x
    if not (1 <= qtype <= 9):
       return None  # error!

    # Parameters for the Hyndman and Fan algorithm
    abcd = [(0,   0, 1, 0), # inverse empirical distrib.function., R type 1
            (0.5, 0, 1, 0), # similar to type 1, averaged, R type 2
            (0.5, 0, 0, 0), # nearest order statistic,(SAS) R type 3

            (0,   0, 0, 1), # California linear interpolation, R type 4
            (0.5, 0, 0, 1), # hydrologists method, R type 5
            (0,   1, 0, 1), # mean-based estimate(Weibull method), (SPSS,Minitab), type 6
            (1,  -1, 0, 1), # mode-based method,(S, S-Plus), R type 7
            (1.0/3, 1.0/3, 0, 1), # median-unbiased ,  R type 8
            (3/8.0, 0.25, 0, 1)   # normal-unbiased, R type 9.
           ]

    a, b, c, d = abcd[qtype-1]
    n = len(x)

    # Zero-based fractional position of the quantile in the sorted data.
    position = a + (n+b) * q - 1
    # floor() rather than modf(): modf(-0.5) yields an integer part of -0.0,
    # which slipped past the `j < 0` test and extrapolated below y[0].
    j = int(floor(position))
    g = position - j

    if j < 0:
        return y[0]
    elif j >= n - 1:
        # Includes j == n-1 with g > 0, which used to index y[n].
        return y[n-1]

    if g ==  0:
       return y[j]
    else:
       return y[j] + (y[j+1]- y[j])* (c + d * g)
640 | 
def calculateInterquartileRange(x):
    """Return the difference between the 0.75 and 0.25 quantiles of x.

    Both quantiles come from quantile() with its default settings.
    """
    lowerQuartile, upperQuartile = quantile(x, 0.25), quantile(x, 0.75)
    return upperQuartile - lowerQuartile
646 | 
def isNumeric(x):
    """Return True when str(x) looks like a plain decimal number.

    Accepted form: an optional leading "-", then digits with at most one
    decimal point ("12", "-3.5", "4.", ".5"). Exponent notation, a "+" sign
    and surrounding whitespace are not accepted.

    Previously every "." and "-" was stripped before the digit check, so
    strings such as "1.2.3", "1-2" or "5-" were reported as numeric.
    """
    text = str(x)

    # Only a single leading minus sign is a valid sign position.
    if text.startswith("-"):
        text = text[1:]

    if "-" in text or text.count(".") > 1:
        return False

    return text.replace(".", "", 1).isdigit()
649 | 
def getUniqueMatrixColumnValues(filePath, columnIndex):
    """Return the sorted distinct values found in one column of a tab-delimited file.

    Args:
        filePath: path of the tab-delimited text file to read.
        columnIndex: zero-based index of the column to collect.

    Returns:
        A sorted list of the unique values in that column.

    Raises:
        IndexError: if a line has fewer than columnIndex + 1 fields after
            trailing whitespace is stripped.
    """
    uniqueValues = set()

    # open() inside a with-block replaces the Python-2-only file() builtin
    # and guarantees the handle is closed, even when a line is malformed.
    with open(filePath) as matrixFile:
        for line in matrixFile:
            uniqueValues.add(line.rstrip().split("\t")[columnIndex])

    return sorted(uniqueValues)
657 | 
def fisherExactTest(x):
    """Pass x to FishersExactTest.probability_of_table and return its result."""
    tableProbability = FishersExactTest.probability_of_table(x)
    return tableProbability
660 | 
def complementGenomicSequence(sequence):
    """Return sequence with every base replaced by complementGenomicBase(base).

    The order of the bases is kept (this is the complement, not the
    reverse complement).
    """
    return "".join(complementGenomicBase(nucleotide) for nucleotide in sequence)
668 | 
# Complement of each IUPAC nucleotide code (upper case).
_GENOMIC_BASE_COMPLEMENTS = {
    "A": "T", "T": "A", "C": "G", "G": "C",
    # Two-base ambiguity codes
    "R": "Y", "Y": "R", "K": "M", "M": "K", "S": "S", "W": "W",
    # Three-base ambiguity codes
    "B": "V", "V": "B", "D": "H", "H": "D",
    # Any base
    "N": "N",
}

def complementGenomicBase(base):
    """Return the upper-case complement of a single nucleotide code.

    The input is upper-cased first. A/T and C/G are swapped and IUPAC
    ambiguity codes map to their complementary code. Any other character
    (for example a gap "-") is returned upper-cased and otherwise unchanged.

    Previously every base other than A, T and C, including N and gap
    characters, was silently turned into "C".
    """
    base = base.upper()
    return _GENOMIC_BASE_COMPLEMENTS.get(base, base)
679 | 
def reverseComplementGenomicSequence(dnaSequence):
    """Return the reverse complement of dnaSequence.

    The sequence is complemented with complementGenomicSequence and the
    result is then reversed with reverseString.
    """
    complemented = complementGenomicSequence(dnaSequence)
    return reverseString(complemented)
682 | 
def reverseString(string):
    """Return string with its elements in reverse order (via a -1 step slice)."""
    backwards = slice(None, None, -1)
    return string[backwards]
685 | 
def getDictValue(dictionary, key, default=""):
    """Return dictionary[key] when key is present, otherwise default.

    The default fallback is the empty string.
    """
    return dictionary[key] if key in dictionary else default
690 | 
def getDiffPositions(string1, string2):
    """Return running totals of the matching-block sizes between two strings.

    The blocks come from difflib.SequenceMatcher.get_matching_blocks().
    Blocks of size 0 and blocks whose size equals len(string1) are ignored;
    every other block appends its size added to the previous total.
    """
    fullLength = len(string1)
    runningTotals = []

    for matchingBlock in difflib.SequenceMatcher(a=string1, b=string2).get_matching_blocks():
        size = matchingBlock[2]
        if size in (0, fullLength):
            continue

        previousTotal = runningTotals[-1] if runningTotals else 0
        runningTotals.append(previousTotal + size)

    return runningTotals
706 | 
def getSimilarityPercent(string1, string2):
    """Return the share of string1 covered by matching blocks, as a percentage.

    The sizes of all blocks reported by difflib.SequenceMatcher are summed
    and divided by len(string1). An empty string1 raises ZeroDivisionError.
    """
    matcher = difflib.SequenceMatcher(None, a=string1, b=string2)
    matchedLength = float(sum(block[2] for block in matcher.get_matching_blocks()))

    return (matchedLength / float(len(string1))) * 100.0
715 | 
def getLineItems(line, separator="\t"):
    """Strip trailing whitespace from line and split it on separator (tab by default)."""
    trimmed = line.rstrip()
    return trimmed.split(separator)
718 | 
def sortMatrix(data, columnIndex, reverse=False):
    """Sort the rows of data in place by the value at columnIndex.

    The same (now sorted) list object is returned; reverse=True sorts in
    descending order.
    """
    columnKey = itemgetter(columnIndex)
    data.sort(key=columnKey, reverse=reverse)
    return data
722 | 
def uniqueSort(values):
    """Return the distinct items of values in order of first appearance.

    Membership is checked against the growing result list, which is slow
    for long inputs but also works for unhashable items.
    """
    firstSeen = []

    for candidate in values:
        if candidate in firstSeen:
            continue
        firstSeen.append(candidate)

    return firstSeen
732 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 mumtahena
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | This repository includes code for processing RNA-Seq FASTQ files and clinical data from The Cancer Genome Atlas. In addition, we have included the code used for analyzing data in our manuscript, "Alternative preprocessing of RNA-Sequencing data in The Cancer Genome Atlas leads to improved analysis results" (Rahman, Mumtahena, et al. _Bioinformatics_ [2015:10.1093/bioinformatics/btv377](http://bioinformatics.oxfordjournals.org/content/early/2015/08/14/bioinformatics.btv377.full)).
 2 | 
 3 | ## What is this repository for?
 4 | 
 5 | * We used the 'Rsubread' R package to align and summarize reads at the gene level for 9264 tumor and 741 normal TCGA RNA-Seq samples. The R scripts we provide here can also be used to process samples that did not come from TCGA. We have also included the code for compiling clinical data available for these tumors into a matrix format and matching the clinical IDs with the RNA-Seq IDs.
 6 | * We have provided the code and various intermediate data files that we produced in performing the analyses we describe in the manuscript.
 7 | 
 8 | ## How to normalize raw RNA-Seq data and process clinical data from TCGA
 9 | 
10 | This pipeline is designed to be executed on Unix-based systems. Most of the code is written in the R programming language. But it also requires "bash" scripts to be executed at the command line.
11 | 
12 | 1. Install the [R statistical package](http://r-project.org). We used version 3.1.0.
13 | 
14 | 2. Install the following R packages, which can be obtained using either the ```install.packages``` function in R or via the [Bioconductor framework](http://www.bioconductor.org):
15 |     * Rsubread
16 |     * limma
17 |     * edgeR
18 |     * tools
19 | 
20 | 3. Clone this git repository to your local computer.
21 | 
22 | 4. Via [dbGAP](http://www.ncbi.nlm.nih.gov/gap), obtain access to the raw TCGA data. Then obtain a private key that allows you to download raw data via the [Cancer Genomics Hub](https://cghub.ucsc.edu/access/get_access.html). Store this key file as ```cghub.key``` in the current directory.
23 | 
24 | 5. In the ```Genome``` directory, store the reference genome file and GTF file that can be obtained from [here](http://support.illumina.com/sequencing/sequencing_software/igenome.html). We used version hg19. After extracting these files, you will find the reference genome in Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa and the GTF file in Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2012-03-09-03-24-41/Genes/genes.gtf. Move these directly to the local Genome directory. **Update [08/04/2020]: These files are no longer available. You can find a copy of them [here](https://osf.io/cqkfp/). You will need to decompress the files using the `gunzip` utility. However, if you are going to run this pipeline now, you might consider using a newer version of the human reference genome.**
25 | 
26 | 6. Execute Scripts/process_tcga_rsubread at the command line to begin downloading and normalizing samples.
27 | 
28 | All the RNA-Seq and clinical data files that we have processed are available from Gene Expression Omnibus (accession numbers: GSE62820 and GSE62944).
29 | 
30 | For informational purposes, we have also provided a bash script (Scripts/process_tcga_level_3) that contains the steps for producing "Level 3" values using the same steps that are performed by the TCGA consortium. These steps are described in more detail here: https://cghub.ucsc.edu/docs/tcga/UNC_mRNAseq_summary.pdf.
31 | 
32 | ### Process clinical data
33 | 
34 | 1. Install R package 'plyr' using the ```install.packages``` function in R.
35 | 
36 | 2. Download the clinical data for each cancer type from the [TCGA Data Portal](https://tcga-data.nci.nih.gov/tcga/dataAccessMatrix.htm) in Biotab format.
37 | 
38 | 3. Download [GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt.gz](http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE62944&format=file&file=GSE62944%5F06%5F01%5F15%5FTCGA%5F24%5FCancerType%5FSamples%2Etxt%2Egz) from GEO (Accession number GSM1536837) and save the unzipped file to 'Datasets' folder.
39 | 
40 | 4. Set the working directory to the location where the clinical data folders for all cancer types are stored.
41 | 
42 | 5. Run the R script at Codes/ProcessClinicalData.R. 
43 | 
44 | ## How to reanalyze our findings
45 | 
46 | We also provide an R Markdown file (Analysis/TCGA_24_manuscript_analysis.Rmd) that contains the analysis code that we used for our manuscript. If you desire to reexecute this analysis, please complete the following steps:
47 | 
48 | 1. Install the [R statistical package](http://r-project.org). We used version 3.1.0.
49 | 
50 | 2. Install the following R packages, which can be obtained using either the ```install.packages``` function in R or via the [Bioconductor framework](http://www.bioconductor.org):
51 |     * stats
52 |     * ROCR
53 |     * pROC
54 |     * caret
55 |     * knitr
56 |     * data.table
57 |     * heatmap3
58 |     * RColorBrewer
59 | 
60 | 3. We used the [BinReg 2](http://www.biomedcentral.com/1471-2105/12/443) algorithm to make HER2 signature predictions on TCGA breast cancer samples. BinReg 2 runs on the MATLAB platform. We used our HER2 signature datasets as training samples and the TCGA breast cancer datasets as test samples. We used the following parameters: 200 genes, 2 metagenes, quantile normalization (-g 200 -m 2 -q) to minimize the batch effects between training and test samples. The original outputs from BinReg 2 are located within the ```Analysis_datasets/10_14_predictions_raw``` directory. The rerun of the HER2 pathway prediction, which excludes the two less consistent HER2 training samples, is located at ```Analysis_datasets/5_01_predictions_raw```. These output predictions are summarized in the ```Analysis_datasets``` directory for further evaluation.
61 | 
62 | 4. The code we used to classify TCGA lung adenocarcinoma and squamous carcinoma samples is in Code/Classify_luad_vs_lusc.R. The outputs of this analysis are located in the ```Analysis_datasets``` directory. The bash script describing the additional analysis to identify discordant LUAD samples and differentially expressed genes is located at Scripts/LUSC_LUAD_discordant_analysis.
63 | 
64 | 5. Use the ```knitr``` package to compile Analysis/TCGA_24_manuscript_analysis.Rmd. (It is convenient to complete this step within the [RStudio environment](http://www.rstudio.com/).) Also be sure to set the working directory to ```Analysis_datasets```.  Our results are stored in the TCGA_24_manuscript_analysis.html file.
65 | 
66 | 6. Our analysis datasets and outputs are available [here](https://www.dropbox.com/sh/4e0c8u7jke694tu/AADEQnB5LbCWihb3A5f04O9va?dl=0).
67 | 
68 | ## Contact information
69 | 
70 | * Mumtahena Rahman. [moom.rahman@utah.edu](mailto:moom.rahman@utah.edu)
71 | * Stephen R Piccolo. [https://piccolo.byu.edu](https://piccolo.byu.edu)
72 | 


--------------------------------------------------------------------------------
/Scripts/LUSC_LUAD_discordant_analysis:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | Rscript --vanilla --max-ppsize=500000 ../Codes/LUSC_vs_LUAD.R
 5 | 
 6 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt obs LUAD ../Analysis_datasets/TCGA_AllGenes_ROC.pdf "TCGA Level 3 - All Genes"
 7 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_AllGenes_ROC.pdf "RSubread - All Genes"
 8 | 
 9 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt obs LUAD ../Analysis_datasets/TCGA_CommonGenes_ROC.pdf "TCGA Level 3 - Common Genes"
10 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_CommonGenes_ROC.pdf "RSubread - Common Genes"
11 | 
12 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_NonOverlappingGenes_Predictions.txt obs ../Analysis_datasets/LUAD RSubread_NonOverlappingGenes_ROC.pdf "RSubread - Non-Overlapping Genes"
13 | 
14 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
15 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
16 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
17 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
18 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_NonOverlappingGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
19 | 
20 | Rscript --vanilla ../Codes/IdentifyInconsistentPredictions.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs pred
21 | Rscript --vanilla ../Codes/IdentifyInconsistentPredictions.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs pred
22 | 
23 | Rscript --vanilla ../Codes/PlotDiscordant.R
24 | 


--------------------------------------------------------------------------------
/Scripts/normalize_tcga_rsubread:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -o errexit
 4 | 
 5 | sampleIDFile=$1
 6 | 
 7 | tcgaID=$(basename $sampleIDFile)
 8 | analysisID=$(cat $sampleIDFile)
 9 | 
10 | currentDir=$(pwd)
11 | fastqDir=$currentDir/Temp/FASTQ
12 | outFpkmDir=$currentDir/FPKM
13 | outTpmDir=$currentDir/TPM
14 | outFeatureCountsDir=$currentDir/FeatureCounts
15 | outStatsDir=$currentDir/Stats
16 | inProgressFile=$currentDir/InProgress/$tcgaID
17 | 
18 | rm -fv $inProgressFile
19 | touch $inProgressFile
20 | 
21 | function cleanup {
22 |   rm -rfv $fastqDir/${analysisID}*
23 |   rm -rfv $fastqDir/${tcgaID}*
24 |   rm -fv $inProgressFile
25 | }
26 | 
27 | trap 'cleanup' TERM INT EXIT
28 | 
29 | mkdir -pv $fastqDir/$tcgaID $outFpkmDir $outFpkmLogDir $outTpmDir $outTpmLogDir $outFeatureCountsDir $outStatsDir
30 | 
31 | echo Downloading $tcgaID
32 | mkdir -p $currentDir/XmlFiles
33 | cgquery -o $currentDir/XmlFiles/$tcgaID.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&analysis_id=${analysisID}"
34 | gtdownload -vv -d $currentDir/XmlFiles/$tcgaID.xml -c $currentDir/cghub.key --max-children 1 -p $fastqDir
35 | 
36 | echo Rename and extract files $tcgaID
37 | if [ -f $fastqDir/$analysisID/*.tar.gz ]
38 | then
39 |   mv -v $fastqDir/$analysisID/*.tar.gz $fastqDir/$tcgaID.tar.gz
40 |   tar -zxvf $fastqDir/$tcgaID.tar.gz -C $fastqDir/$tcgaID
41 |   rm -fv $fastqDir/$tcgaID.tar.gz
42 | else
43 |   mv -v $fastqDir/$analysisID/*.tar $fastqDir/$tcgaID.tar
44 |   tar -xvf $fastqDir/$tcgaID.tar -C $fastqDir/$tcgaID
45 |   rm -fv $fastqDir/$tcgaID.tar
46 | fi
47 | 
48 | fastqFileNamesFile=$fastqDir/$tcgaID/FASTQFiles
49 | for f in $fastqDir/$tcgaID/*fastq* NULL
50 | do
51 |   echo $f >> $fastqFileNamesFile
52 | done
53 | 
54 | fastqFilePath1=$(head -n 1 $fastqFileNamesFile)
55 | fastqFilePath2=$(head -n 2 $fastqFileNamesFile | tail -n 1)
56 | 
57 | Rscript --vanilla $currentDir/Codes/ProcessRnaSeqFeatureCounts.R $currentDir/Genome/genome.fa $fastqFilePath1 $fastqFilePath2 $currentDir/Genome/genes.gtf $fastqDir/$tcgaID $outFpkmDir/$tcgaID $outTpmDir/$tcgaID $outFeatureCountsDir/$tcgaID $outStatsDir/$tcgaID
58 | 
59 | rm -fv $currentDir/XmlFiles/$tcgaID.xml
60 | 
61 | 


--------------------------------------------------------------------------------
/Scripts/process_tcga_level_3:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | fastqFile=$1
  4 | 
  5 | softwareDir=Software/TCGA_RNA_Seq_Pipeline
  6 | samTools=$softwareDir/rsem-1.1.13/sam/samtools
  7 | bwaDir=$softwareDir/MapSplice_multithreads_12_07/bowtie-0.12.7_fusion
  8 | mapspliceDir=$softwareDir/MapSplice_multithreads_12_07/bin
  9 | picardDir=$softwareDir/picard-tools-1.82
 10 | ubu=$softwareDir/ubu-1.2-jar-with-dependencies.jar
 11 | rsemDir=$softwareDir/rsem-1.2.12
 12 | bedToolsDir=$softwareDir/bedtools-2.17.0/bin
 13 | referenceGenomeRef=Genomes/hg19_M_rCRS_ref
 14 | referenceGenomeFile=Genomes/hg19_M_rCRS.fa
 15 | referenceGenomeIndexFile=Genomes/hg19_M_rCRS/chromosomes
 16 | referenceChromosomesDir=Genomes/hg19_M_rCRS/ebwt
 17 | referenceBedFile=Genomes/unc_hg19.bed
 18 | referenceTranscriptsFile=Genomes/hg19_M_rCRS_ref.transcripts.fa
 19 | 
 20 | workingDir=Level_3_Temp
 21 | 
 22 | mkdir -p $workingDir
 23 | 
 24 | 
 25 | sampleID=`basename $fastqFile`
 26 | sampleID=${sampleID/\.fastq/}
 27 | outDir=${fastqFile/\.fastq/_rsem}
 28 | mkdir -p $outDir
 29 | mkdir -p $outDir/working
 30 | tmpFastqFile=$outDir/`basename $fastqFile`
 31 | outBamFile1=$outDir/alignments.bam
 32 | outBamFile2=$outDir/rg_alignments.bam
 33 | outBamFile3=$outDir/phred33_alignments.bam
 34 | outBamFile4=$outDir/sorted_genome_alignments 
 35 | echo processing $sampleID
 36 | 
 37 | #1. Format fastq 1 for Mapsplice
 38 | java -Xmx512M -jar $ubu fastq-format --phred33to64 --strip --suffix /1 --in $fastqFile --out $tmpFastqFile> $outDir/working/mapsplice_prep.log 
 39 | echo preprocessing is done
 40 | 
 41 | #2.Mapsplice
 42 | python $mapspliceDir/mapsplice_multi_thread.py --fusion --all-chromosomes-files $referenceGenomeFile -X 8 -Q fq --chromosome-files-dir $referenceChromsomesFile --Bowtieidx $referenceGenomeIndexFile -1 $tmpFastqFile -o $outDir
 43 | #echo initial bam file is created now.. deleting the processed FASTQ file
 44 | rm $tmpFastqFile
 45 | 
 46 | #3.Add read groups
 47 | java -Xmx2G -jar $picardDir/AddOrReplaceReadGroups.jar INPUT=$outBamFile1  OUTPUT=$outBamFile2 RGSM=$sampleID RGID=$sampleID RGLB=TruSeq RGPL=illumina RGPU=barcode VALIDATION_STRINGENCY=SILENT TMP_DIR=$outDir/working/add_rg_tag_tmp > $outDir/working/add_rg_tag.log
 48 | echo read groups added or replaced now!
 49 | 
 50 | #4.Convert back to phred33
 51 | java -Xmx512M -jar $ubu sam-convert --phred64to33 --in $outBamFile2 --out $outBamFile3 > $outDir/working/sam_convert.log 
 52 | echo bam file converted back to phred33
 53 | 
 54 | #5.Sort by coordinate
 55 | $samTools sort $outBamFile3 $outBamFile4
 56 | echo converted Bam file is sorted now
 57 | 
 58 | #6.Flagstat
 59 | $samTools flagstat ${outBamFile4}.bam > ${outBamFile4}.flagstat
 60 | echo flagstat file created now!
 61 | 
 62 | #7.Index
 63 | $samTools index ${outBamFile4}.bam
 64 | echo Bam file is sorted now
 65 | 
 66 | #8. Sort By chromosome, then read id
 67 | echo using perl script from $softwareDir
 68 | perl $softwareDir/sort_bam_by_reference_and_name.pl --input ${outBamFile4}.bam --output $outDir/sorted_by_chr_read.bam --temp-dir ${outDir}.tmp --samtools $samTools  > $outDir/working/sorted_by_chr_read.log 
 69 | echo sorted by chromosome then id
 70 | 
 71 | #9. Translate to transcriptome coors
 72 | echo in directory $outDir
 73 | java -Xmx3G -jar $ubu sam-xlate --single --bed $referenceBedFile --in $outDir/sorted_by_chr_read.bam --out $outDir/transcriptome_alignments.bam --order $referenceTranscriptsFile --xgtags --reverse > $outDir/working/genome_to_transcriptome.log
 74 | echo translation to transcriptome coors done!
 75 | 
 76 | #10. Filter indels, large inserts, zero mapping quality from transcriptome bam $ubu 1.2 version needed for this step to use '--single' parameter
 77 | java -Xmx512M -jar $ubu sam-filter --single --in $outDir/transcriptome_alignments.bam --out $outDir/transcriptome_alignments_filtered.bam --strip-indels --max-insert 10000 --mapq 1 > $outDir/working/sam_filter.log
 78 | echo Filtered indels, large inserts, zero mapping quality from transcriptome bam
 79 | 
 80 | #11. RSEM
 81 | echo starting rsem normalization in $outDir for $sampleID
 82 | 
 83 | $rsemDir/rsem-calculate-expression --bam -p 8 --estimate-rspd --temporary-folder ${outDir}.temp_rsem --no-bam-output $outDir/transcriptome_alignments_filtered.bam $referenceGenomeRef $sampleID > $outDir/working/rsem.log
 84 | 
 85 | 
 86 | echo data is RSEM normalized
 87 | 
 88 | #12. Strip trailing tabs from rsem.isoforms.results
 89 | echo moving output files for $sampleID for final processing...
 90 | mv ${sampleID}* $workingDir/
 91 | 
 92 | perl $softwareDir/strip_trailing_tabs.pl --input $workingDir/${sampleID}.isoforms.results --temp $outDir/working/${sampleID}.orig.isoforms.results
 93 | 
 94 | #13. Prune isoforms from gene quant file
 95 | mv $workingDir/${sampleID}.genes.results $outDir/working/${sampleID}.orig.genes.results; sed /^uc0/d $outDir/working/${sampleID}.orig.genes.results >$workingDir/${sampleID}.genes.results
 96 | 
 97 | #14. Normalize gene quant
 98 | perl $softwareDir/quartile_norm.pl -c 5 -q 75 -t 1000 -o $workingDir/${sampleID}.rsem.genes.normalized_results $workingDir/${sampleID}.genes.results
 99 | 
100 | #16. Normalize isoform quant
101 | perl $softwareDir/quartile_norm.pl -c 5 -q 75 -t 300 -o $workingDir/${sampleID}.rsem.isoforms.normalized_results $workingDir/${sampleID}.isoforms.results
102 | 
103 | #********************************************************
104 | #outDir=/data2/u01_hmec_batch01/fastq/f1/FASTQ/f1
105 | #********************************************************
106 | #17. Junction counts
107 | #java -Xmx512M -jar $ubu sam-junc --junctions $softwareDir/splice_junctions.txt --in $outDir/$outDir/sorted_genome_alignments.bam --out $workingDir/${sampleID}.junction_quantification.txt > $outDir/working/${sampleID}_junction_quantification.log
108 | 
109 | #18. Exon counts
110 | #$bedToolsDir/coverageBed -split -abam $outDir/sorted_genome_alignments.bam -b $softwareDir/composite_exons.bed | perl $softwareDir/normalizeBedToolsExonQuant.pl $softwareDir/composite_exons.bed > $outDir/${sampleID}.bt.exon_quantification.txt
111 | 
112 | #19. Cleanup large intermediate output
113 | #rm alignments.bam logs/* working/phred33_alignments.bam working/rg_alignments.bam working/sorted_by_chr_read.bam working/transcriptome_alignments.bam working/transcriptome_alignments_filtered.bam working/prep_1.fastq working/prep_2.fastq > working/cleanup.log 
114 | 


--------------------------------------------------------------------------------
/Scripts/process_tcga_rsubread:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -o errexit
 4 | 
 5 | mkdir -p DownloadSamples CancerTypes Temp FeatureCounts InProgress
 6 | 
 7 | cgquery -o Query.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&sample_type=0*&study=phs000178"
 8 | #cgquery -o Query.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&sample_type=0*&study=phs000178&disease_abbr=DLBC"
 9 | 
10 | rm -rfv DownloadSamples/* CancerTypes/*
11 | python Codes/ParseCgHubQueryResults.py Query.xml "" DownloadSamples CancerTypes
12 | 
13 | rm -rf Temp/*
14 | 
15 | for f in $(pwd)/DownloadSamples/*
16 | do
17 |   sampleID=$(basename $f)
18 |   sampleID=${sampleID/\.xml/}
19 | 
20 |   if [ -f FeatureCounts/$sampleID ]
21 |   then
22 |     echo $sampleID already processed
23 |     continue
24 |   fi
25 | 
26 |   if [ -f InProgress/$sampleID ]
27 |   then
28 |     echo $sampleID currently being processed
29 |     continue
30 |   fi
31 | 
32 |   $(pwd)/Scripts/normalize_tcga_rsubread $f
33 | done
34 | 


--------------------------------------------------------------------------------
/Scripts/summarize_tcga_rsubread:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | function buildCancerTypesFile {
 4 |   rm -rf Temp/CancerTypes
 5 |   mkdir -p Temp/CancerTypes
 6 | 
 7 |   for f in FeatureCounts/*
 8 |   do
 9 |     sampleID=$(basename $f)
10 |     sampleCancerType=$(cat CancerTypes/$sampleID)
11 |     cancerTypeMatch=$(grep $sampleCancerType TCGA_CancerTypes_Publishable.txt)
12 | 
13 |     # Make sure we can publish on this sample
14 |     if [ "$cancerTypeMatch" == "$sampleCancerType" ]
15 |     then
16 |       cp -v CancerTypes/$sampleID Temp/CancerTypes/
17 |     fi
18 |   done
19 | 
20 |   python Codes/CombineScalarValues.py "Temp/CancerTypes/*" PANCAN20_CancerType_Samples.txt
21 | 
22 |   rm -rf Temp/CancerTypes
23 | }
24 | 
25 | function matricize {
26 |   subDir=$1
27 | 
28 |   tempSummDir=Temp/Summarize_${subDir}
29 |   rm -rf $tempSummDir
30 |   mkdir -p $tempSummDir
31 | 
32 |   for f in $subDir/*
33 |   do
34 |     sampleID=$(basename $f)
35 |     sampleCancerType=$(cat CancerTypes/$sampleID)
36 |     cancerTypeMatch=$(grep $sampleCancerType TCGA_CancerTypes_Publishable.txt)
37 | 
38 |     # Make sure we can publish on this sample
39 |     if [ "$cancerTypeMatch" == "$sampleCancerType" ]
40 |     then
41 |       cp -v $f $tempSummDir/
42 |     fi
43 |   done
44 | 
45 |   outFile=matrices/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_${subDir}.txt
46 | 
47 |   python Codes/BuildMatrixFile.py "$tempSummDir/*" $outFile
48 |   python Codes/PrintMatrixDimensions.py $outFile
49 | 
50 |   rm -f $outFile.gz
51 | 
52 |   echo Zipping $outFile
53 |   gzip -v $outFile
54 | 
55 |   rm -rf $tempSummDir
56 | }
57 | 
58 | buildCancerTypesFile
59 | 
60 | matricize RPKMlog &
61 | matricize RPKM &
62 | matricize FeatureCounts &
63 | wait
64 | 


--------------------------------------------------------------------------------
/TCGA_CancerType_Abbreviations.txt:
--------------------------------------------------------------------------------
 1 | LAML	Acute Myeloid Leukemia
 2 | ACC	Adrenocortical carcinoma
 3 | BLCA	Bladder Urothelial Carcinoma
 4 | LGG	Brain Lower Grade Glioma
 5 | BRCA	Breast invasive carcinoma
 6 | CESC	Cervical squamous cell carcinoma and endocervical adenocarcinoma
 7 | CHOL	Cholangiocarcinoma
 8 | LCML	Chronic Myelogenous Leukemia
 9 | COAD	Colon adenocarcinoma
10 | CNTL	Controls
11 | ESCA	Esophageal carcinoma 
12 | GBM	Glioblastoma multiforme
13 | HNSC	Head and Neck squamous cell carcinoma
14 | KICH	Kidney Chromophobe
15 | KIRC	Kidney renal clear cell carcinoma
16 | KIRP	Kidney renal papillary cell carcinoma
17 | LIHC	Liver hepatocellular carcinoma
18 | LUAD	Lung adenocarcinoma
19 | LUSC	Lung squamous cell carcinoma
20 | DLBC	Lymphoid Neoplasm Diffuse Large B-cell Lymphoma
21 | MESO	Mesothelioma
22 | MISC	Miscellaneous
23 | OV	Ovarian serous cystadenocarcinoma
24 | PAAD	Pancreatic adenocarcinoma
25 | PCPG	Pheochromocytoma and Paraganglioma
26 | PRAD	Prostate adenocarcinoma
27 | READ	Rectum adenocarcinoma
28 | SARC	Sarcoma
29 | SKCM	Skin Cutaneous Melanoma
30 | STAD	Stomach adenocarcinoma
31 | TGCT	Testicular Germ Cell Tumors
32 | THYM	Thymoma
33 | THCA	Thyroid carcinoma
34 | UCS	Uterine Carcinosarcoma
35 | UCEC	Uterine Corpus Endometrial Carcinoma
36 | UVM	Uveal Melanoma
37 | 


--------------------------------------------------------------------------------
/TCGA_CancerType_Publishable.txt:
--------------------------------------------------------------------------------
 1 | ACC
 2 | BLCA
 3 | BRCA
 4 | CESC
 5 | COAD
 6 | DLBC
 7 | GBM
 8 | HNSC
 9 | KICH
10 | KIRC
11 | KIRP
12 | LAML
13 | LGG
14 | LIHC
15 | LUAD
16 | LUSC
17 | OV
18 | PRAD
19 | READ
20 | SKCM
21 | STAD
22 | THCA
23 | UCEC
24 | UCS
25 | 


--------------------------------------------------------------------------------