├── .gitignore ├── Analysis ├── TCGA_24_manuscript_analysis.Rmd └── TCGA_24_manuscript_analysis.html ├── Analysis_datasets ├── .RData ├── .Rhistory ├── 10_14_predictions_raw │ ├── RSEM_q_log_200_f │ │ ├── REPORT.html │ │ ├── model.txt │ │ ├── parameters.txt │ │ ├── predictions.png │ │ ├── probabilities.txt │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt │ ├── TPM_q_log_200_f │ │ ├── REPORT.html │ │ ├── model.txt │ │ ├── parameters.txt │ │ ├── predictions.png │ │ ├── probabilities.txt │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt │ └── fpkm_q_log_200_f │ │ ├── REPORT.html │ │ ├── model.txt │ │ ├── parameters.txt │ │ ├── predictions.png │ │ ├── probabilities.txt │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt ├── 5_01_predictions_raw │ ├── fpkmlog_no │ │ ├── REPORT.html │ │ ├── predictions.png │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt │ ├── rsem │ │ ├── REPORT.html │ │ ├── predictions.png │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt │ ├── rsem_no │ │ ├── REPORT.html │ │ ├── predictions.png │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt │ └── tpmlog_no │ │ ├── REPORT.html │ │ ├── predictions.png │ │ ├── signature.cdt │ │ ├── signature.png │ │ └── signature_s.cdt ├── Classification_12_LUAD_LUSC_Predictions.txt ├── Classification_20_LUAD_LUSC_Predictions.txt ├── GFP18_HER2_Rsubread_FPKM.txt ├── GFP18_HER2_Rsubread_TPM.txt ├── GFP18_HER2_Rsubread_geneCounts.txt ├── GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt ├── GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt ├── PANCAN12_19583_by_3380_numZeroes.txt ├── PANCAN20_19583_by_3380_numZeroes.txt ├── Rsem_10_14.txt ├── TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt └── rsubread_10_14.txt ├── Codes ├── BuildMatrixFile.py ├── CalcAUC.R ├── CalcAccuracy.R ├── Classify_luad_vs_lusc.R ├── CombineScalarValues.py ├── FileContainsText.py ├── GetFileExtension.py ├── IdentifyDiscordantPredictions.R ├── IdentifyInconsistentPredictions.R ├── LUSC_vs_LUAD.R ├── ParseCgHubQueryResults.py ├── ParseSampleTypes.py ├── PeekMatrix.py ├── PlotDiscordant.R ├── PrintMatrixDimensions.py ├── ProcessClinicalData.R ├── ProcessRnaSeqFeatureCounts.R ├── Split.py ├── TransposeData.py ├── biological_rep.R ├── numZero.R └── utilities.py ├── LICENSE ├── README.md ├── Scripts ├── LUSC_LUAD_discordant_analysis ├── normalize_tcga_rsubread ├── process_tcga_level_3 ├── process_tcga_rsubread └── summarize_tcga_rsubread ├── TCGA_CancerType_Abbreviations.txt └── TCGA_CancerType_Publishable.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | FeatureCounts 3 | FPKM 4 | FPKMlog 5 | TPM 6 | TPMlog 7 | DownloadSamples 8 | Temp 9 | CancerTypes 10 | InProgress 11 | go 12 | Query.xml 13 | *.key 14 | XmlFiles 15 | Genome 16 | *.jar 17 | temp* 18 | Scripts/*_rsubread2 19 | Codes/ProcessRnaSeqFeatureCounts2.R 20 | nohup* 21 | Stats 22 | update_git 23 | commit_git 24 | Analysis/*_cache 25 | Analysis/*_cache/* 26 | Analysis/*_files 27 | Analysis/*_files/* 28 | Codes/ForMoom 29 | Codes/ForMoom/* 30 | Analysis/*20* 31 | -------------------------------------------------------------------------------- /Analysis_datasets/.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/.RData -------------------------------------------------------------------------------- /Analysis_datasets/.Rhistory: -------------------------------------------------------------------------------- 1 | for (i in 1:4) 2 | + { 3 | + hist(iris[,i], main=colnames(iris)[i]) 4 | + } 5 | dskasd 6 | for (i in 1:4) 7 | hist(iris[,i], main=colnames(iris)[i]) 8 | par(mfrow=c(2,2)) 9 | for (i in 1:4) 10 | hist(iris[,i], main=colnames(iris)[i]) 11 | library("knitr", lib.loc="~/Library/R/3.1/library") 12 | install.packages(c("car", "colorspace", "manipulate", "Rcpp", "RcppArmadillo")) 13 | install.packages("mgcv", lib="/Library/Frameworks/R.framework/Versions/3.1/Resources/library") 14 | source('~/.active-rstudio-document') 15 | date: March 9, 2015 16 | date: 3/9/2015 17 | date: 18 | #date: 19 | date: 03-19-2015 20 | legend("topleft", legend=levels(iris$Species), col=levels(iris$Species)) 21 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot") 22 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species))) 23 | print(1:length(levels(iris$Species))) 24 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot") 25 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lty=1, lwd=1) 26 | print(1:length(levels(iris$Species))) 27 | par(mfrow=c(2,2)) 28 | for (i in 1:(ncol(iris) - 1)) 29 | hist(iris[,i], main=colnames(iris[i]), xlab="centimeters") 30 | par(mfrow=c(1,1)) 31 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot") 32 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lty=1, lwd=1) 33 | print(1:length(levels(iris$Species))) 34 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot") 35 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lwd=1) 36 | print(1:length(levels(iris$Species))) 37 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot") 38 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20) 39 | print(1:length(levels(iris$Species))) 40 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20) 41 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20) 42 | print(1:length(levels(iris$Species))) 43 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20) 44 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.5) 45 | print(1:length(levels(iris$Species))) 46 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.5) 47 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20) 48 | print(1:length(levels(iris$Species))) 49 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25) 50 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25) 51 | print(1:length(levels(iris$Species))) 52 | library("lattice", lib.loc="/Library/Frameworks/R.framework/Versions/3.1/Resources/library") 53 | install.packages("lattice") 54 | print(head(irisData)) 55 | irisData = select(iris, -Species) 56 | print(head(irisData)) 57 | irisData = select(iris, -Species) 58 | #print(head(irisData)) 59 | library(dplyr) 60 | irisData = select(iris, -Species) 61 | #print(head(irisData)) 62 | library(dplyr) 63 | install.packages("dplyr") 64 | library(dplyr) 65 | irisData = select(iris, -Species) 66 | print(head(irisData)) 67 | library(dplyr) 68 | irisData = select(iris, -Species) 69 | head(irisData) 70 | irisData = scale(irisData) 71 | head(irisData) 72 | boxplot(irisData) 73 | irisData = scale(irisData) 74 | head(irisData) 75 | boxplot(irisData) 76 | boxplot(irisData$Petal.Width~iris$Species) 77 | for (i in 1:(ncol(iris) - 1)) 78 | boxplot(irisData[,i] ~ iris$Species, main=colnames(iris)[i], ylab="centimeters") 79 | pcIrisData = prcomp(irisData) 80 | pcIrisData 81 | pcIrisData = prcomp(t(irisData)) 82 | pcIrisData 83 | pcIrisData = prcomp(irisData) 84 | pcIrisData 85 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st Principal Component", ylab="2nd Principal Component", pch=20, cex=1.25) 86 | #ggplot(pcIrisData, aes(x=PC1, y=PC2, color=Species)) + geom_point() 87 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st principal component", ylab="2nd principal component", pch=20, cex=1.25, main="Principal Components for iris data") 88 | legend("topleft", legend=levels(iris$Species), col=iris$Species, pch=20, cex=1.25) 89 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25) 90 | legend("topleft", legend=levels(iris$Species), col=iris$Species, pch=20, cex=1.25) 91 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25) 92 | legend("topleft", legend=levels(iris$Species), col=levels(iris$Species), pch=20, cex=1.25) 93 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25) 94 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25) 95 | #ggplot(pcIrisData, aes(x=PC1, y=PC2, color=Species)) + geom_point() 96 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st principal component", ylab="2nd principal component", pch=20, cex=1.25, main="Principal Components for iris data") 97 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25) 98 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25) 99 | legend("topright", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25) 100 | plot(iris$Petal.Length, iris$Petal.Width) 101 | plot(iris$Petal.Length, iris$Petal.Width, main="Basic scatterplot") 102 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=10) 103 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=12) 104 | ``` 105 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=111) 106 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=18) 107 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15) 108 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15, col="green") 109 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15, col="red") 110 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", xlab="Petal Length", ylab="Petal Width") 111 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=3, xlab="Petal Length", ylab="Petal Width") 112 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width") 113 | plot(iris$Petal.Length, iris$Petal.Width, main="Basic scatterplot of petal features") 114 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Sepal characteristics among species", pch=20, cex=1.25) 115 | legend("topright", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25) 116 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width") 117 | model <- lm(iris$Petal.Length ~ iris$Petal.Width) 118 | abline(model, lwd = 2) 119 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width") 120 | model <- lm(iris$Petal.Width ~ iris$Petal.Length) 121 | abline(model, lwd = 2) 122 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width") 123 | model <- lm(iris$Petal.Width ~ iris$Petal.Length) 124 | abline(model, lwd = 2) 125 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width") 126 | model <- lm(iris$Petal.Width ~ iris$Petal.Length) 127 | abline(model, lwd = 4) 128 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width") 129 | model <- lm(iris$Petal.Width ~ iris$Petal.Length) 130 | abline(model, lwd = 4, lty=2) 131 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=18) 132 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=18, col="red", cex=1.5, xlab="Petal Length", ylab="Petal Width") 133 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=18, col="red", cex=1.5, xlab="Petal Length", ylab="Petal Width") 134 | model <- lm(iris$Petal.Width ~ iris$Petal.Length) 135 | abline(model, lwd = 3, lty=2, col="gray") 136 | abline(model, lwd = 3, lty=2, col="darkgray") 137 | pca$rotation 138 | pcIrisData$rotation 139 | percent <- 100 * pcIrisData$sdev^2 / sum(pca$sdev^2) 140 | percent 141 | percent <- 100 * pcIrisData$sdev^2 / sum(pcIrisData$sdev^2) 142 | percent 143 | barplot(percent) 144 | barplot(percent, names.arg=1:4, xlab="Principal Component", ylab="% variance explained") 145 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 146 | xlim(-1.,1.25) + 147 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 148 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 149 | ggplot(rotation_data) + 150 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 151 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 152 | xlim(-1.,1.25) + 153 | ylim(-1.,1.) + 154 | coord_fixed() # fix aspect ratio to 1:1 155 | library(ggplot) 156 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 157 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 158 | ggplot(rotation_data) + 159 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 160 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 161 | xlim(-1.,1.25) + 162 | ylim(-1.,1.) + 163 | coord_fixed() # fix aspect ratio to 1:1 164 | library(ggplot2) 165 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 166 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 167 | ggplot(rotation_data) + 168 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 169 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 170 | xlim(-1.,1.25) + 171 | ylim(-1.,1.) + 172 | coord_fixed() # fix aspect ratio to 1:1 173 | library(graphics) 174 | library(ggplot2) 175 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 176 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 177 | ggplot(rotation_data) + 178 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 179 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 180 | xlim(-1.,1.25) + 181 | ylim(-1.,1.) + 182 | coord_fixed() # fix aspect ratio to 1:1 183 | library(graphics) 184 | library(ggplot2) 185 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 186 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 187 | #ggplot(rotation_data) + 188 | # geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 189 | # geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 190 | # xlim(-1.,1.25) + 191 | # ylim(-1.,1.) + 192 | # coord_fixed() # fix aspect ratio to 1:1 193 | library(ggplot2) 194 | library(grid) 195 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 196 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 197 | #ggplot(rotation_data) + 198 | # geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 199 | # geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 200 | # xlim(-1.,1.25) + 201 | # ylim(-1.,1.) + 202 | # coord_fixed() # fix aspect ratio to 1:1 203 | library(ggplot2) 204 | library(grid) 205 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation)) 206 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed") 207 | ggplot(rotation_data) + 208 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) + 209 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') + 210 | xlim(-1.,1.25) + 211 | ylim(-1.,1.) + 212 | coord_fixed() # fix aspect ratio to 1:1 213 | ?prcomp 214 | iris 215 | ?kmeans 216 | install.packages("useful") 217 | plot(k1, data=iris) 218 | source('~/.active-rstudio-document') 219 | k1 <- kmeans(x=iris[, 1:4], centers=3) 220 | plot(k1) 221 | plot(k1, data=iris) 222 | k1 <- kmeans(x=iris[, 1:4], centers=3) 223 | library(useful) 224 | plot(k1) 225 | irisData = iris[,-5] # Negative sign excludes the specified column 226 | head(irisData) 227 | irisData = scale(irisData) 228 | head(irisData) 229 | boxplot(irisData) 230 | ?subset 231 | ?c 232 | x = rep(1, 1000) 233 | x 234 | hist(x) 235 | plot(density(x)) 236 | ls() 237 | library(dplyr) 238 | ?inner_join 239 | setwd("~/GitRepos/TCGA_RNASeq_clinical/Analysis_datasets") 240 | setwd("TCGA_RNASeq_clinical/Analysis_datasets") 241 | setwd("~/GitRepos/TCGA_RNASeq_clinical/Analysis_datasets") 242 | getwd() 243 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped 244 | # Rsubread pipeline, gene counts 245 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped 246 | # TCGA pipeline, normalized expression files 247 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped 248 | # Rsubread pipeline, FPKM values 249 | rsub_fpkm<-read.table("GFP18_HER2_Rsubread_FPKM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped 250 | rsub_fpkmlog<-log2(rsub_fpkm+1) 251 | # Rsubread pipeline, TPM values 252 | rsub_tpm<-read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped 253 | rsub_tpmlog<-log2(rsub_tpm+1) 254 | # Clinical data 255 | clinicals<-t(read.delim('TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt',sep='\t',header=1, row.names=1,check.names=F)) # This was downloaded from GEO Accession # GSE62820 and unzipped 256 | ``` 257 | # TCGA pipeline, expected counts 258 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped 259 | # Rsubread pipeline, gene counts 260 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped 261 | # TCGA pipeline, normalized expression files 262 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped 263 | # Rsubread pipeline, FPKM values 264 | rsub_fpkm<-read.table("GFP18_HER2_Rsubread_FPKM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped 265 | rsub_fpkmlog<-log2(rsub_fpkm+1) 266 | # Rsubread pipeline, TPM values 267 | rsub_tpm<-read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped 268 | rsub_tpmlog<-log2(rsub_tpm+1) 269 | # Clinical data 270 | clinicals<-t(read.delim('TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt',sep='\t',header=1, row.names=1,check.names=F)) # This was downloaded from GEO Accession # GSE62820 and unzipped 271 | rsub_preds<-read.table("rsubread_10_14.txt", sep='\t', header=1, row.names=1) 272 | tcga_preds<-read.table("Rsem_10_14.txt", sep='\t', header=1, row.names=1) 273 | pancan12_zero<-read.table("PANCAN12_19583_by_3380_numZeroes.txt",row.names=1,sep='\t')# File is at Analysis_datasets 274 | pancan20_tpm_zero<-read.table("PANCAN20_19583_by_3380_numZeroes.txt",sep='\t',row.names=1)# File is at Analysis_datasets 275 | data12 = read.table("Classification_12_LUAD_LUSC_Predictions.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE, row.names=1) # File is at Analysis_datasets 276 | data20 = read.table("Classification_20_LUAD_LUSC_Predictions.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE, row.names=1)# File is at Analysis_datasets 277 | #This function calculates the standardized mean using Hedge's formula 278 | standardized_mean<-function(m.1,sd.1,n.1,m.2,sd.2,n.2){ 279 | sd_pooled=sqrt(((n.1-1)*sd.1^2+(n.2-1)*sd.2^2)/(n.1+n.2-2)) 280 | (m.1-m.2)/sd_pooled 281 | } 282 | #This function merges two matrices on row names, sets the common items as rownames and removes the extra column resulting from merge function. 283 | merge_drop<-function(x,y,by=0) 284 | { 285 | new_m<-merge(x,y,by=by) 286 | rownames(new_m)<-new_m$Row.names 287 | return(new_m[,2:length(colnames(new_m))]) 288 | } 289 | #This function plots the ROC based on the actual and predicted class 290 | plotROC = function(actual, probabilities, plotCI=FALSE) 291 | { 292 | # bottom, left, top, right 293 | par(mar=c(4.5, 4.7, 0.0, 0.5),lwd=4) 294 | library(pROC) 295 | roc_result = roc(actual ~ probabilities, ci=TRUE, plot=TRUE, print.auc=FALSE) 296 | lowerBoundAuc = format(roc_result$ci[1], digits=3) 297 | midAuc = format(roc_result$ci[2], digits=3) 298 | upperBoundAuc = format(roc_result$ci[3], digits=3) 299 | if (plotCI) 300 | { 301 | ci(roc_result) 302 | sens.ci <- ci.se(roc_result) 303 | plot(sens.ci, type="shape", col="gray95") 304 | plot(sens.ci, type="bars") 305 | plot(roc_result, add=TRUE) 306 | } 307 | text(0.5, 0.00, labels=paste("AUC: ", midAuc, " (", lowerBoundAuc, "-", upperBoundAuc, ")", sep="")) 308 | par(mar=c(5.1, 4.1, 2.1, 2.1)) 309 | } 310 | ##########computing the empiric cumulative distribution per sample overlaied on same graph######## 311 | ###using TCGA pipelined aligned data 312 | ecdf_all_ex<-apply(log2(rsem_her2_expected_counts+1),2,ecdf) 313 | par( mfrow = c( 1, 2 ) ) 314 | plot(ecdf_all_ex[[1]],xlab="log2(Total mapped reads)",ylab="Cumulative proportion",col="blue",main="TCGA pipeline",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5) 315 | legend(10,10,c("GFP", "HER2"), col = c("blue","brown")) 316 | for(i in 2:12){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="blue")} 317 | for(i in 13:17){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="brown")} 318 | ###using Rsubread pipeline aligned data 319 | ecdf_all<-apply(log2(feature+1),2,ecdf) 320 | plot(ecdf_all[[1]],xlab="log2(Total mapped reads)",ylab="Cumulative proportion",col="blue",main="Rsubread pipeline",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5) 321 | for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")} 322 | for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")} 323 | ############computing total number of read counts per samples and plotting them as dot plots#### 324 | expected_counts<-apply(rsem_her2_expected_counts,2,sum) 325 | feature_counts<-apply(feature,2,sum) 326 | # Creating a plot showing total mapped reads per sample 327 | par( mfrow = c( 1, 2 ),lwd=4 ) 328 | x = c(rep(1, 12), rep(2, 5)) # this indicates where on the x axis to plot 329 | par(mar=c(3.1, 4.6, 2.1, 0.6)) # figure margins 330 | boxplot(log2(expected_counts[1:12]+1), log2(expected_counts[13:17]+1),range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,lwd=4,ylim=c(20,25),xlab="", ylab="log2(Total mapped reads)",main="TCGA Pipeline",col='grey75',medcol="grey75",lwd=4,border = "grey35") 331 | points(jitter(x, factor=2), c(log2(expected_counts[1:12]+1), log2(expected_counts[13:17]+1)), pch=4, cex=2, col=1, xaxt="n",cex.lab=1.5) 332 | axis(1, at=1:2, tick=T, labels=c("Control", "HER2"), cex.axis=1.5) 333 | boxplot(log2(feature_counts[1:12]+1), log2(feature_counts[13:17]+1),range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,lwd=4,ylim=c(20,25),xlab="", ylab="log2(Total mapped reads)",col='grey75',medcol="grey75",lwd=4,main="Rsubread Pipeline",border = "grey35") 334 | points(jitter(x, factor=1.5), c(log2(feature_counts[1:12]+1), log2(feature_counts[13:17]+1)), pch=4,cex=2,cex.lab=1.5,col="black") 335 | axis(1, at=1:2, tick=T, labels=c("Control", "HER2"), cex.axis=1.5) 336 | #######Boxplotting ERBB2 gene counts in HMEC samples##### 337 | par(mfrow = c(1, 1),lwd=4) 338 | names=c('TCGA\nGFP','TCGA\nHER2','Rsubread\nGFP', 'Rsubread\nHER2') 339 | rsem_her2<-data.frame(t(rsem_her2_expected_counts["ERBB2",])) 340 | rsub_her2<-data.frame(t(feature["ERBB2",])) 341 | x = c(rep(1, 12), rep(2, 5),rep(3, 12), rep(4, 5)) 342 | boxplot(log2(rsem_her2$ERBB2[1:12]+1),log2(rsem_her2$ERBB2[13:17]+1),log2(rsub_her2$ERBB2[1:12]+1),log2(rsub_her2$ERBB2[13:17]+1),ylab="",range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,col='grey75',medcol="grey75",lwd=4,main=paste('Comparing TGCA and Rsubread Pipelines','\n', 'in Differentiating HER2 Overexpression from Controls',sep=''),border = "grey35") 343 | points(jitter(rep(1,12),factor=2),log2(rsem_her2$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5,col="black") 344 | points(jitter(rep(2,5),factor=2),log2(rsem_her2$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5,col='black') 345 | points(jitter(rep(3,12),factor=2),log2(rsub_her2$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5,col='black') 346 | points(jitter(rep(4,5),factor=2),log2(rsub_her2$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5,col='black') 347 | axis(1, at=1:4, tick=T, labels=c("TCGA\nControl", "TCGA\nHER2","Rsubread\nControl", "Rsubread\nHER2"), cex.axis=0.8) 348 | ##using data processed by RSEM detected difference in her2 gene count in HER2 overexpressed versus GFP overexpressed samples 349 | ##t = -12.1833, df = 4.157, p-value = 0.0002081 but was worse than Rsubread 350 | t.test(log2(rsem_her2$ERBB2[1:12]+1),log2(rsem_her2$ERBB2[13:17]+1)) 351 | ##using not normalized data processed by Rsubread was much better at detecting difference in her2 gene count in HER2 overexpressed versus GFP overexpressed samples 352 | ##t = -46.6747, df = 8.35, p-value = 2.152e-11 353 | t.test(log2(rsub_her2$ERBB2[1:12]+1),log2(rsub_her2$ERBB2[13:17]+1)) 354 | ###########here we are computing standardized mean difference using the exprected gene counts from TCGA pipeline and gene counts from Rsubread algorithm ############ 355 | ####Hedge's standardized mean/effect size using TCGA pipeline 356 | standardized_mean(mean(log2(rsem_her2$ERBB2[13:17]+1)),sd(log2(rsem_her2$ERBB2[13:17]+1)),5,mean(log2(rsem_her2$ERBB2[1:12]+1)),sd(log2(rsem_her2$ERBB2[1:12]+1)),12) 357 | ####Hedge's standardized mean/effect size using Rsubread pipeline 358 | standardized_mean(m.1=mean((log2(rsub_her2$ERBB2[13:17]+1))),sd.1=sd((log2(rsub_her2$ERBB2[13:17]+1))),n.1=5,m.2=mean((log2(rsub_her2$ERBB2[1:12]+1))),sd.2=sd((log2(rsub_her2$ERBB2[1:12]+1))),n.2 = 12) 359 | #######################comparing gene counts results ############ 360 | par( mfrow = c( 1,3 ) ,lwd=4) 361 | TCGA_her2_filtered<-TCGA_her2[!duplicated(TCGA_her2$Gene),] 362 | rownames(TCGA_her2_filtered)<-TCGA_her2_filtered$Gene 363 | TCGA_her2<-subset(TCGA_her2_filtered,select=-Gene) 364 | TCGA_her2_log2<-log2(subset(TCGA_her2_filtered,select=-Gene)+1) 365 | ###Coefficient of variation in GFP samples across all common genes 366 | ####Coefficient of variation in TCGA pipeline processed data 367 | com_genes_TCGA<-TCGA_her2[rownames(TCGA_her2)%in%rownames(rsub_fpkm),] 368 | hist(na.omit(apply(com_genes_TCGA,1,sd)/apply(com_genes_TCGA,1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20) 369 | hist(na.omit(apply(com_genes_TCGA[,1:12],1,sd)/apply(com_genes_TCGA[,1:12],1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20) 370 | print(paste("Coefficient of variation in TCGA Level 3 data across 19585 genes in the control samples:",median(na.omit(apply(com_genes_TCGA[,1:12],1,sd)/apply(com_genes_TCGA[,1:12],1,mean))),sep=" ")) 371 | hist(na.omit(apply(com_genes_TCGA[,13:17],1,sd)/apply(com_genes_TCGA[,13:17],1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20) 372 | print(paste("Coefficient of variation in TCGA Level 3 data across 19585 genes in the HER2-overexpressed samples:",median(na.omit(apply(com_genes_TCGA[,13:17],1,sd)/apply(com_genes_TCGA[,13:17],1,mean))),sep=" ")) 373 | tcga_her2_normalized<-data.frame(t(TCGA_her2["ERBB2",])) 374 | ####Coefficient of variation in Rsubread pipeline processed data 375 | com_genes_fpkm<-rsub_fpkm[rownames(rsub_fpkm)%in%rownames(com_genes_TCGA),] 376 | hist(na.omit(apply(com_genes_fpkm[,13:17],1,sd)/apply(com_genes_fpkm[,13:17],1,mean)),main = "Rsubread FPKM",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes",breaks=20) 377 | print(paste("Coefficient of variation in Rsubread FPKM normalized data across 19585 genes in the control samples:",median((na.omit(apply(com_genes_fpkm[,1:12],1,sd)/apply(com_genes_fpkm[,1:12],1,mean)))),sep='')) 378 | print(paste("Coefficient of variation in Rsubread FPKM normalized data across 19585 genes in the HER2-overexpressed samples:",median((na.omit(apply(com_genes_fpkm[,13:17],1,sd)/apply(com_genes_fpkm[,13:17],1,mean)))),sep='')) 379 | rsub_fpkmlog_her2<-data.frame(t(rsub_fpkmlog["ERBB2",])) 380 | rsub_fpkm_her2<-data.frame(t(rsub_fpkm["ERBB2",])) 381 | com_genes_tpm<-rsub_fpkm[rownames(rsub_tpm)%in%rownames(com_genes_TCGA),] 382 | hist(na.omit(apply(com_genes_tpm[,13:17],1,sd)/apply(com_genes_tpm[,13:17],1,mean)),main = "Rsubread TPM",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes") 383 | print(paste("Coefficient of variation in Rsubread TPM normalized data across 19585 genes in the control samples:",median((na.omit(apply(com_genes_tpm[,1:12],1,sd)/apply(com_genes_tpm[,1:12],1,mean)))),sep='')) 384 | print(paste("Coefficient of variation in Rsubread TPM normalized data across 19585 genes in the HER2-overexpressed samples:",median((na.omit(apply(com_genes_tpm[,13:17],1,sd)/apply(com_genes_tpm[,13:17],1,mean)))),sep='')) 385 | rsub_tpm_her2<-data.frame(t(rsub_tpm["ERBB2",])) 386 | rsub_tpmlog_her2<-data.frame(t(rsub_tpmlog["ERBB2",])) 387 | #######post normalization ecdf 388 | ecdf_all_ex<-apply(log2(TCGA_her2+1),2,ecdf) 389 | par( mfrow = c( 1, 3 ) ) 390 | plot(ecdf_all_ex[[1]],xlab=NA, ylab = NA,col="blue",main="TCGA Level 3",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5,) 391 | for(i in 2:12){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="blue")} 392 | for(i in 13:17){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="brown")} 393 | ###using Rsubread pipeline aligned data 394 | ecdf_all<-apply(rsub_fpkmlog,2,ecdf) 395 | plot(ecdf_all[[1]],col="blue",main="Rsubread FPKM",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion") 396 | for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")} 397 | for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")} 398 | ecdf_all_t<-apply(rsub_tpmlog,2,ecdf) 399 | plot(ecdf_all_t[[1]],col="blue",main="Rsubread TPM",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion") 400 | for(i in 2:12){lines(ecdf_all_t[[i]],xlab=NA,ylab = NA,col="blue")} 401 | for(i in 13:17){lines(ecdf_all_t[[i]],xlab=NA,ylab = NA,col="brown")} 402 | ###Creating boxplots of the normalized ERBB2 expression 403 | par( mfrow = c( 1, 1 ) ) 404 | par(mar=c(5, 4.5, 3.5, 0.5)) 405 | boxplot(log2(tcga_her2_normalized$ERBB2[1:12]+1),log2(tcga_her2_normalized$ERBB2[13:17]+1),rsub_fpkmlog_her2$ERBB2[1:12],rsub_fpkmlog_her2$ERBB2[13:17],rsub_tpmlog_her2$ERBB2[1:12],rsub_tpmlog_her2$ERBB2[13:17],ylab="log2(HER2 gene expression values)",main="Comparing HER2 normalized expression between\n control and her2 samples",range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,col='grey75',medcol="grey75",lwd=4,border = "grey35") 406 | names=c("TCGA\nGFP","TCGA\nHER2","Rsubred FPKM\nGFP", "Rsubred FPKM\nHER2","Rsubred TPM\nGFP", "Rsubred TPM\nHER2") 407 | text(seq(1,6,by=1),par("usr")[3] - 2, labels = names, srt = 45, pos = 1, xpd = TRUE) 408 | points(jitter(rep(1,12),factor=2),log2(tcga_her2_normalized$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5) 409 | points(jitter(rep(2,5),factor=2),log2(tcga_her2_normalized$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5) 410 | points(jitter(rep(3,12),factor=2),rsub_fpkmlog_her2$ERBB2[1:12],pch=4,cex=2,cex.lab=1.5) 411 | points(jitter(rep(4,5),factor=2),rsub_fpkmlog_her2$ERBB2[13:17],pch=4,cex=2,cex.lab=1.5) 412 | points(jitter(rep(5,12),factor=2),rsub_tpmlog_her2$ERBB2[1:12],pch=4,cex=2,cex.lab=1.5) 413 | points(jitter(rep(6,5),factor=2),rsub_tpmlog_her2$ERBB2[13:17],pch=4,cex=2,cex.lab=1.5) 414 | ###t.test to see if there is significance 415 | t.test(log2(tcga_her2_normalized$ERBB2[1:12]+1),log2(tcga_her2_normalized$ERBB2[13:17]+1)) 416 | t.test(rsub_fpkmlog_her2$ERBB2[1:12],rsub_fpkmlog_her2$ERBB2[13:17]) 417 | t.test(rsub_tpmlog_her2$ERBB2[1:12],rsub_tpmlog_her2$ERBB2[13:17]) 418 | ###Standardized mean difference: TCGA pipeline normalized ERBB2 expression values 419 | standardized_mean(m.1=mean((log2(tcga_her2_normalized$ERBB2[13:17]+1))),sd.1=sd((log2(tcga_her2_normalized$ERBB2[13:17]+1))),n.1=5,m.2=mean((log2(tcga_her2_normalized$ERBB2[1:12]+1))),sd.2=sd((log2(tcga_her2_normalized$ERBB2[1:12]+1))),n.2=12) 420 | ###Standardized mean difference: Rsubread pipeline FPKM normalized ERBB2 expression values 421 | standardized_mean(mean(rsub_fpkmlog_her2$ERBB2[13:17]),sd(rsub_fpkmlog_her2$ERBB2[13:17]),5,mean(rsub_fpkmlog_her2$ERBB2[1:12]),sd(rsub_fpkmlog_her2$ERBB2[1:12]),12) 422 | ###Standardized mean difference:Rsubread pipeline TPM normalized ERBB2 expression values 423 | standardized_mean(mean(rsub_tpmlog_her2$ERBB2[13:17]),sd(rsub_tpmlog_her2$ERBB2[13:17]),5,mean(rsub_tpmlog_her2$ERBB2[1:12]),sd(rsub_fpkmlog_her2$ERBB2[1:12]),12) 424 | colnames(pancan12_zero)<-"PANCAN12" 425 | colnames(pancan20_tpm_zero)<-"TPM" 426 | all_zeros<-merge_drop(pancan12_zero,pancan20_tpm_zero) 427 | #3380 samples are common 428 | par(mfrow = c(1, 2),lwd=4) 429 | h1<-hist(all_zeros$PANCAN12,xlab='',ylab='',main='',xlim=c(0,8000),ylim=c(0,800),lwd=4,breaks = 25) 430 | abline(v=median(all_zeros$PANCAN12),col="red",lty=2) 431 | h2<-hist(all_zeros$TPM,xlab='',ylab='',main='',xlim=c(0,8000),ylim=c(0,800),lwd=4,breaks=25) 432 | abline(v=median(all_zeros$TPM),col="red",lty=2) 433 | t.test(all_zeros$PANCAN12,all_zeros$TPM) 434 | #############Predicted HER2 pathway activity analysis############################################# 435 | all_preds<-merge_drop(rsub_preds,tcga_preds,by=0) 436 | brca_clinical<-subset(clinicals,clinicals[,'tumor_tissue_site']=='Breast',select=c("bcr_patient_barcode","her2_status_by_ihc")) 437 | common_all<-merge_drop(all_preds,brca_clinical,by=0) 438 | all_preds_pos_neg<-subset(common_all,common_all$her2_status_by_ihc=="Negative"|common_all$her2_status_by_ihc=="Positive") 439 | all_ranked<-apply(all_preds_pos_neg[,1:3],2,rank) 440 | all<-cbind(all_ranked,all_preds_pos_neg[,4:5]) 441 | ihc_neg<-subset(all,all$her2_status_by_ihc=="Negative") 442 | ihc_pos<-subset(all,all$her2_status_by_ihc=="Positive") 443 | ##############boxplot of ranked estimated HER2 pathway activity 444 | ##in TCGA BRCA samples#### 445 | par(mfrow = c(1, 1)) 446 | par(mar=c(5, 4.6, 2.5, 0.6)) # figure margins 447 | boxplot(ihc_pos$Rsem_log_q_200_f,ihc_neg$Rsem_log_q_200_f,ihc_pos$FPKM_log_q_200_f,ihc_neg$FPKM_log_q_200_f,ihc_pos$TPM_log_q_200_f,ihc_neg$TPM_log_q_200_f,cex.axis=1.5, cex.lab=1.5,outpch=NA,range=0,cex.axis=1, cex.lab=0.7,outpch=NA,col='grey75',medcol="grey5",lwd=4,border = "grey5", main="Comparison of rank-based estimate \nof HER2 activation",ylab="Ranked HER2 prediction") 448 | names=c("TCGA\nLevel3\nHER2(+)","TCGA\nLevel3\nHER2(-)","Rsubred\nFPKM\nHER2(+)", "Rsubred\nFPKM\nHER2(-)","Rsubred\nTPM\nHER2(+)", "Rsubred\nTPM\nHER2(-)") 449 | text(seq(1,6,by=1),par("usr")[3] - 4.5, labels = names, srt = 45, pos = 1, xpd = TRUE) 450 | ihc_neg_t<-subset(common_all,common_all$her2_status_by_ihc=="Negative") 451 | ihc_pos_t<-subset(common_all,common_all$her2_status_by_ihc=="Positive") 452 | ##coefficient of variation in TCGA pipeline processed HER2 predictions 453 | print(paste("Coefficient of variation in TCGA pipeline processed HER2 predictions in HER2(-) BRCA samples",sd(ihc_neg_t$Rsem_log_q_200_f)/mean(ihc_neg_t$Rsem_log_q_200_f),sep=' ')) 454 | print(paste("Coefficient of variation in TCGA pipeline processed HER2 predictions in HER2(+) BRCA samples",sd(ihc_pos_t$Rsem_log_q_200_f)/mean(ihc_pos_t$Rsem_log_q_200_f),sep=' ')) 455 | ##coefficient of variation in Rsubread FPKM pipeline processed HER2 predictions 456 | print(paste("Coefficient of variation in Rsubread FPKM processed HER2 predictions in HER2(-) BRCA samples",sd(ihc_neg_t$FPKM_log_q_200_f)/mean(ihc_neg_t$FPKM_log_q_200_f),sep=" ")) 457 | print(paste("Coefficient of variation in Rsubread FPKM processed HER2 predictions in HER2(+) BRCA samples",sd(ihc_pos_t$FPKM_log_q_200_f)/mean(ihc_pos_t$FPKM_log_q_200_f),sep=" ")) 458 | ##coefficient of variation in Rsubread TPM pipeline processed HER2 predictions 459 | print(paste("Coefficient of variation in Rsubread TPM processed HER2 predictions in HER2(-) BRCA samples",sd(ihc_neg_t$TPM_log_q_200_f)/mean(ihc_neg_t$TPM_log_q_200_f),sep=" ")) 460 | print(paste("Coefficient of variation in Rsubread TPM processed HER2 predictions in HER2(+) BRCA samples",sd(ihc_pos_t$TPM_log_q_200_f)/mean(ihc_pos_t$TPM_log_q_200_f),sep=" ")) 461 | ##Calculating standardized mean differences between the HER2(+) and HER2(-) groups 462 | print(paste("Standardized mean difference in predicrion between HER2 (+) and HER2 (-) samples for TCGA Level 3 data :",standardized_mean(m.1=mean(ihc_pos_t$Rsem_log_q_200_f),sd.1=sd(ihc_pos_t$Rsem_log_q_200_f),n.1=length(ihc_pos_t$Rsem_log_q_200_f),m.2=mean(ihc_neg_t$Rsem_log_q_200_f),sd.2=sd(ihc_neg_t$Rsem_log_q_200_f),n.2=length(ihc_neg_t$Rsem_log_q_200_f)),sep=' ')) 463 | print(paste("Standardized mean difference in predicrion between HER2 (+) and HER2 (-) samples for Rsubread FPKM data :",standardized_mean(m.1=mean(ihc_pos_t$FPKM_log_q_200_f),sd.1=sd(ihc_pos_t$FPKM_log_q_200_f),n.1=length(ihc_pos_t$FPKM_log_q_200_f),m.2=mean(ihc_neg_t$FPKM_log_q_200_f),sd.2=sd(ihc_neg_t$FPKM_log_q_200_f),n.2=length(ihc_neg_t$FPKM_log_q_200_f)),sep=' ')) 464 | print(paste("Standardized mean difference in predicrion between HER2 (+) and HER2 (-) samples for FPKM TPM data :",standardized_mean(m.1=mean(ihc_pos_t$TPM_log_q_200_f),sd.1=sd(ihc_pos_t$TPM_log_q_200_f),n.1=length(ihc_pos_t$TPM_log_q_200_f),m.2=mean(ihc_neg_t$TPM_log_q_200_f),sd.2=sd(ihc_neg_t$TPM_log_q_200_f),n.2=length(ihc_neg_t$TPM_log_q_200_f)),sep=' ')) 465 | ## t-tests comparing HER(+) and HER(-) prediction 466 | t.test(ihc_pos_t$Rsem_log_q_200_f,ihc_neg_t$Rsem_log_q_200_f)# For TCGA Level 3: p-value = 2.009e-05 467 | t.test(ihc_pos_t$FPKM_log_q_200_f,ihc_neg_t$FPKM_log_q_200_f)#For Rsubread FPKM: p-value = 1.493e-10 468 | t.test(ihc_pos_t$TPM_log_q_200_f,ihc_neg_t$TPM_log_q_200_f)#For Rsubread TPM:p-value = 3.197e-12 469 | par(mfrow = c(1, 1),lwd=4) 470 | actual12 = data12$ActualClass 471 | predictions12 = data12$LUAD_Probability 472 | auc = plotROC(actual12, predictions12, TRUE) 473 | title("TCGA Level 3 LUAD vs LUSC") 474 | actual20 = data20$ActualClass 475 | predictions20 = data20$LUAD_Probability 476 | auc = plotROC(actual20, predictions20, TRUE) 477 | title("Rsubread TPM LUAD vs LUSC") 478 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 14 |

15 |

II. Results

16 | 17 | 18 | 21 | 22 | 23 | 32 | 33 | 34 | 41 | 42 |
19 |

200 Genes, 2 Metagenes

20 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 5 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 36 | 37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 38 | probabilities.txt 39 | . 40 |

43 |

44 |


45 | This analysis was run on Monday, 13 October 2014, 11:34 PM on adira.genetics.utah.edu. It took 12m 2s to complete. 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/model.txt: -------------------------------------------------------------------------------- 1 | Name Coefficient 2 | Intercept 4.524853 3 | ERBB2 0.164782 4 | HSPA7 -0.125612 5 | GDF6 -0.111343 6 | HSPA6 -0.097087 7 | CCL2 -0.093873 8 | CXCL10 -0.092074 9 | LOC338651 0.079326 10 | TNFSF14 -0.073710 11 | CD248 -0.059249 12 | IFIT1 -0.057644 13 | DNAJA4 -0.053322 14 | GNAO1 -0.050292 15 | CRHR1 0.048706 16 | EEF1A2 0.045896 17 | HSPA1B -0.045632 18 | CCL20 -0.044527 19 | TNFAIP2 -0.044330 20 | LOC91948 0.042751 21 | ATP6V0A4 0.038768 22 | CFB -0.037830 23 | CALB2 0.036782 24 | PADI1 0.035659 25 | PDGFB 0.034971 26 | LOC285629 -0.034876 27 | CRYAB -0.032468 28 | GABRA2 0.030593 29 | SOD2 -0.028653 30 | ULBP1 -0.028346 31 | KRT18 0.028246 32 | GPR1 -0.027639 33 | CXCL5 -0.027617 34 | EPHA3 -0.026868 35 | IL8 -0.025943 36 | EPHA4 -0.025735 37 | TLR3 -0.025646 38 | HSPB8 -0.025054 39 | RPSAP52 0.024980 40 | RGS2 -0.024874 41 | SLC2A12 -0.024861 42 | KRT19 0.024626 43 | TRANK1 -0.024277 44 | MGP 0.023918 45 | SAA1 -0.023534 46 | SHC4 0.022446 47 | KITLG -0.022152 48 | KRT8 0.022084 49 | CGNL1 -0.021984 50 | MYCL1 -0.021942 51 | ANGPTL4 0.021650 52 | PARP9 -0.021303 53 | DNAJB4 -0.021262 54 | SPON1 0.021236 55 | PIK3C2B -0.021143 56 | PARP14 -0.021042 57 | SERPINB1 0.020839 58 | CXCL2 -0.020713 59 | SERPINB13 -0.020613 60 | SNX9 0.020262 61 | TRIM22 -0.020121 62 | DNAJB1 -0.019926 63 | KANK4 -0.019885 64 | GBP6 -0.019667 65 | MLPH 0.019478 66 | APOL6 -0.019334 67 | OAS3 -0.019302 68 | HSP90AA1 -0.019165 69 | KRT81 0.019156 70 | GM2A -0.019126 71 | ENGASE -0.017973 72 | KRT75 0.017856 73 | CBLC 0.017765 74 | CCNA1 0.017623 75 | FERMT2 0.017321 76 | CEACAM1 0.017130 77 | SLC13A5 0.017066 78 | MTSS1L -0.017003 79 | TCF4 -0.016884 80 | PLAUR 0.016528 81 | GPR110 0.016330 82 | TP53AIP1 -0.016244 83 | APAF1 0.016161 84 | HSPH1 -0.016115 85 | RAB6B 0.016005 86 | LOXL4 0.015594 87 | OSBP2 0.015384 88 | HSPA8 -0.015298 89 | UNC5B -0.015048 90 | RASA3 0.014898 91 | KCNN4 0.014783 92 | ANPEP 0.014734 93 | AMACR -0.014480 94 | ZC3HAV1 -0.014280 95 | COBLL1 -0.014277 96 | ECT2 0.014259 97 | SMURF2 0.014218 98 | CBR1 -0.014049 99 | TUFT1 0.013455 100 | C1R -0.013313 101 | SESN2 -0.013303 102 | TWF2 0.013165 103 | INPP4B 0.013134 104 | SMO -0.013129 105 | ITGB3 0.013106 106 | CAST 0.013084 107 | FBXW7 -0.013061 108 | VASP 0.012979 109 | SASH1 -0.012828 110 | MT2A 0.012725 111 | NAV3 0.012684 112 | NET1 0.012572 113 | CGN 0.012481 114 | SYTL2 -0.012440 115 | CYBASC3 -0.012341 116 | ST3GAL4 0.012295 117 | TNS3 -0.012073 118 | BCAR3 0.011678 119 | SEC24D 0.011623 120 | DTX4 -0.011553 121 | PYGB 0.011389 122 | MYO1E 0.011297 123 | PTPRE 0.011089 124 | GFPT1 0.011087 125 | ACTB 0.011033 126 | STIM2 -0.011012 127 | XPC -0.011008 128 | MFI2 0.010950 129 | NFATC3 -0.010879 130 | C19orf66 -0.010511 131 | PDZD2 -0.010452 132 | ARHGEF2 0.010354 133 | TRIOBP 0.010316 134 | SLC34A2 -0.010288 135 | FRMD4A -0.010219 136 | MAP3K2 -0.010081 137 | NPAS2 0.010074 138 | IGFL3 -0.009956 139 | ARHGAP12 0.009927 140 | SH2D3A 0.009911 141 | NAV2 -0.009866 142 | SMOC1 0.009764 143 | HERPUD1 0.009567 144 | WDR1 0.009562 145 | RASA1 0.009529 146 | MBD4 -0.009337 147 | PLEK2 0.009276 148 | BCAP29 0.009270 149 | ATG16L1 0.009237 150 | LDB1 -0.009222 151 | NCDN -0.009177 152 | NEK9 -0.009083 153 | CSGALNACT2 0.009018 154 | ATP1B1 -0.008895 155 | APBB2 -0.008881 156 | CAPN2 0.008880 157 | CALM2 0.008674 158 | TRAFD1 -0.008589 159 | PGM1 0.008555 160 | FGFR2 -0.008354 161 | DOPEY1 -0.008331 162 | NISCH -0.008191 163 | PI4KB -0.008141 164 | TOR3A -0.007819 165 | LRIG3 0.007766 166 | POLR2A -0.007749 167 | NEU1 -0.007665 168 | KPNA4 0.007656 169 | PIK3CD 0.007606 170 | ANKRD13A -0.007496 171 | TBRG1 -0.007462 172 | EPS15 0.007458 173 | TRIM5 -0.007361 174 | PCSK7 -0.007332 175 | ANKFY1 -0.007320 176 | C20orf194 0.007244 177 | C19orf42 -0.007162 178 | ITGA5 0.007095 179 | ARHGEF12 -0.006996 180 | STK40 -0.006932 181 | MLLT6 -0.006786 182 | C1orf85 -0.006767 183 | PTPN12 0.006480 184 | MAP2K4 -0.006351 185 | ZNF532 -0.006134 186 | AFAP1L2 0.006103 187 | ARID1B -0.005924 188 | SEC14L1 0.005811 189 | PLEKHA6 -0.005776 190 | ELOVL1 0.005764 191 | CLASP1 -0.005727 192 | SMEK1 -0.005478 193 | NUMA1 -0.005168 194 | ZMYND8 0.005151 195 | PDXK -0.005071 196 | MYO10 0.004929 197 | UBP1 -0.004780 198 | RCC2 0.004742 199 | SGK1 0.004731 200 | RFWD3 -0.004666 201 | C20orf3 -0.004354 202 | WDR91 -0.004333 203 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/parameters.txt: -------------------------------------------------------------------------------- 1 | NAME VALUE 2 | Binreg Version 2 3 | Genes 200 4 | Metagenes 2 5 | Strip AFFX control 0 6 | Log Train0 0 7 | Log Train1 0 8 | Log Test 0 9 | Quantile Normalize 1 10 | Shift-Scale Normalize 0 11 | DWD Normalize 0 12 | DWD Normalize (Bild) 0 13 | Burn In 1000 14 | Samples 5000 15 | Skips 1 16 | Credible Interval 95 17 | Cross Validate 1 18 | Make Plots 1 19 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 14 |

15 |

II. Results

16 | 17 | 18 | 21 | 22 | 23 | 32 | 33 | 34 | 41 | 42 |
19 |

200 Genes, 2 Metagenes

20 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 5 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 36 | 37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 38 | probabilities.txt 39 | . 40 |

43 |

44 |


45 | This analysis was run on Monday, 13 October 2014, 11:47 PM on adira.genetics.utah.edu. It took 11m 41s to complete. 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/model.txt: -------------------------------------------------------------------------------- 1 | Name Coefficient 2 | Intercept -0.504928 3 | ERBB2 0.305527 4 | HSPA6 -0.158780 5 | HSPA7 -0.151412 6 | CCL2 -0.106984 7 | DNAJA4 -0.093340 8 | TNFAIP2 -0.075825 9 | HSPA1A -0.073306 10 | EEF1A2 0.071440 11 | PDGFB 0.067870 12 | EPGN -0.067303 13 | HSPA1B -0.066745 14 | ATP6V0A4 0.062446 15 | CFB -0.060075 16 | CALB2 0.058290 17 | CRYAB -0.054796 18 | SAA2 -0.050794 19 | PNMA2 0.050400 20 | KRT80 0.050203 21 | TNFRSF11B 0.048283 22 | UCA1 0.046302 23 | CXCL5 -0.045923 24 | ANGPTL7 -0.044990 25 | KPRP 0.044522 26 | SOD2 -0.044234 27 | SYTL5 0.043949 28 | KRT19 0.043441 29 | AKAP12 0.043351 30 | SRMS 0.042485 31 | PADI1 0.042177 32 | GPR1 -0.041418 33 | RGS2 -0.041195 34 | MYADM 0.040819 35 | SHC4 0.040550 36 | BST2 -0.039644 37 | EPHA3 -0.039500 38 | KLK6 0.038871 39 | KRT18 0.038599 40 | SAA1 -0.038474 41 | SPON1 0.038178 42 | HSP90AA1 -0.038082 43 | TSPAN18 0.037454 44 | EPHA4 -0.037243 45 | ANGPTL4 0.036491 46 | PAQR7 -0.036256 47 | ULBP1 -0.035505 48 | HSPH1 -0.035296 49 | PGM2L1 0.035069 50 | CRHR1 0.034918 51 | SERPINB13 -0.034840 52 | PIK3C2B -0.034825 53 | PTK6 0.034722 54 | CXCR1 0.034384 55 | FAM198B -0.034254 56 | GRAMD2 -0.034033 57 | DDAH1 0.033964 58 | GPRC5A 0.033659 59 | DAPK1 -0.033620 60 | SLC1A1 0.033565 61 | VWA1 0.033251 62 | DNAJA1 -0.032433 63 | SNX9 0.032379 64 | KITLG -0.032252 65 | HSPB8 -0.032155 66 | GBP6 -0.031284 67 | C10orf10 0.030517 68 | CCNA1 0.030310 69 | GM2A -0.030108 70 | C8orf84 0.029972 71 | ALDH1A3 0.029680 72 | TRIM22 -0.029548 73 | SREK1IP1 0.029351 74 | KRT8 0.029074 75 | NOTCH1 -0.028721 76 | DNAJB4 -0.028676 77 | FERMT2 0.027438 78 | EMP1 0.027141 79 | MAFF 0.026901 80 | TCF4 -0.026670 81 | DNAJB1 -0.026460 82 | PARP14 -0.026319 83 | PLAUR 0.026168 84 | LOC644961 0.026082 85 | KHDRBS3 0.025650 86 | PLAU 0.025228 87 | KANK4 -0.025090 88 | ESR1 -0.024670 89 | APOL6 -0.024617 90 | KCNN4 0.024463 91 | IGFL3 -0.024452 92 | MTSS1L -0.024210 93 | RAPH1 0.024168 94 | IFIT5 -0.024094 95 | DUSP10 0.024043 96 | PMP22 0.023801 97 | VASP 0.023373 98 | ARRDC4 -0.023118 99 | SMO -0.023104 100 | FAM176A 0.022803 101 | CBR1 -0.022764 102 | WWTR1 0.022599 103 | PGF 0.022576 104 | STX2 0.022286 105 | ZPLD1 0.022175 106 | KMO -0.022123 107 | FAM214B 0.021843 108 | TUFT1 0.021717 109 | TNS3 -0.021558 110 | MAP6 0.021499 111 | ST3GAL4 0.021422 112 | HMGB3 0.021401 113 | HS6ST1 -0.021304 114 | DLC1 -0.021275 115 | POU2F1 0.021216 116 | APAF1 0.021057 117 | STOX2 -0.020845 118 | RASA3 0.020767 119 | HERC3 0.020487 120 | DFNB31 -0.020337 121 | FBXO22 -0.020150 122 | BRMS1 -0.020097 123 | IER3 0.020017 124 | NET1 0.019989 125 | CYBASC3 -0.019984 126 | PYGB 0.019830 127 | XPC -0.019811 128 | BCAR3 0.019647 129 | ZXDB 0.019586 130 | CELF2 0.019402 131 | IGF2BP3 0.019325 132 | TIMP1 -0.019048 133 | ARHGAP12 0.019010 134 | NME7 0.018951 135 | ARV1 -0.018928 136 | CASP1 -0.018873 137 | MR1 -0.018826 138 | KCNJ5 -0.018762 139 | LRRC8C 0.018716 140 | TWF2 0.018592 141 | PPP3CC 0.018547 142 | ANKRD33B -0.018542 143 | CAST 0.018294 144 | SH3KBP1 0.017947 145 | PODXL2 0.017847 146 | INPP4B 0.017676 147 | TNS4 0.017660 148 | DAB2 0.017551 149 | MFI2 0.017540 150 | RBMS2 0.017501 151 | FGFR2 -0.017469 152 | GFPT1 0.017427 153 | TP53AIP1 -0.017304 154 | NAV3 0.017121 155 | ARHGEF2 0.017063 156 | SESN1 -0.016845 157 | DNAJB9 0.016278 158 | NFE2L1 -0.016229 159 | TRIOBP 0.016197 160 | KIAA1671 -0.016057 161 | ZNFX1 -0.015835 162 | CROT -0.015664 163 | SLC20A2 0.015334 164 | B2M -0.015314 165 | UBB -0.015001 166 | FBXW2 -0.014918 167 | LDB1 -0.014863 168 | SEC24D 0.014746 169 | MICALCL 0.014702 170 | MYO1E 0.014521 171 | RASSF1 0.014486 172 | TOR3A -0.014460 173 | PIK3R1 -0.014459 174 | TRAFD1 -0.014282 175 | ANKRD13A -0.014195 176 | SLC41A1 -0.014065 177 | MEF2D 0.013983 178 | PI4KB -0.013683 179 | LRRFIP1 0.013638 180 | PRRC1 0.013535 181 | FRMD4A -0.012667 182 | PNMAL1 -0.012235 183 | LPP 0.011861 184 | CAPN2 0.011646 185 | ADAR -0.011625 186 | PRDM4 -0.011432 187 | APBB2 -0.011350 188 | SEC14L1 0.011315 189 | UBP1 -0.010824 190 | ASAP2 0.010731 191 | PRPSAP2 -0.010671 192 | PPP2R5B 0.010646 193 | NFATC3 -0.010535 194 | AFAP1 0.010482 195 | DCAF7 0.010296 196 | MYL12A 0.009901 197 | ARHGEF12 -0.009895 198 | STAT3 -0.009518 199 | ANKRD27 0.008986 200 | IFFO2 0.008553 201 | GTF2I -0.008151 202 | CYB561 0.007650 203 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/parameters.txt: -------------------------------------------------------------------------------- 1 | NAME VALUE 2 | Binreg Version 2 3 | Genes 200 4 | Metagenes 2 5 | Strip AFFX control 0 6 | Log Train0 0 7 | Log Train1 0 8 | Log Test 0 9 | Quantile Normalize 1 10 | Shift-Scale Normalize 0 11 | DWD Normalize 0 12 | DWD Normalize (Bild) 0 13 | Burn In 1000 14 | Samples 5000 15 | Skips 1 16 | Credible Interval 95 17 | Cross Validate 1 18 | Make Plots 1 19 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 15 |

16 |

II. Results

17 | 18 | 19 | 22 | 23 | 24 | 33 | 34 | 35 | 42 | 43 |
20 |

200 Genes, 2 Metagenes

21 |
25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 |
36 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 5 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 37 | 38 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 39 | probabilities.txt 40 | . 41 |

44 |

45 |


46 | This analysis was run on Monday, 13 October 2014, 11:21 PM on adira.genetics.utah.edu. It took 12m 57s to complete. 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/model.txt: -------------------------------------------------------------------------------- 1 | Name Coefficient 2 | Intercept 0.168851 3 | ERBB2 0.257577 4 | HSPA7 -0.187866 5 | HSPA6 -0.136333 6 | GDF6 0.098740 7 | DNAJA4 -0.080598 8 | KPRP 0.074612 9 | EEF1A2 0.069003 10 | TNFAIP2 -0.067720 11 | PDGFB 0.066514 12 | TSPAN18 0.066512 13 | HSPA1A -0.062749 14 | ATP6V0A4 0.058443 15 | CFB -0.058034 16 | HSPA1B -0.057605 17 | EPGN -0.057545 18 | CALB2 0.054193 19 | PNMA2 0.048449 20 | SAA2 -0.047311 21 | CRYAB -0.046179 22 | KRT80 0.045195 23 | SRMS 0.043627 24 | GPR1 -0.043320 25 | UCA1 0.041757 26 | TNFRSF11B 0.041583 27 | FAM83A 0.040141 28 | EPHA3 -0.039923 29 | CXCL5 -0.039762 30 | RGS2 -0.039724 31 | DDAH1 0.039198 32 | ULBP1 -0.038466 33 | AKAP12 0.038418 34 | SOD2 -0.037183 35 | KRT19 0.036641 36 | TLR3 -0.035985 37 | SHC4 0.035642 38 | PPP1R3C -0.035295 39 | PTK6 0.034658 40 | SPON1 0.034473 41 | MYADM 0.034361 42 | BST2 -0.034136 43 | GRAMD2 -0.034067 44 | SAA1 -0.033523 45 | HSP90AA1 -0.032999 46 | KRT18 0.032801 47 | EPHA4 -0.032767 48 | PIK3C2B -0.032631 49 | KLK6 0.032407 50 | CXCR1 0.031954 51 | PGM2L1 0.031133 52 | ANGPTL4 0.031075 53 | PAQR7 -0.031038 54 | DAPK1 -0.030705 55 | FAM198B -0.030230 56 | SERPINB13 -0.030208 57 | GBP6 -0.030003 58 | VWA1 0.029805 59 | SLC1A1 0.029764 60 | HSPH1 -0.029464 61 | KITLG -0.028275 62 | GPRC5A 0.027836 63 | HSPB8 -0.027616 64 | SNX9 0.027574 65 | DNAJA1 -0.026591 66 | C10orf10 0.026544 67 | SREK1IP1 0.026213 68 | GM2A -0.026028 69 | C8orf84 0.025904 70 | CCNA1 0.025808 71 | TRIM22 -0.025731 72 | APOL6 -0.025483 73 | KRT8 0.025158 74 | DNAJB4 -0.025018 75 | TCF4 -0.024505 76 | NOTCH1 -0.024433 77 | ALDH1A3 0.024322 78 | MAFF 0.023981 79 | PARP14 -0.023917 80 | FERMT2 0.023615 81 | IL7R -0.023182 82 | LOC644961 0.023169 83 | KHDRBS3 0.022993 84 | EMP1 0.022449 85 | KMO -0.022438 86 | PLAUR 0.022023 87 | DNAJB1 -0.022019 88 | IFIT5 -0.021954 89 | RAPH1 0.021690 90 | KANK4 -0.021458 91 | DUSP10 0.020861 92 | SMO -0.020834 93 | DFNB31 -0.020759 94 | MTSS1L -0.020665 95 | PLAU 0.020509 96 | KCNN4 0.020505 97 | PMP22 0.020330 98 | STX2 0.020322 99 | VASP 0.020230 100 | IGFL3 -0.020208 101 | POU2F1 0.020096 102 | WWTR1 0.019760 103 | FAM176A 0.019732 104 | PGF 0.019637 105 | ARRDC4 -0.019625 106 | TNS3 -0.019394 107 | CBR1 -0.019365 108 | RASA3 0.019126 109 | APAF1 0.018740 110 | HERC3 0.018697 111 | HMGB3 0.018691 112 | ZXDB 0.018650 113 | ST3GAL4 0.018588 114 | HS6ST1 -0.018541 115 | IGF2BP3 0.018523 116 | TUFT1 0.018493 117 | FAM214B 0.018467 118 | NET1 0.017866 119 | XPC -0.017726 120 | FBXO22 -0.017678 121 | MR1 -0.017472 122 | CYBASC3 -0.017218 123 | KCNJ5 -0.017167 124 | IER3 0.017056 125 | NME7 0.016958 126 | PYGB 0.016808 127 | NAV3 0.016742 128 | BRMS1 -0.016648 129 | ARV1 -0.016434 130 | BCAR3 0.016403 131 | ARHGAP12 0.016383 132 | PPP3CC 0.016377 133 | PODXL2 0.016365 134 | PDZD2 -0.016253 135 | TWF2 0.016132 136 | RBMS2 0.016093 137 | CASP1 -0.015992 138 | TIMP1 -0.015829 139 | LRRC8C 0.015828 140 | SH3KBP1 0.015714 141 | CAST 0.015525 142 | TP53AIP1 -0.015300 143 | DAB2 0.015248 144 | FGFR2 -0.015210 145 | INPP4B 0.015146 146 | HMGN3 -0.015120 147 | SESN1 -0.014994 148 | TRIOBP 0.014970 149 | GFPT1 0.014771 150 | ARHGEF2 0.014671 151 | TNS4 0.014658 152 | MFI2 0.014631 153 | CROT -0.014554 154 | KIAA1671 -0.013946 155 | ZNFX1 -0.013815 156 | DNAJB9 0.013602 157 | NFE2L1 -0.013277 158 | PIK3R1 -0.013264 159 | FBXW2 -0.013023 160 | RASSF1 0.012832 161 | MICALCL 0.012790 162 | SLC20A2 0.012767 163 | LDB1 -0.012706 164 | IGFBP4 -0.012603 165 | SEC24D 0.012592 166 | B2M -0.012511 167 | CCDC50 0.012451 168 | SLC41A1 -0.012315 169 | TOR3A -0.012280 170 | HERPUD1 0.012254 171 | TRAFD1 -0.012195 172 | MYO1E 0.012108 173 | MEF2D 0.012092 174 | FRMD4A -0.011928 175 | LRRFIP1 0.011781 176 | ANKRD13A -0.011763 177 | PI4KB -0.011583 178 | PRRC1 0.011518 179 | UBB -0.011513 180 | FAM129B 0.011441 181 | PNMAL1 -0.010498 182 | LPP 0.010416 183 | APBB2 -0.010189 184 | PRDM4 -0.010085 185 | ADAR -0.010018 186 | SEC14L1 0.009938 187 | CAPN2 0.009793 188 | ASAP2 0.009678 189 | PPP2R5B 0.009550 190 | NFATC3 -0.009429 191 | PRPSAP2 -0.009416 192 | DCAF7 0.009216 193 | MEX3C 0.009174 194 | AFAP1 0.009148 195 | UBP1 -0.008794 196 | ARHGEF12 -0.008606 197 | SDC1 0.008466 198 | ADCY9 -0.008152 199 | STAT3 -0.008103 200 | ANKRD27 0.007958 201 | IFFO2 0.007081 202 | GTF2I -0.006848 203 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/parameters.txt: -------------------------------------------------------------------------------- 1 | NAME VALUE 2 | Binreg Version 2 3 | Genes 200 4 | Metagenes 2 5 | Strip AFFX control 0 6 | Log Train0 1 7 | Log Train1 1 8 | Log Test 0 9 | Quantile Normalize 1 10 | Shift-Scale Normalize 0 11 | DWD Normalize 0 12 | DWD Normalize (Bild) 0 13 | Burn In 1000 14 | Samples 5000 15 | Skips 1 16 | Credible Interval 95 17 | Cross Validate 1 18 | Make Plots 1 19 | -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/fpkmlog_no/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 14 |

15 |

II. Results

16 | 17 | 18 | 21 | 22 | 23 | 32 | 33 | 34 | 41 | 42 |
19 |

200 Genes, 2 Metagenes

20 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 36 | 37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 38 | probabilities.txt 39 | . 40 |

43 |

44 |


45 | This analysis was run on Saturday, 02 May 2015, 02:43 PM on adira.genetics.utah.edu. It took 23m 34s to complete. 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/fpkmlog_no/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/fpkmlog_no/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/rsem/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 14 |

15 |

II. Results

16 | 17 | 18 | 21 | 22 | 23 | 32 | 33 | 34 | 41 | 42 |
19 |

200 Genes, 2 Metagenes

20 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 36 | 37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 38 | probabilities.txt 39 | . 40 |

43 |

44 |


45 | This analysis was run on Friday, 01 May 2015, 11:17 AM on adira.genetics.utah.edu. It took 13m 38s to complete. 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/rsem/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/rsem/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/rsem_no/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 14 |

15 |

II. Results

16 | 17 | 18 | 21 | 22 | 23 | 32 | 33 | 34 | 41 | 42 |
19 |

200 Genes, 2 Metagenes

20 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 36 | 37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 38 | probabilities.txt 39 | . 40 |

43 |

44 |


45 | This analysis was run on Saturday, 02 May 2015, 04:04 PM on adira.genetics.utah.edu. It took 17m 54s to complete. 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/rsem_no/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem_no/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/rsem_no/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem_no/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/tpmlog_no/REPORT.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CreateSignatures Report 4 | 5 | 6 |

CreateSignatures Report

7 |

I. Analysis

8 | 14 |

15 |

II. Results

16 | 17 | 18 | 21 | 22 | 23 | 32 | 33 | 34 | 41 | 42 |
19 |

200 Genes, 2 Metagenes

20 |
24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression. 36 | 37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0.The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values.

The raw values from this plot are available as a tab-delimited text file: 38 | probabilities.txt 39 | . 40 |

43 |

44 |


45 | This analysis was run on Saturday, 02 May 2015, 03:35 PM on adira.genetics.utah.edu. It took 21m 33s to complete. 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/tpmlog_no/predictions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/tpmlog_no/predictions.png -------------------------------------------------------------------------------- /Analysis_datasets/5_01_predictions_raw/tpmlog_no/signature.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/tpmlog_no/signature.png -------------------------------------------------------------------------------- /Analysis_datasets/Classification_12_LUAD_LUSC_Predictions.txt: -------------------------------------------------------------------------------- 1 | SampleID ActualClass PredictedClass LUAD_Probability LUSC_Probability 2 | TCGA-05-4244-01A-01R-1107-07 LUAD LUAD 0.948 0.052 3 | TCGA-05-4249-01A-01R-1107-07 LUAD LUAD 0.946 0.054 4 | TCGA-05-4250-01A-01R-1107-07 LUAD LUAD 0.924 0.076 5 | TCGA-05-4382-01A-01R-1206-07 LUAD LUAD 0.94 0.06 6 | TCGA-05-4384-01A-01R-1755-07 LUAD LUAD 0.994 0.006 7 | TCGA-05-4389-01A-01R-1206-07 LUAD LUAD 0.86 0.14 8 | TCGA-05-4390-01A-02R-1755-07 LUAD LUAD 0.918 0.082 9 | TCGA-05-4395-01A-01R-1206-07 LUAD LUAD 0.776 0.224 10 | TCGA-05-4396-01A-21R-1858-07 LUAD LUAD 0.94 0.06 11 | TCGA-05-4397-01A-01R-1206-07 LUAD LUAD 0.632 0.368 12 | TCGA-05-4398-01A-01R-1206-07 LUAD LUAD 0.914 0.086 13 | TCGA-05-4402-01A-01R-1206-07 LUAD LUAD 0.968 0.032 14 | TCGA-05-4403-01A-01R-1206-07 LUAD LUAD 0.934 0.066 15 | TCGA-05-4405-01A-21R-1858-07 LUAD LUAD 0.968 0.032 16 | TCGA-05-4410-01A-21R-1858-07 LUAD LUAD 0.958 0.042 17 | TCGA-05-4415-01A-22R-1858-07 LUAD LUAD 0.714 0.286 18 | TCGA-05-4417-01A-22R-1858-07 LUAD LUAD 0.976 0.024 19 | TCGA-05-4418-01A-01R-1206-07 LUAD LUAD 0.85 0.15 20 | TCGA-05-4420-01A-01R-1206-07 LUAD LUAD 0.858 0.142 21 | TCGA-05-4422-01A-01R-1206-07 LUAD LUAD 0.938 0.062 22 | TCGA-05-4424-01A-22R-1858-07 LUAD LUAD 0.9 0.1 23 | TCGA-05-4425-01A-01R-1755-07 LUAD LUAD 0.952 0.048 24 | TCGA-05-4426-01A-01R-1206-07 LUAD LUAD 0.94 0.06 25 | TCGA-05-4427-01A-21R-1858-07 LUAD LUAD 0.914 0.086 26 | TCGA-05-4430-01A-02R-1206-07 LUAD LUAD 0.938 0.062 27 | TCGA-05-4432-01A-01R-1206-07 LUAD LUAD 0.932 0.068 28 | TCGA-05-4433-01A-22R-1858-07 LUAD LUAD 0.976 0.024 29 | TCGA-05-4434-01A-01R-1206-07 LUAD LUAD 0.776 0.224 30 | TCGA-05-5420-01A-01R-1628-07 LUAD LUAD 0.78 0.22 31 | TCGA-05-5423-01A-01R-1628-07 LUAD LUAD 0.952 0.048 32 | TCGA-05-5425-01A-02R-1628-07 LUAD LUAD 0.928 0.072 33 | TCGA-05-5428-01A-01R-1628-07 LUAD LUAD 0.834 0.166 34 | TCGA-05-5429-01A-01R-1628-07 LUAD LUAD 0.854 0.146 35 | TCGA-05-5715-01A-01R-1628-07 LUAD LUAD 0.96 0.04 36 | TCGA-35-3615-01A-01R-0946-07 LUAD LUAD 0.984 0.016 37 | TCGA-35-4122-01A-01R-1107-07 LUAD LUAD 0.748 0.252 38 | TCGA-35-4123-01A-01R-1107-07 LUAD LUAD 0.818 0.182 39 | TCGA-35-5375-01A-01R-1628-07 LUAD LUSC 0.44 0.56 40 | TCGA-38-4625-01A-01R-1206-07 LUAD LUAD 0.766 0.234 41 | TCGA-38-4626-01A-01R-1206-07 LUAD LUAD 0.964 0.036 42 | TCGA-38-4627-01A-01R-1206-07 LUAD LUAD 0.89 0.11 43 | TCGA-38-4628-01A-01R-1206-07 LUAD LUAD 0.896 0.104 44 | TCGA-38-4629-01A-02R-1206-07 LUAD LUAD 0.878 0.122 45 | TCGA-38-4630-01A-01R-1206-07 LUAD LUAD 0.52 0.48 46 | TCGA-38-4631-01A-01R-1755-07 LUAD LUAD 0.744 0.256 47 | TCGA-38-4632-01A-01R-1755-07 LUAD LUAD 0.888 0.112 48 | TCGA-38-6178-01A-11R-1755-07 LUAD LUAD 0.948 0.052 49 | TCGA-38-7271-01A-11R-2039-07 LUAD LUAD 0.944 0.056 50 | TCGA-44-2655-01A-01R-0946-07 LUAD LUAD 0.992 0.008 51 | TCGA-44-2656-01A-02R-0946-07 LUAD LUAD 0.988 0.012 52 | TCGA-44-2657-01A-01R-1107-07 LUAD LUAD 0.97 0.03 53 | TCGA-44-2659-01A-01R-0946-07 LUAD LUAD 0.976 0.024 54 | TCGA-44-2661-01A-01R-1107-07 LUAD LUAD 0.972 0.028 55 | TCGA-44-2662-01A-01R-0946-07 LUAD LUAD 0.926 0.074 56 | TCGA-44-2665-01A-01R-0946-07 LUAD LUAD 0.894 0.106 57 | TCGA-44-2666-01A-01R-0946-07 LUAD LUAD 0.978 0.022 58 | TCGA-44-2668-01A-01R-0946-07 LUAD LUAD 0.83 0.17 59 | TCGA-44-3396-01A-01R-1206-07 LUAD LUAD 0.946 0.054 60 | TCGA-44-3398-01A-01R-1107-07 LUAD LUAD 0.822 0.178 61 | TCGA-44-3918-01A-01R-1107-07 LUAD LUAD 0.934 0.066 62 | TCGA-44-3919-01A-02R-1107-07 LUAD LUAD 0.978 0.022 63 | TCGA-44-4112-01A-01R-1107-07 LUAD LUAD 0.944 0.056 64 | TCGA-44-5643-01A-01R-1628-07 LUAD LUSC 0.04 0.96 65 | TCGA-44-5644-01A-21R-2039-07 LUAD LUAD 0.838 0.162 66 | TCGA-44-5645-01A-01R-1628-07 LUAD LUAD 0.96 0.04 67 | TCGA-44-6145-01A-11R-1755-07 LUAD LUAD 0.982 0.018 68 | TCGA-44-6146-01A-11R-1755-07 LUAD LUAD 0.93 0.07 69 | TCGA-44-6147-01A-11R-1755-07 LUAD LUAD 0.946 0.054 70 | TCGA-44-6148-01A-11R-1755-07 LUAD LUAD 0.864 0.136 71 | TCGA-44-6774-01A-21R-1858-07 LUAD LUAD 0.898 0.102 72 | TCGA-44-6775-01A-11R-1858-07 LUAD LUAD 0.972 0.028 73 | TCGA-44-6776-01A-11R-1858-07 LUAD LUAD 0.976 0.024 74 | TCGA-44-6777-01A-11R-1858-07 LUAD LUAD 0.946 0.054 75 | TCGA-44-6778-01A-11R-1858-07 LUAD LUAD 0.878 0.122 76 | TCGA-44-6779-01A-11R-1858-07 LUAD LUAD 0.87 0.13 77 | TCGA-44-7659-01A-11R-2066-07 LUAD LUAD 0.98 0.02 78 | TCGA-44-7660-01A-11R-2066-07 LUAD LUSC 0.46 0.54 79 | TCGA-44-7661-01A-11R-2066-07 LUAD LUAD 0.906 0.094 80 | TCGA-44-7662-01A-11R-2066-07 LUAD LUAD 0.938 0.062 81 | TCGA-44-7667-01A-31R-2066-07 LUAD LUAD 0.568 0.432 82 | TCGA-44-7669-01A-21R-2066-07 LUAD LUAD 0.816 0.184 83 | TCGA-44-7670-01A-11R-2066-07 LUAD LUAD 0.744 0.256 84 | TCGA-44-7671-01A-11R-2066-07 LUAD LUAD 0.976 0.024 85 | TCGA-44-7672-01A-11R-2066-07 LUAD LUAD 0.988 0.012 86 | TCGA-44-8117-01A-11R-2241-07 LUAD LUAD 0.918 0.082 87 | TCGA-44-8119-01A-11R-2241-07 LUAD LUAD 0.878 0.122 88 | TCGA-44-8120-01A-11R-2241-07 LUAD LUAD 0.976 0.024 89 | TCGA-49-4486-01A-01R-1206-07 LUAD LUAD 0.938 0.062 90 | TCGA-49-4487-01A-21R-1858-07 LUAD LUAD 0.904 0.096 91 | TCGA-49-4488-01A-01R-1755-07 LUAD LUAD 0.918 0.082 92 | TCGA-49-4490-01A-21R-1858-07 LUAD LUAD 0.892 0.108 93 | TCGA-49-4494-01A-01R-1206-07 LUAD LUAD 0.89 0.11 94 | TCGA-49-4501-01A-01R-1206-07 LUAD LUAD 0.976 0.024 95 | TCGA-49-4505-01A-01R-1206-07 LUAD LUAD 0.978 0.022 96 | TCGA-49-4506-01A-01R-1206-07 LUAD LUAD 0.698 0.302 97 | TCGA-49-4507-01A-01R-1206-07 LUAD LUAD 0.798 0.202 98 | TCGA-49-4510-01A-01R-1206-07 LUAD LUAD 0.966 0.034 99 | TCGA-49-4512-01A-21R-1858-07 LUAD LUAD 0.958 0.042 100 | TCGA-49-4514-01A-21R-1858-07 LUAD LUAD 0.824 0.176 101 | TCGA-49-6742-01A-11R-1858-07 LUAD LUAD 0.954 0.046 102 | TCGA-49-6743-01A-11R-1858-07 LUAD LUAD 0.868 0.132 103 | TCGA-49-6744-01A-11R-1858-07 LUAD LUAD 0.998 0.002 104 | TCGA-49-6745-01A-11R-1858-07 LUAD LUAD 0.956 0.044 105 | TCGA-49-6761-01A-31R-1949-07 LUAD LUAD 0.84 0.16 106 | TCGA-49-6767-01A-11R-1858-07 LUAD LUAD 0.808 0.192 107 | TCGA-50-5044-01A-21R-1858-07 LUAD LUAD 0.768 0.232 108 | TCGA-50-5049-01A-01R-1628-07 LUAD LUAD 0.922 0.078 109 | TCGA-50-5051-01A-21R-1858-07 LUAD LUAD 0.912 0.088 110 | TCGA-50-5055-01A-01R-1628-07 LUAD LUAD 0.912 0.088 111 | TCGA-50-5066-01A-01R-1628-07 LUAD LUAD 0.762 0.238 112 | TCGA-50-5066-02A-11R-2090-07 LUAD LUAD 0.938 0.062 113 | TCGA-50-5068-01A-01R-1628-07 LUAD LUAD 0.856 0.144 114 | TCGA-50-5072-01A-21R-1858-07 LUAD LUAD 0.848 0.152 115 | TCGA-50-5931-01A-11R-1755-07 LUAD LUSC 0.226 0.774 116 | TCGA-50-5932-01A-11R-1755-07 LUAD LUAD 0.954 0.046 117 | TCGA-50-5933-01A-11R-1755-07 LUAD LUAD 0.89 0.11 118 | TCGA-50-5935-01A-11R-1755-07 LUAD LUAD 0.984 0.016 119 | TCGA-50-5936-01A-11R-1628-07 LUAD LUAD 0.95 0.05 120 | TCGA-50-5939-01A-11R-1628-07 LUAD LUAD 0.918 0.082 121 | TCGA-50-5941-01A-11R-1755-07 LUAD LUAD 0.962 0.038 122 | TCGA-50-5942-01A-21R-1755-07 LUAD LUAD 0.974 0.026 123 | TCGA-50-5944-01A-11R-1755-07 LUAD LUAD 0.99 0.01 124 | TCGA-50-5946-01A-11R-1755-07 LUAD LUAD 0.854 0.146 125 | TCGA-50-5946-02A-11R-2090-07 LUAD LUAD 0.904 0.096 126 | TCGA-50-6590-01A-12R-1858-07 LUAD LUAD 0.682 0.318 127 | TCGA-50-6591-01A-11R-1755-07 LUAD LUAD 0.632 0.368 128 | TCGA-50-6592-01A-11R-1755-07 LUAD LUAD 0.792 0.208 129 | TCGA-50-6593-01A-11R-1755-07 LUAD LUAD 0.97 0.03 130 | TCGA-50-6594-01A-11R-1755-07 LUAD LUAD 0.832 0.168 131 | TCGA-50-6595-01A-12R-1858-07 LUAD LUAD 0.74 0.26 132 | TCGA-50-6597-01A-11R-1858-07 LUAD LUAD 0.87 0.13 133 | TCGA-50-6673-01A-11R-1949-07 LUAD LUAD 0.934 0.066 134 | TCGA-50-7109-01A-11R-2039-07 LUAD LUAD 0.95 0.05 135 | TCGA-53-7624-01A-11R-2066-07 LUAD LUAD 0.576 0.424 136 | TCGA-53-7626-01A-12R-2066-07 LUAD LUAD 0.986 0.014 137 | TCGA-53-7813-01A-11R-2170-07 LUAD LUAD 0.952 0.048 138 | TCGA-55-1592-01A-01R-0946-07 LUAD LUAD 0.96 0.04 139 | TCGA-55-1594-01A-01R-0946-07 LUAD LUAD 0.79 0.21 140 | TCGA-55-1595-01A-01R-0946-07 LUAD LUAD 0.968 0.032 141 | TCGA-55-1596-01A-01R-0946-07 LUAD LUAD 0.844 0.156 142 | TCGA-55-5899-01A-11R-1628-07 LUAD LUAD 0.73 0.27 143 | TCGA-55-6543-01A-11R-1755-07 LUAD LUAD 0.98 0.02 144 | TCGA-55-6642-01A-11R-1858-07 LUAD LUAD 0.962 0.038 145 | TCGA-55-6712-01A-11R-1858-07 LUAD LUAD 0.944 0.056 146 | TCGA-55-6968-01A-11R-1949-07 LUAD LUAD 0.544 0.456 147 | TCGA-55-6969-01A-11R-1949-07 LUAD LUAD 0.84 0.16 148 | TCGA-55-6970-01A-11R-1949-07 LUAD LUAD 0.97 0.03 149 | TCGA-55-6971-01A-11R-1949-07 LUAD LUAD 0.964 0.036 150 | TCGA-55-6972-01A-11R-1949-07 LUAD LUAD 0.924 0.076 151 | TCGA-55-6975-01A-11R-1949-07 LUAD LUAD 0.902 0.098 152 | TCGA-55-6978-01A-11R-1949-07 LUAD LUAD 0.91 0.09 153 | TCGA-55-6979-01A-11R-1949-07 LUAD LUAD 0.946 0.054 154 | TCGA-55-6980-01A-11R-1949-07 LUAD LUAD 0.968 0.032 155 | TCGA-55-6981-01A-11R-1949-07 LUAD LUAD 0.95 0.05 156 | TCGA-55-6982-01A-11R-1949-07 LUAD LUAD 0.97 0.03 157 | TCGA-55-6983-01A-11R-1949-07 LUAD LUAD 0.988 0.012 158 | TCGA-55-6984-01A-11R-1949-07 LUAD LUAD 0.888 0.112 159 | TCGA-55-6985-01A-11R-1949-07 LUAD LUAD 0.964 0.036 160 | TCGA-55-6986-01A-11R-1949-07 LUAD LUAD 0.978 0.022 161 | TCGA-55-6987-01A-11R-1949-07 LUAD LUAD 0.95 0.05 162 | TCGA-55-7227-01A-11R-2039-07 LUAD LUAD 0.992 0.008 163 | TCGA-55-7281-01A-11R-2039-07 LUAD LUAD 0.954 0.046 164 | TCGA-55-7283-01A-11R-2039-07 LUAD LUAD 0.99 0.01 165 | TCGA-55-7284-01B-11R-2241-07 LUAD LUAD 0.974 0.026 166 | TCGA-55-7570-01A-11R-2039-07 LUAD LUAD 0.58 0.42 167 | TCGA-55-7573-01A-11R-2039-07 LUAD LUAD 0.982 0.018 168 | TCGA-55-7574-01A-11R-2039-07 LUAD LUAD 0.98 0.02 169 | TCGA-55-7576-01A-11R-2066-07 LUAD LUAD 0.97 0.03 170 | TCGA-55-7724-01A-11R-2170-07 LUAD LUSC 0.228 0.772 171 | TCGA-55-7725-01A-11R-2170-07 LUAD LUAD 0.936 0.064 172 | TCGA-55-7726-01A-11R-2170-07 LUAD LUSC 0.368 0.632 173 | TCGA-55-7727-01A-11R-2170-07 LUAD LUAD 0.944 0.056 174 | TCGA-55-7728-01A-11R-2187-07 LUAD LUAD 0.886 0.114 175 | TCGA-55-7815-01A-11R-2170-07 LUAD LUAD 0.848 0.152 176 | TCGA-55-7903-01A-11R-2170-07 LUAD LUAD 0.966 0.034 177 | TCGA-55-7907-01A-11R-2170-07 LUAD LUAD 0.956 0.044 178 | TCGA-55-7910-01A-11R-2170-07 LUAD LUAD 0.916 0.084 179 | TCGA-55-7911-01A-11R-2170-07 LUAD LUAD 0.966 0.034 180 | TCGA-55-7913-01B-11R-2241-07 LUAD LUAD 0.904 0.096 181 | TCGA-55-7914-01A-11R-2170-07 LUAD LUAD 0.956 0.044 182 | TCGA-55-7994-01A-11R-2187-07 LUAD LUAD 0.906 0.094 183 | TCGA-55-7995-01A-11R-2187-07 LUAD LUAD 0.756 0.244 184 | TCGA-55-8085-01A-11R-2241-07 LUAD LUAD 0.97 0.03 185 | TCGA-55-8087-01A-11R-2241-07 LUAD LUAD 0.976 0.024 186 | TCGA-55-8089-01A-11R-2241-07 LUAD LUAD 0.902 0.098 187 | TCGA-55-8090-01A-11R-2241-07 LUAD LUAD 0.96 0.04 188 | TCGA-55-8091-01A-11R-2241-07 LUAD LUAD 0.948 0.052 189 | TCGA-55-8092-01A-11R-2241-07 LUAD LUAD 0.924 0.076 190 | TCGA-55-8094-01A-11R-2241-07 LUAD LUAD 0.842 0.158 191 | TCGA-55-8096-01A-11R-2241-07 LUAD LUAD 0.948 0.052 192 | TCGA-55-8097-01A-11R-2241-07 LUAD LUAD 0.986 0.014 193 | TCGA-55-8203-01A-11R-2241-07 LUAD LUAD 0.992 0.008 194 | TCGA-55-8204-01A-11R-2241-07 LUAD LUSC 0.184 0.816 195 | TCGA-55-8205-01A-11R-2241-07 LUAD LUAD 0.922 0.078 196 | TCGA-55-8206-01A-11R-2241-07 LUAD LUAD 0.97 0.03 197 | TCGA-55-8207-01A-11R-2241-07 LUAD LUAD 0.976 0.024 198 | TCGA-55-8208-01A-11R-2241-07 LUAD LUAD 0.954 0.046 199 | TCGA-55-8299-01A-11R-2287-07 LUAD LUAD 0.952 0.048 200 | TCGA-55-8301-01A-11R-2287-07 LUAD LUAD 0.956 0.044 201 | TCGA-64-1676-01A-01R-0946-07 LUAD LUAD 0.836 0.164 202 | TCGA-64-1677-01A-01R-0946-07 LUAD LUAD 0.89 0.11 203 | TCGA-64-1678-01A-01R-0946-07 LUAD LUAD 0.7 0.3 204 | TCGA-64-1679-01A-21R-2066-07 LUAD LUAD 0.904 0.096 205 | TCGA-64-1680-01A-02R-0946-07 LUAD LUAD 0.966 0.034 206 | TCGA-64-1681-01A-11R-2066-07 LUAD LUAD 0.968 0.032 207 | TCGA-64-5774-01A-01R-1628-07 LUAD LUAD 0.938 0.062 208 | TCGA-64-5775-01A-01R-1628-07 LUAD LUAD 0.688 0.312 209 | TCGA-64-5778-01A-01R-1628-07 LUAD LUAD 0.936 0.064 210 | TCGA-64-5779-01A-01R-1628-07 LUAD LUAD 0.908 0.092 211 | TCGA-64-5781-01A-01R-1628-07 LUAD LUAD 0.808 0.192 212 | TCGA-64-5815-01A-01R-1628-07 LUAD LUAD 0.872 0.128 213 | TCGA-67-3770-01A-01R-0946-07 LUAD LUAD 0.94 0.06 214 | TCGA-67-3771-01A-01R-0946-07 LUAD LUAD 0.81 0.19 215 | TCGA-67-3772-01A-01R-0946-07 LUAD LUAD 0.954 0.046 216 | TCGA-67-3773-01A-01R-0946-07 LUAD LUAD 0.94 0.06 217 | TCGA-67-3774-01A-01R-0946-07 LUAD LUAD 0.978 0.022 218 | TCGA-67-4679-01B-01R-1755-07 LUAD LUAD 0.988 0.012 219 | TCGA-67-6215-01A-11R-1755-07 LUAD LUAD 0.974 0.026 220 | TCGA-67-6216-01A-11R-1755-07 LUAD LUAD 0.98 0.02 221 | TCGA-67-6217-01A-11R-1755-07 LUAD LUAD 0.982 0.018 222 | TCGA-69-7760-01A-11R-2170-07 LUAD LUAD 0.904 0.096 223 | TCGA-69-7761-01A-11R-2170-07 LUAD LUAD 0.938 0.062 224 | TCGA-69-7763-01A-11R-2170-07 LUAD LUAD 0.96 0.04 225 | TCGA-69-7764-01A-11R-2170-07 LUAD LUAD 0.974 0.026 226 | TCGA-69-7765-01A-11R-2170-07 LUAD LUAD 0.98 0.02 227 | TCGA-69-7973-01A-11R-2187-07 LUAD LUAD 0.948 0.052 228 | TCGA-69-7974-01A-11R-2187-07 LUAD LUAD 0.982 0.018 229 | TCGA-69-7978-01A-11R-2187-07 LUAD LUAD 0.958 0.042 230 | TCGA-69-7979-01A-11R-2187-07 LUAD LUAD 0.772 0.228 231 | TCGA-69-7980-01A-11R-2187-07 LUAD LUAD 0.956 0.044 232 | TCGA-69-8253-01A-11R-2287-07 LUAD LUAD 0.958 0.042 233 | TCGA-69-8254-01A-11R-2287-07 LUAD LUAD 0.982 0.018 234 | TCGA-69-8255-01A-11R-2287-07 LUAD LUAD 0.7 0.3 235 | TCGA-71-6725-01A-11R-1858-07 LUAD LUAD 0.934 0.066 236 | TCGA-73-4658-01A-01R-1755-07 LUAD LUAD 0.968 0.032 237 | TCGA-73-4659-01A-01R-1206-07 LUAD LUAD 0.976 0.024 238 | TCGA-73-4662-01A-01R-1206-07 LUAD LUAD 0.99 0.01 239 | TCGA-73-4666-01A-01R-1206-07 LUAD LUAD 0.852 0.148 240 | TCGA-73-4668-01A-01R-1206-07 LUAD LUAD 0.94 0.06 241 | TCGA-73-4670-01A-01R-1206-07 LUAD LUAD 0.81 0.19 242 | TCGA-73-4675-01A-01R-1206-07 LUAD LUAD 0.898 0.102 243 | TCGA-73-4676-01A-01R-1755-07 LUAD LUAD 0.886 0.114 244 | TCGA-73-4677-01A-01R-1206-07 LUAD LUAD 0.982 0.018 245 | TCGA-73-7498-01A-12R-2187-07 LUAD LUAD 0.996 0.004 246 | TCGA-73-7499-01A-11R-2187-07 LUAD LUAD 0.872 0.128 247 | TCGA-75-5122-01A-01R-1755-07 LUAD LUAD 0.88 0.12 248 | TCGA-75-5125-01A-01R-1755-07 LUAD LUAD 0.896 0.104 249 | TCGA-75-5126-01A-01R-1755-07 LUAD LUAD 0.93 0.07 250 | TCGA-75-5146-01A-01R-1628-07 LUAD LUAD 0.944 0.056 251 | TCGA-75-5147-01A-01R-1628-07 LUAD LUAD 0.87 0.13 252 | TCGA-75-6203-01A-11R-1755-07 LUAD LUAD 0.952 0.048 253 | TCGA-75-6205-01A-11R-1755-07 LUAD LUAD 0.892 0.108 254 | TCGA-75-6206-01A-11R-1755-07 LUAD LUAD 0.96 0.04 255 | TCGA-75-6207-01A-11R-1755-07 LUAD LUAD 0.884 0.116 256 | TCGA-75-6211-01A-11R-1755-07 LUAD LUAD 0.838 0.162 257 | TCGA-75-6212-01A-11R-1755-07 LUAD LUAD 0.968 0.032 258 | TCGA-75-6214-01A-41R-1949-07 LUAD LUSC 0.24 0.76 259 | TCGA-75-7025-01A-12R-1949-07 LUAD LUAD 0.952 0.048 260 | TCGA-75-7027-01A-11R-1949-07 LUAD LUAD 0.9 0.1 261 | TCGA-75-7030-01A-11R-1949-07 LUAD LUAD 0.906 0.094 262 | TCGA-75-7031-01A-11R-1949-07 LUAD LUAD 0.912 0.088 263 | TCGA-78-7143-01A-11R-2039-07 LUAD LUAD 0.99 0.01 264 | TCGA-78-7145-01A-11R-2039-07 LUAD LUAD 0.972 0.028 265 | TCGA-78-7146-01A-11R-2039-07 LUAD LUAD 0.8 0.2 266 | TCGA-78-7147-01A-11R-2039-07 LUAD LUAD 0.94 0.06 267 | TCGA-78-7148-01A-11R-2039-07 LUAD LUAD 0.994 0.006 268 | TCGA-78-7149-01A-11R-2039-07 LUAD LUAD 0.974 0.026 269 | TCGA-78-7150-01A-21R-2039-07 LUAD LUAD 0.824 0.176 270 | TCGA-78-7152-01A-11R-2039-07 LUAD LUAD 0.95 0.05 271 | TCGA-78-7153-01A-11R-2039-07 LUAD LUAD 0.956 0.044 272 | TCGA-78-7154-01A-11R-2039-07 LUAD LUAD 0.788 0.212 273 | TCGA-78-7155-01A-11R-2039-07 LUAD LUAD 0.594 0.406 274 | TCGA-78-7156-01A-11R-2039-07 LUAD LUAD 0.96 0.04 275 | TCGA-78-7158-01A-11R-2039-07 LUAD LUAD 0.966 0.034 276 | TCGA-78-7159-01A-11R-2039-07 LUAD LUAD 0.978 0.022 277 | TCGA-78-7160-01A-11R-2039-07 LUAD LUAD 0.978 0.022 278 | TCGA-78-7161-01A-11R-2039-07 LUAD LUAD 0.978 0.022 279 | TCGA-78-7162-01A-21R-2066-07 LUAD LUAD 0.974 0.026 280 | TCGA-78-7163-01A-12R-2066-07 LUAD LUAD 0.908 0.092 281 | TCGA-78-7166-01A-12R-2066-07 LUAD LUAD 0.946 0.054 282 | TCGA-78-7167-01A-11R-2066-07 LUAD LUAD 0.974 0.026 283 | TCGA-78-7220-01A-11R-2039-07 LUAD LUAD 0.826 0.174 284 | TCGA-78-7535-01A-11R-2066-07 LUAD LUAD 0.87 0.13 285 | TCGA-78-7536-01A-11R-2066-07 LUAD LUAD 0.83 0.17 286 | TCGA-78-7537-01A-11R-2066-07 LUAD LUAD 0.982 0.018 287 | TCGA-78-7539-01A-11R-2066-07 LUAD LUAD 0.974 0.026 288 | TCGA-78-7540-01A-11R-2066-07 LUAD LUAD 0.954 0.046 289 | TCGA-78-7542-01A-21R-2066-07 LUAD LUAD 0.626 0.374 290 | TCGA-78-7633-01A-11R-2066-07 LUAD LUAD 0.976 0.024 291 | TCGA-80-5607-01A-31R-1949-07 LUAD LUAD 0.968 0.032 292 | TCGA-80-5608-01A-31R-1949-07 LUAD LUAD 0.974 0.026 293 | TCGA-80-5611-01A-01R-1628-07 LUAD LUAD 0.872 0.128 294 | TCGA-83-5908-01A-21R-2287-07 LUAD LUAD 0.856 0.144 295 | TCGA-86-6562-01A-11R-1755-07 LUAD LUAD 0.988 0.012 296 | TCGA-86-6851-01A-11R-1949-07 LUAD LUAD 0.924 0.076 297 | TCGA-86-7701-01A-11R-2170-07 LUAD LUAD 0.932 0.068 298 | TCGA-86-7711-01A-11R-2066-07 LUAD LUAD 0.726 0.274 299 | TCGA-86-7713-01A-11R-2066-07 LUAD LUAD 0.91 0.09 300 | TCGA-86-7714-01A-12R-2170-07 LUAD LUAD 0.97 0.03 301 | TCGA-86-7953-01A-11R-2187-07 LUAD LUAD 0.9 0.1 302 | TCGA-86-7954-01A-11R-2187-07 LUAD LUAD 0.954 0.046 303 | TCGA-86-7955-01A-11R-2187-07 LUAD LUAD 0.878 0.122 304 | TCGA-86-8054-01A-11R-2241-07 LUAD LUAD 0.846 0.154 305 | TCGA-86-8055-01A-11R-2241-07 LUAD LUAD 0.956 0.044 306 | TCGA-86-8056-01A-11R-2241-07 LUAD LUAD 0.97 0.03 307 | TCGA-86-8073-01A-11R-2241-07 LUAD LUAD 0.912 0.088 308 | TCGA-86-8074-01A-11R-2241-07 LUAD LUAD 0.984 0.016 309 | TCGA-86-8075-01A-11R-2241-07 LUAD LUAD 0.956 0.044 310 | TCGA-86-8076-01A-31R-2241-07 LUAD LUAD 0.988 0.012 311 | TCGA-86-8279-01A-11R-2287-07 LUAD LUAD 0.944 0.056 312 | TCGA-86-8280-01A-11R-2287-07 LUAD LUAD 0.986 0.014 313 | TCGA-86-8281-01A-11R-2287-07 LUAD LUAD 0.99 0.01 314 | TCGA-91-6828-01A-11R-1858-07 LUAD LUAD 0.97 0.03 315 | TCGA-91-6829-01A-21R-1858-07 LUAD LUAD 0.928 0.072 316 | TCGA-91-6830-01A-11R-1949-07 LUAD LUAD 0.916 0.084 317 | TCGA-91-6831-01A-11R-1858-07 LUAD LUAD 0.894 0.106 318 | TCGA-91-6835-01A-11R-1858-07 LUAD LUAD 0.936 0.064 319 | TCGA-91-6836-01A-21R-1858-07 LUAD LUAD 0.832 0.168 320 | TCGA-91-6840-01A-11R-1949-07 LUAD LUAD 0.798 0.202 321 | TCGA-91-6847-01A-11R-1949-07 LUAD LUAD 0.736 0.264 322 | TCGA-91-6848-01A-11R-1949-07 LUAD LUAD 0.606 0.394 323 | TCGA-91-6849-01A-11R-1949-07 LUAD LUAD 0.974 0.026 324 | TCGA-91-7771-01A-11R-2170-07 LUAD LUAD 0.99 0.01 325 | TCGA-93-7347-01A-11R-2187-07 LUAD LUAD 0.99 0.01 326 | TCGA-93-7348-01A-21R-2039-07 LUAD LUAD 0.978 0.022 327 | TCGA-93-8067-01A-11R-2287-07 LUAD LUAD 0.91 0.09 328 | TCGA-95-7039-01A-11R-1949-07 LUAD LUAD 0.928 0.072 329 | TCGA-95-7043-01A-11R-1949-07 LUAD LUAD 0.786 0.214 330 | TCGA-95-7562-01A-11R-2241-07 LUAD LUAD 0.87 0.13 331 | TCGA-95-7567-01A-11R-2066-07 LUAD LUAD 0.966 0.034 332 | TCGA-95-7944-01A-11R-2187-07 LUAD LUAD 0.776 0.224 333 | TCGA-95-7947-01A-11R-2187-07 LUAD LUAD 0.834 0.166 334 | TCGA-95-7948-01A-11R-2187-07 LUAD LUAD 0.828 0.172 335 | TCGA-95-8039-01A-11R-2241-07 LUAD LUAD 0.998 0.002 336 | TCGA-97-7546-01A-11R-2039-07 LUAD LUAD 0.966 0.034 337 | TCGA-97-7547-01A-11R-2039-07 LUAD LUAD 0.958 0.042 338 | TCGA-97-7552-01A-11R-2039-07 LUAD LUAD 0.97 0.03 339 | TCGA-97-7553-01A-21R-2039-07 LUAD LUAD 0.978 0.022 340 | TCGA-97-7554-01A-11R-2039-07 LUAD LUAD 0.972 0.028 341 | TCGA-97-7937-01A-11R-2170-07 LUAD LUAD 0.98 0.02 342 | TCGA-97-7938-01A-11R-2170-07 LUAD LUAD 0.978 0.022 343 | TCGA-97-7941-01A-11R-2187-07 LUAD LUAD 0.968 0.032 344 | TCGA-97-8171-01A-11R-2287-07 LUAD LUAD 0.888 0.112 345 | TCGA-97-8172-01A-11R-2287-07 LUAD LUAD 0.984 0.016 346 | TCGA-97-8174-01A-11R-2287-07 LUAD LUAD 0.932 0.068 347 | TCGA-97-8175-01A-11R-2287-07 LUAD LUAD 0.956 0.044 348 | TCGA-97-8177-01A-11R-2287-07 LUAD LUAD 0.96 0.04 349 | TCGA-97-8179-01A-11R-2287-07 LUAD LUAD 0.97 0.03 350 | TCGA-99-7458-01A-11R-2039-07 LUAD LUAD 0.984 0.016 351 | TCGA-99-8025-01A-11R-2241-07 LUAD LUAD 0.958 0.042 352 | TCGA-99-8028-01A-11R-2241-07 LUAD LUAD 0.972 0.028 353 | TCGA-99-8032-01A-11R-2241-07 LUAD LUAD 0.958 0.042 354 | TCGA-99-8033-01A-11R-2241-07 LUAD LUAD 0.944 0.056 355 | TCGA-J2-8192-01A-11R-2241-07 LUAD LUAD 0.962 0.038 356 | TCGA-J2-8194-01A-11R-2241-07 LUAD LUAD 0.984 0.016 357 | TCGA-18-3406-01A-01R-0980-07 LUSC LUSC 0.102 0.898 358 | TCGA-18-3407-01A-01R-0980-07 LUSC LUSC 0.024 0.976 359 | TCGA-18-3408-01A-01R-0980-07 LUSC LUSC 0.22 0.78 360 | TCGA-18-3409-01A-01R-0980-07 LUSC LUSC 0.26 0.74 361 | TCGA-18-3410-01A-01R-0980-07 LUSC LUSC 0.18 0.82 362 | TCGA-18-3411-01A-01R-0980-07 LUSC LUSC 0.028 0.972 363 | TCGA-18-3412-01A-01R-0980-07 LUSC LUSC 0.024 0.976 364 | TCGA-18-3414-01A-01R-0980-07 LUSC LUSC 0.024 0.976 365 | TCGA-18-3415-01A-01R-0980-07 LUSC LUSC 0.02 0.98 366 | TCGA-18-3416-01A-01R-0980-07 LUSC LUSC 0.128 0.872 367 | TCGA-18-3417-01A-01R-1443-07 LUSC LUSC 0.156 0.844 368 | TCGA-18-3419-01A-01R-0980-07 LUSC LUSC 0.14 0.86 369 | TCGA-18-3421-01A-01R-0980-07 LUSC LUSC 0.086 0.914 370 | TCGA-18-4083-01A-01R-1100-07 LUSC LUSC 0.052 0.948 371 | TCGA-18-4086-01A-01R-1100-07 LUSC LUSC 0.004 0.996 372 | TCGA-18-4721-01A-01R-1443-07 LUSC LUSC 0.072 0.928 373 | TCGA-18-5592-01A-01R-1635-07 LUSC LUSC 0.048 0.952 374 | TCGA-18-5595-01A-01R-1635-07 LUSC LUSC 0.076 0.924 375 | TCGA-21-1070-01A-01R-0692-07 LUSC LUSC 0.118 0.882 376 | TCGA-21-1071-01A-01R-0692-07 LUSC LUSC 0.11 0.89 377 | TCGA-21-1072-01A-01R-0692-07 LUSC LUSC 0.042 0.958 378 | TCGA-21-1075-01A-01R-0692-07 LUSC LUSC 0.05 0.95 379 | TCGA-21-1076-01A-02R-0692-07 LUSC LUSC 0.138 0.862 380 | TCGA-21-1077-01A-01R-0692-07 LUSC LUSC 0.01 0.99 381 | TCGA-21-1078-01A-01R-0692-07 LUSC LUSC 0.444 0.556 382 | TCGA-21-1079-01A-01R-0692-07 LUSC LUSC 0.132 0.868 383 | TCGA-21-1080-01A-01R-0692-07 LUSC LUSC 0.056 0.944 384 | TCGA-21-1081-01A-01R-0692-07 LUSC LUSC 0.098 0.902 385 | TCGA-21-1082-01A-01R-0692-07 LUSC LUSC 0.062 0.938 386 | TCGA-21-1083-01A-01R-0692-07 LUSC LUSC 0.246 0.754 387 | TCGA-21-5782-01A-01R-1635-07 LUSC LUSC 0.128 0.872 388 | TCGA-21-5784-01A-01R-1635-07 LUSC LUSC 0.086 0.914 389 | TCGA-21-5786-01A-01R-1635-07 LUSC LUSC 0.04 0.96 390 | TCGA-21-5787-01A-01R-1635-07 LUSC LUAD 0.662 0.338 391 | TCGA-22-0940-01A-01R-0692-07 LUSC LUSC 0.16 0.84 392 | TCGA-22-0944-01A-01R-0692-07 LUSC LUSC 0.028 0.972 393 | TCGA-22-1002-01A-01R-0692-07 LUSC LUSC 0.204 0.796 394 | TCGA-22-1011-01A-01R-0692-07 LUSC LUSC 0.072 0.928 395 | TCGA-22-1012-01A-01R-0692-07 LUSC LUSC 0.036 0.964 396 | TCGA-22-1016-01A-01R-0692-07 LUSC LUSC 0.412 0.588 397 | TCGA-22-1017-01A-01R-0692-07 LUSC LUAD 0.874 0.126 398 | TCGA-22-4591-01A-01R-1201-07 LUSC LUSC 0.208 0.792 399 | TCGA-22-4593-01A-21R-1820-07 LUSC LUSC 0.068 0.932 400 | TCGA-22-4594-01A-01R-1201-07 LUSC LUAD 0.892 0.108 401 | TCGA-22-4595-01A-01R-1201-07 LUSC LUSC 0.058 0.942 402 | TCGA-22-4596-01A-01R-1201-07 LUSC LUAD 0.936 0.064 403 | TCGA-22-4599-01A-01R-1443-07 LUSC LUSC 0.068 0.932 404 | TCGA-22-4601-01A-01R-1443-07 LUSC LUSC 0.054 0.946 405 | TCGA-22-4604-01A-01R-1201-07 LUSC LUSC 0.018 0.982 406 | TCGA-22-4607-01A-01R-1201-07 LUSC LUSC 0.088 0.912 407 | TCGA-22-4613-01A-01R-1443-07 LUSC LUSC 0.022 0.978 408 | TCGA-22-5471-01A-01R-1635-07 LUSC LUSC 0.04 0.96 409 | TCGA-22-5472-01A-01R-1635-07 LUSC LUSC 0.09 0.91 410 | TCGA-22-5473-01A-01R-1635-07 LUSC LUSC 0.008 0.992 411 | TCGA-22-5474-01A-01R-1635-07 LUSC LUSC 0.048 0.952 412 | TCGA-22-5477-01A-01R-1635-07 LUSC LUSC 0.094 0.906 413 | TCGA-22-5478-01A-01R-1635-07 LUSC LUSC 0.278 0.722 414 | TCGA-22-5479-01A-31R-1949-07 LUSC LUSC 0.01 0.99 415 | TCGA-22-5480-01A-01R-1635-07 LUSC LUSC 0.196 0.804 416 | TCGA-22-5481-01A-31R-1949-07 LUSC LUSC 0.422 0.578 417 | TCGA-22-5482-01A-01R-1635-07 LUSC LUSC 0.014 0.986 418 | TCGA-22-5483-01A-01R-1820-07 LUSC LUSC 0.408 0.592 419 | TCGA-22-5485-01A-01R-1635-07 LUSC LUSC 0.04 0.96 420 | TCGA-22-5489-01A-01R-1635-07 LUSC LUSC 0.106 0.894 421 | TCGA-22-5491-01A-01R-1635-07 LUSC LUSC 0.024 0.976 422 | TCGA-22-5492-01A-01R-1635-07 LUSC LUSC 0.106 0.894 423 | TCGA-33-4532-01A-01R-1201-07 LUSC LUSC 0.028 0.972 424 | TCGA-33-4533-01A-01R-1201-07 LUSC LUSC 0.194 0.806 425 | TCGA-33-4538-01A-01R-1201-07 LUSC LUSC 0.014 0.986 426 | TCGA-33-4547-01A-01R-1201-07 LUSC LUSC 0.044 0.956 427 | TCGA-33-4566-01A-01R-1443-07 LUSC LUSC 0.486 0.514 428 | TCGA-33-4582-01A-01R-1443-07 LUSC LUSC 0.084 0.916 429 | TCGA-33-4583-01A-01R-1443-07 LUSC LUSC 0.04 0.96 430 | TCGA-33-4586-01A-01R-1443-07 LUSC LUSC 0.03 0.97 431 | TCGA-33-6737-01A-11R-1820-07 LUSC LUAD 0.754 0.246 432 | TCGA-33-6738-01A-11R-1949-07 LUSC LUSC 0.376 0.624 433 | TCGA-34-2596-01A-01R-0851-07 LUSC LUSC 0.05 0.95 434 | TCGA-34-2600-01A-01R-0851-07 LUSC LUSC 0.076 0.924 435 | TCGA-34-2608-01A-02R-0851-07 LUSC LUSC 0.046 0.954 436 | TCGA-34-5231-01A-21R-1820-07 LUSC LUSC 0.094 0.906 437 | TCGA-34-5232-01A-21R-1820-07 LUSC LUSC 0.024 0.976 438 | TCGA-34-5234-01A-01R-1635-07 LUSC LUSC 0.282 0.718 439 | TCGA-34-5236-01A-21R-1820-07 LUSC LUSC 0.016 0.984 440 | TCGA-34-5239-01A-21R-1820-07 LUSC LUSC 0.138 0.862 441 | TCGA-34-5240-01A-01R-1443-07 LUSC LUSC 0.064 0.936 442 | TCGA-34-5241-01A-01R-1443-07 LUSC LUSC 0.024 0.976 443 | TCGA-34-5927-01A-11R-1820-07 LUSC LUSC 0.184 0.816 444 | TCGA-34-5928-01A-11R-1820-07 LUSC LUSC 0.15 0.85 445 | TCGA-34-5929-01A-11R-1820-07 LUSC LUSC 0.086 0.914 446 | TCGA-34-7107-01A-11R-1949-07 LUSC LUSC 0.034 0.966 447 | TCGA-37-3783-01A-01R-1201-07 LUSC LUSC 0.1 0.9 448 | TCGA-37-3789-01A-01R-0980-07 LUSC LUSC 0.06 0.94 449 | TCGA-37-3792-01A-01R-0980-07 LUSC LUAD 0.504 0.496 450 | TCGA-37-4129-01A-01R-1100-07 LUSC LUAD 0.594 0.406 451 | TCGA-37-4130-01A-01R-1100-07 LUSC LUAD 0.598 0.402 452 | TCGA-37-4132-01A-01R-1100-07 LUSC LUSC 0.428 0.572 453 | TCGA-37-4133-01A-01R-1100-07 LUSC LUSC 0.172 0.828 454 | TCGA-37-4135-01A-01R-1100-07 LUSC LUSC 0.344 0.656 455 | TCGA-37-4141-01A-02R-1100-07 LUSC LUSC 0.434 0.566 456 | TCGA-37-5819-01A-01R-1635-07 LUSC LUAD 0.54 0.46 457 | TCGA-39-5011-01A-01R-1443-07 LUSC LUAD 0.838 0.162 458 | TCGA-39-5016-01A-01R-1443-07 LUSC LUSC 0.044 0.956 459 | TCGA-39-5019-01A-01R-1820-07 LUSC LUSC 0.004 0.996 460 | TCGA-39-5021-01A-01R-1443-07 LUSC LUSC 0.08 0.92 461 | TCGA-39-5024-01A-21R-1820-07 LUSC LUSC 0.094 0.906 462 | TCGA-39-5027-01A-21R-1820-07 LUSC LUSC 0.02 0.98 463 | TCGA-39-5028-01A-01R-1443-07 LUSC LUSC 0.166 0.834 464 | TCGA-39-5029-01A-01R-1443-07 LUSC LUSC 0.062 0.938 465 | TCGA-39-5030-01A-01R-1443-07 LUSC LUSC 0.082 0.918 466 | TCGA-39-5031-01A-01R-1443-07 LUSC LUSC 0.022 0.978 467 | TCGA-39-5034-01A-01R-1443-07 LUSC LUAD 0.802 0.198 468 | TCGA-39-5035-01A-01R-1443-07 LUSC LUSC 0.144 0.856 469 | TCGA-39-5036-01A-01R-1443-07 LUSC LUSC 0.048 0.952 470 | TCGA-39-5037-01A-01R-1443-07 LUSC LUSC 0.066 0.934 471 | TCGA-39-5039-01A-01R-1443-07 LUSC LUSC 0.288 0.712 472 | TCGA-43-2578-01A-01R-0851-07 LUSC LUSC 0.422 0.578 473 | TCGA-43-2581-01A-01R-0851-07 LUSC LUAD 0.838 0.162 474 | TCGA-43-3394-01A-01R-0980-07 LUSC LUSC 0.03 0.97 475 | TCGA-43-3920-01A-01R-0980-07 LUSC LUSC 0.094 0.906 476 | TCGA-43-5668-01A-01R-1635-07 LUSC LUAD 0.728 0.272 477 | TCGA-43-6143-01A-11R-1820-07 LUSC LUSC 0.192 0.808 478 | TCGA-43-6647-01A-11R-1820-07 LUSC LUSC 0.116 0.884 479 | TCGA-43-6770-01A-11R-1820-07 LUSC LUSC 0.016 0.984 480 | TCGA-43-6771-01A-11R-1820-07 LUSC LUSC 0.218 0.782 481 | TCGA-46-3765-01A-01R-0980-07 LUSC LUSC 0.01 0.99 482 | TCGA-46-3766-01A-01R-0980-07 LUSC LUSC 0.262 0.738 483 | TCGA-46-3767-01A-01R-0980-07 LUSC LUSC 0.052 0.948 484 | TCGA-46-3768-01A-01R-0980-07 LUSC LUSC 0.092 0.908 485 | TCGA-46-3769-01A-01R-0980-07 LUSC LUAD 0.572 0.428 486 | TCGA-46-6025-01A-11R-1820-07 LUSC LUSC 0.088 0.912 487 | TCGA-46-6026-01A-11R-1820-07 LUSC LUSC 0.144 0.856 488 | TCGA-51-4079-01A-01R-1100-07 LUSC LUSC 0.02 0.98 489 | TCGA-51-4080-01A-01R-1100-07 LUSC LUSC 0.07 0.93 490 | TCGA-51-4081-01A-01R-1100-07 LUSC LUSC 0.032 0.968 491 | TCGA-56-1622-01A-01R-0692-07 LUSC LUSC 0.168 0.832 492 | TCGA-56-5897-01A-11R-1635-07 LUSC LUSC 0.05 0.95 493 | TCGA-56-5898-01A-11R-1635-07 LUSC LUSC 0.046 0.954 494 | TCGA-56-6545-01A-11R-1820-07 LUSC LUSC 0.074 0.926 495 | TCGA-56-6546-01A-11R-1820-07 LUSC LUSC 0.466 0.534 496 | TCGA-60-2695-01A-01R-0851-07 LUSC LUSC 0.448 0.552 497 | TCGA-60-2696-01A-01R-0851-07 LUSC LUSC 0.156 0.844 498 | TCGA-60-2698-01A-01R-0851-07 LUSC LUSC 0.08 0.92 499 | TCGA-60-2706-01A-01R-0851-07 LUSC LUAD 0.624 0.376 500 | TCGA-60-2707-01A-01R-0851-07 LUSC LUSC 0.076 0.924 501 | TCGA-60-2708-01A-01R-0851-07 LUSC LUSC 0.044 0.956 502 | TCGA-60-2709-01A-21R-1820-07 LUSC LUSC 0.106 0.894 503 | TCGA-60-2710-01A-01R-0851-07 LUSC LUSC 0.05 0.95 504 | TCGA-60-2711-01A-01R-0851-07 LUSC LUSC 0.058 0.942 505 | TCGA-60-2712-01A-01R-0851-07 LUSC LUSC 0.096 0.904 506 | TCGA-60-2713-01A-01R-0851-07 LUSC LUSC 0.028 0.972 507 | TCGA-60-2714-01A-01R-0851-07 LUSC LUAD 0.838 0.162 508 | TCGA-60-2715-01A-01R-0851-07 LUSC LUSC 0.174 0.826 509 | TCGA-60-2716-01A-01R-0851-07 LUSC LUSC 0.206 0.794 510 | TCGA-60-2719-01A-01R-0851-07 LUSC LUSC 0.058 0.942 511 | TCGA-60-2720-01A-01R-0851-07 LUSC LUSC 0.16 0.84 512 | TCGA-60-2721-01A-01R-0851-07 LUSC LUSC 0.05 0.95 513 | TCGA-60-2722-01A-01R-0851-07 LUSC LUSC 0.032 0.968 514 | TCGA-60-2723-01A-01R-0851-07 LUSC LUSC 0.06 0.94 515 | TCGA-60-2724-01A-01R-0851-07 LUSC LUSC 0.056 0.944 516 | TCGA-60-2725-01A-01R-1201-07 LUSC LUSC 0.06 0.94 517 | TCGA-60-2726-01A-01R-0851-07 LUSC LUSC 0.166 0.834 518 | TCGA-63-5128-01A-01R-1443-07 LUSC LUSC 0.048 0.952 519 | TCGA-63-5131-01A-01R-1443-07 LUSC LUSC 0.078 0.922 520 | TCGA-63-6202-01A-11R-1820-07 LUSC LUAD 0.826 0.174 521 | TCGA-63-7020-01A-11R-1949-07 LUSC LUSC 0.13 0.87 522 | TCGA-63-7021-01A-11R-1949-07 LUSC LUSC 0.042 0.958 523 | TCGA-63-7022-01A-11R-1949-07 LUSC LUSC 0.09 0.91 524 | TCGA-63-7023-01A-11R-1949-07 LUSC LUSC 0.144 0.856 525 | TCGA-66-2727-01A-01R-0980-07 LUSC LUSC 0.034 0.966 526 | TCGA-66-2734-01A-01R-0980-07 LUSC LUSC 0.042 0.958 527 | TCGA-66-2737-01A-01R-0980-07 LUSC LUSC 0.008 0.992 528 | TCGA-66-2742-01A-01R-0980-07 LUSC LUSC 0.056 0.944 529 | TCGA-66-2744-01A-01R-0980-07 LUSC LUSC 0.426 0.574 530 | TCGA-66-2753-01A-01R-0980-07 LUSC LUSC 0.126 0.874 531 | TCGA-66-2754-01A-01R-0980-07 LUSC LUSC 0.5 0.5 532 | TCGA-66-2755-01A-01R-0851-07 LUSC LUSC 0.136 0.864 533 | TCGA-66-2756-01A-01R-0851-07 LUSC LUAD 0.578 0.422 534 | TCGA-66-2757-01A-01R-0851-07 LUSC LUSC 0.356 0.644 535 | TCGA-66-2758-01A-02R-0851-07 LUSC LUSC 0.07 0.93 536 | TCGA-66-2759-01A-01R-0851-07 LUSC LUSC 0.024 0.976 537 | TCGA-66-2763-01A-01R-0851-07 LUSC LUSC 0.04 0.96 538 | TCGA-66-2765-01A-01R-0851-07 LUSC LUSC 0.04 0.96 539 | TCGA-66-2766-01A-01R-0851-07 LUSC LUSC 0.036 0.964 540 | TCGA-66-2767-01A-01R-0851-07 LUSC LUSC 0.046 0.954 541 | TCGA-66-2768-01A-01R-0851-07 LUSC LUSC 0.04 0.96 542 | TCGA-66-2769-01A-02R-0851-07 LUSC LUSC 0.156 0.844 543 | TCGA-66-2770-01A-01R-0851-07 LUSC LUSC 0.038 0.962 544 | TCGA-66-2771-01A-01R-0980-07 LUSC LUSC 0.068 0.932 545 | TCGA-66-2773-01A-01R-1201-07 LUSC LUSC 0.09 0.91 546 | TCGA-66-2777-01A-01R-1201-07 LUSC LUSC 0.044 0.956 547 | TCGA-66-2778-01A-02R-0851-07 LUSC LUSC 0.102 0.898 548 | TCGA-66-2780-01A-01R-0851-07 LUSC LUSC 0.016 0.984 549 | TCGA-66-2781-01A-01R-0851-07 LUSC LUSC 0.024 0.976 550 | TCGA-66-2782-01A-01R-0851-07 LUSC LUSC 0.118 0.882 551 | TCGA-66-2783-01A-01R-1201-07 LUSC LUSC 0.038 0.962 552 | TCGA-66-2785-01A-01R-0851-07 LUSC LUSC 0.386 0.614 553 | TCGA-66-2786-01A-01R-0851-07 LUSC LUSC 0.166 0.834 554 | TCGA-66-2787-01A-01R-0980-07 LUSC LUSC 0.06 0.94 555 | TCGA-66-2788-01A-01R-0980-07 LUSC LUSC 0.018 0.982 556 | TCGA-66-2789-01A-01R-0980-07 LUSC LUSC 0.096 0.904 557 | TCGA-66-2790-01A-01R-0980-07 LUSC LUSC 0.11 0.89 558 | TCGA-66-2791-01A-01R-0980-07 LUSC LUSC 0.008 0.992 559 | TCGA-66-2792-01A-01R-0980-07 LUSC LUSC 0.018 0.982 560 | TCGA-66-2793-01A-01R-1201-07 LUSC LUSC 0.164 0.836 561 | TCGA-66-2794-01A-01R-1201-07 LUSC LUSC 0.036 0.964 562 | TCGA-66-2795-01A-02R-0980-07 LUSC LUSC 0.03 0.97 563 | TCGA-66-2800-01A-01R-1201-07 LUSC LUSC 0.04 0.96 564 | TCGA-70-6722-01A-11R-1820-07 LUSC LUSC 0.336 0.664 565 | TCGA-70-6723-01A-11R-1820-07 LUSC LUSC 0.11 0.89 566 | TCGA-77-6842-01A-11R-1949-07 LUSC LUSC 0.312 0.688 567 | TCGA-77-6843-01A-11R-1949-07 LUSC LUSC 0.104 0.896 568 | TCGA-77-6844-01A-11R-1949-07 LUSC LUSC 0.026 0.974 569 | TCGA-77-6845-01A-11R-1949-07 LUSC LUSC 0.04 0.96 570 | TCGA-79-5596-01A-31R-1949-07 LUSC LUSC 0.012 0.988 571 | TCGA-85-6175-01A-11R-1820-07 LUSC LUSC 0.264 0.736 572 | TCGA-85-6560-01A-11R-1820-07 LUSC LUSC 0.498 0.502 573 | TCGA-85-6561-01A-11R-1820-07 LUSC LUSC 0.022 0.978 574 | TCGA-85-6798-01A-11R-1949-07 LUSC LUSC 0.05 0.95 575 | TCGA-90-6837-01A-11R-1949-07 LUSC LUSC 0.088 0.912 576 | TCGA-94-7033-01A-11R-1949-07 LUSC LUSC 0.106 0.894 577 | -------------------------------------------------------------------------------- /Analysis_datasets/Classification_20_LUAD_LUSC_Predictions.txt: -------------------------------------------------------------------------------- 1 | SampleID ActualClass PredictedClass LUAD_Probability LUSC_Probability 2 | TCGA-05-4244-01A-01R-1107-07 LUAD LUSC 0.44 0.56 3 | TCGA-05-4249-01A-01R-1107-07 LUAD LUAD 0.768 0.232 4 | TCGA-05-4250-01A-01R-1107-07 LUAD LUSC 0.368 0.632 5 | TCGA-05-4382-01A-01R-1206-07 LUAD LUAD 0.996 0.004 6 | TCGA-05-4384-01A-01R-1755-07 LUAD LUAD 1 0 7 | TCGA-05-4389-01A-01R-1206-07 LUAD LUAD 1 0 8 | TCGA-05-4390-01A-02R-1755-07 LUAD LUAD 0.996 0.004 9 | TCGA-05-4395-01A-01R-1206-07 LUAD LUAD 0.992 0.008 10 | TCGA-05-4396-01A-21R-1858-07 LUAD LUAD 0.986 0.014 11 | TCGA-05-4397-01A-01R-1206-07 LUAD LUAD 0.992 0.008 12 | TCGA-05-4398-01A-01R-1206-07 LUAD LUAD 0.992 0.008 13 | TCGA-05-4402-01A-01R-1206-07 LUAD LUAD 1 0 14 | TCGA-05-4403-01A-01R-1206-07 LUAD LUAD 0.994 0.006 15 | TCGA-05-4405-01A-21R-1858-07 LUAD LUAD 0.998 0.002 16 | TCGA-05-4410-01A-21R-1858-07 LUAD LUAD 1 0 17 | TCGA-05-4415-01A-22R-1858-07 LUAD LUAD 0.992 0.008 18 | TCGA-05-4417-01A-22R-1858-07 LUAD LUAD 1 0 19 | TCGA-05-4418-01A-01R-1206-07 LUAD LUAD 0.998 0.002 20 | TCGA-05-4420-01A-01R-1206-07 LUAD LUAD 0.998 0.002 21 | TCGA-05-4422-01A-01R-1206-07 LUAD LUAD 0.996 0.004 22 | TCGA-05-4424-01A-22R-1858-07 LUAD LUAD 1 0 23 | TCGA-05-4425-01A-01R-1755-07 LUAD LUAD 0.996 0.004 24 | TCGA-05-4426-01A-01R-1206-07 LUAD LUAD 0.998 0.002 25 | TCGA-05-4427-01A-21R-1858-07 LUAD LUAD 0.98 0.02 26 | TCGA-05-4430-01A-02R-1206-07 LUAD LUAD 0.996 0.004 27 | TCGA-05-4432-01A-01R-1206-07 LUAD LUAD 0.994 0.006 28 | TCGA-05-4433-01A-22R-1858-07 LUAD LUAD 0.996 0.004 29 | TCGA-05-4434-01A-01R-1206-07 LUAD LUAD 1 0 30 | TCGA-05-5420-01A-01R-1628-07 LUAD LUAD 0.986 0.014 31 | TCGA-05-5423-01A-01R-1628-07 LUAD LUAD 0.994 0.006 32 | TCGA-05-5425-01A-02R-1628-07 LUAD LUAD 1 0 33 | TCGA-05-5428-01A-01R-1628-07 LUAD LUAD 0.994 0.006 34 | TCGA-05-5429-01A-01R-1628-07 LUAD LUAD 0.984 0.016 35 | TCGA-05-5715-01A-01R-1628-07 LUAD LUAD 1 0 36 | TCGA-35-3615-01A-01R-0946-07 LUAD LUAD 1 0 37 | TCGA-35-4122-01A-01R-1107-07 LUAD LUSC 0.184 0.816 38 | TCGA-35-4123-01A-01R-1107-07 LUAD LUSC 0.25 0.75 39 | TCGA-35-5375-01A-01R-1628-07 LUAD LUAD 0.872 0.128 40 | TCGA-38-4625-01A-01R-1206-07 LUAD LUAD 0.998 0.002 41 | TCGA-38-4626-01A-01R-1206-07 LUAD LUAD 1 0 42 | TCGA-38-4627-01A-01R-1206-07 LUAD LUAD 0.996 0.004 43 | TCGA-38-4628-01A-01R-1206-07 LUAD LUAD 0.996 0.004 44 | TCGA-38-4629-01A-02R-1206-07 LUAD LUAD 0.99 0.01 45 | TCGA-38-4630-01A-01R-1206-07 LUAD LUAD 0.902 0.098 46 | TCGA-38-4631-01A-01R-1755-07 LUAD LUAD 0.962 0.038 47 | TCGA-38-4632-01A-01R-1755-07 LUAD LUAD 1 0 48 | TCGA-38-6178-01A-11R-1755-07 LUAD LUAD 0.994 0.006 49 | TCGA-38-7271-01A-11R-2039-07 LUAD LUAD 0.998 0.002 50 | TCGA-44-2655-01A-01R-0946-07 LUAD LUAD 1 0 51 | TCGA-44-2656-01A-02R-0946-07 LUAD LUAD 1 0 52 | TCGA-44-2657-01A-01R-1107-07 LUAD LUAD 1 0 53 | TCGA-44-2659-01A-01R-0946-07 LUAD LUAD 0.998 0.002 54 | TCGA-44-2661-01A-01R-1107-07 LUAD LUAD 0.968 0.032 55 | TCGA-44-2662-01A-01R-0946-07 LUAD LUAD 1 0 56 | TCGA-44-2665-01A-01R-0946-07 LUAD LUAD 0.982 0.018 57 | TCGA-44-2666-01A-01R-0946-07 LUAD LUAD 1 0 58 | TCGA-44-2668-01A-01R-0946-07 LUAD LUAD 1 0 59 | TCGA-44-3396-01A-01R-1206-07 LUAD LUAD 1 0 60 | TCGA-44-3398-01A-01R-1107-07 LUAD LUAD 0.898 0.102 61 | TCGA-44-3918-01A-01R-1107-07 LUAD LUSC 0.49 0.51 62 | TCGA-44-3919-01A-02R-1107-07 LUAD LUAD 0.864 0.136 63 | TCGA-44-4112-01A-01R-1107-07 LUAD LUSC 0.434 0.566 64 | TCGA-44-5643-01A-01R-1628-07 LUAD LUAD 0.892 0.108 65 | TCGA-44-5644-01A-21R-2039-07 LUAD LUAD 0.998 0.002 66 | TCGA-44-5645-01A-01R-1628-07 LUAD LUAD 0.982 0.018 67 | TCGA-44-6145-01A-11R-1755-07 LUAD LUAD 1 0 68 | TCGA-44-6146-01A-11R-1755-07 LUAD LUAD 1 0 69 | TCGA-44-6147-01A-11R-1755-07 LUAD LUAD 1 0 70 | TCGA-44-6148-01A-11R-1755-07 LUAD LUAD 0.982 0.018 71 | TCGA-44-6774-01A-21R-1858-07 LUAD LUAD 0.998 0.002 72 | TCGA-44-6775-01A-11R-1858-07 LUAD LUAD 1 0 73 | TCGA-44-6776-01A-11R-1858-07 LUAD LUAD 1 0 74 | TCGA-44-6777-01A-11R-1858-07 LUAD LUAD 0.996 0.004 75 | TCGA-44-6778-01A-11R-1858-07 LUAD LUAD 0.97 0.03 76 | TCGA-44-6779-01A-11R-1858-07 LUAD LUAD 0.994 0.006 77 | TCGA-44-7659-01A-11R-2066-07 LUAD LUAD 0.998 0.002 78 | TCGA-44-7660-01A-11R-2066-07 LUAD LUAD 0.988 0.012 79 | TCGA-44-7661-01A-11R-2066-07 LUAD LUAD 0.998 0.002 80 | TCGA-44-7662-01A-11R-2066-07 LUAD LUAD 1 0 81 | TCGA-44-7667-01A-31R-2066-07 LUAD LUAD 0.99 0.01 82 | TCGA-44-7669-01A-21R-2066-07 LUAD LUAD 0.982 0.018 83 | TCGA-44-7670-01A-11R-2066-07 LUAD LUAD 0.992 0.008 84 | TCGA-44-7671-01A-11R-2066-07 LUAD LUAD 1 0 85 | TCGA-44-7672-01A-11R-2066-07 LUAD LUAD 1 0 86 | TCGA-44-8117-01A-11R-2241-07 LUAD LUAD 0.994 0.006 87 | TCGA-44-8119-01A-11R-2241-07 LUAD LUAD 1 0 88 | TCGA-44-8120-01A-11R-2241-07 LUAD LUAD 1 0 89 | TCGA-49-4486-01A-01R-1206-07 LUAD LUAD 0.992 0.008 90 | TCGA-49-4487-01A-21R-1858-07 LUAD LUAD 0.998 0.002 91 | TCGA-49-4488-01A-01R-1755-07 LUAD LUAD 1 0 92 | TCGA-49-4490-01A-21R-1858-07 LUAD LUAD 1 0 93 | TCGA-49-4494-01A-01R-1206-07 LUAD LUAD 0.996 0.004 94 | TCGA-49-4501-01A-01R-1206-07 LUAD LUAD 1 0 95 | TCGA-49-4505-01A-01R-1206-07 LUAD LUAD 1 0 96 | TCGA-49-4506-01A-01R-1206-07 LUAD LUAD 0.938 0.062 97 | TCGA-49-4507-01A-01R-1206-07 LUAD LUAD 1 0 98 | TCGA-49-4510-01A-01R-1206-07 LUAD LUAD 1 0 99 | TCGA-49-4512-01A-21R-1858-07 LUAD LUAD 1 0 100 | TCGA-49-4514-01A-21R-1858-07 LUAD LUAD 0.992 0.008 101 | TCGA-49-6742-01A-11R-1858-07 LUAD LUAD 1 0 102 | TCGA-49-6743-01A-11R-1858-07 LUAD LUAD 0.998 0.002 103 | TCGA-49-6744-01A-11R-1858-07 LUAD LUAD 1 0 104 | TCGA-49-6745-01A-11R-1858-07 LUAD LUAD 1 0 105 | TCGA-49-6761-01A-31R-1949-07 LUAD LUAD 1 0 106 | TCGA-49-6767-01A-11R-1858-07 LUAD LUAD 0.988 0.012 107 | TCGA-50-5044-01A-21R-1858-07 LUAD LUAD 0.986 0.014 108 | TCGA-50-5049-01A-01R-1628-07 LUAD LUAD 0.996 0.004 109 | TCGA-50-5051-01A-21R-1858-07 LUAD LUAD 0.984 0.016 110 | TCGA-50-5055-01A-01R-1628-07 LUAD LUAD 0.998 0.002 111 | TCGA-50-5066-01A-01R-1628-07 LUAD LUAD 0.974 0.026 112 | TCGA-50-5066-02A-11R-2090-07 LUAD LUAD 0.996 0.004 113 | TCGA-50-5068-01A-01R-1628-07 LUAD LUAD 0.98 0.02 114 | TCGA-50-5072-01A-21R-1858-07 LUAD LUAD 0.94 0.06 115 | TCGA-50-5931-01A-11R-1755-07 LUAD LUAD 0.926 0.074 116 | TCGA-50-5932-01A-11R-1755-07 LUAD LUAD 0.998 0.002 117 | TCGA-50-5933-01A-11R-1755-07 LUAD LUAD 0.988 0.012 118 | TCGA-50-5935-01A-11R-1755-07 LUAD LUAD 1 0 119 | TCGA-50-5936-01A-11R-1628-07 LUAD LUAD 0.994 0.006 120 | TCGA-50-5939-01A-11R-1628-07 LUAD LUAD 0.998 0.002 121 | TCGA-50-5941-01A-11R-1755-07 LUAD LUAD 1 0 122 | TCGA-50-5942-01A-21R-1755-07 LUAD LUAD 0.996 0.004 123 | TCGA-50-5944-01A-11R-1755-07 LUAD LUAD 1 0 124 | TCGA-50-5946-01A-11R-1755-07 LUAD LUAD 0.996 0.004 125 | TCGA-50-5946-02A-11R-2090-07 LUAD LUAD 1 0 126 | TCGA-50-6590-01A-12R-1858-07 LUAD LUAD 0.922 0.078 127 | TCGA-50-6591-01A-11R-1755-07 LUAD LUAD 0.9 0.1 128 | TCGA-50-6592-01A-11R-1755-07 LUAD LUAD 0.996 0.004 129 | TCGA-50-6593-01A-11R-1755-07 LUAD LUAD 1 0 130 | TCGA-50-6594-01A-11R-1755-07 LUAD LUAD 0.992 0.008 131 | TCGA-50-6595-01A-12R-1858-07 LUAD LUAD 1 0 132 | TCGA-50-6597-01A-11R-1858-07 LUAD LUAD 0.994 0.006 133 | TCGA-50-6673-01A-11R-1949-07 LUAD LUAD 1 0 134 | TCGA-50-7109-01A-11R-2039-07 LUAD LUAD 0.998 0.002 135 | TCGA-53-7624-01A-11R-2066-07 LUAD LUAD 0.976 0.024 136 | TCGA-53-7626-01A-12R-2066-07 LUAD LUAD 1 0 137 | TCGA-53-7813-01A-11R-2170-07 LUAD LUAD 0.996 0.004 138 | TCGA-55-1592-01A-01R-0946-07 LUAD LUAD 0.996 0.004 139 | TCGA-55-1594-01A-01R-0946-07 LUAD LUAD 0.986 0.014 140 | TCGA-55-1595-01A-01R-0946-07 LUAD LUAD 1 0 141 | TCGA-55-1596-01A-01R-0946-07 LUAD LUAD 0.996 0.004 142 | TCGA-55-5899-01A-11R-1628-07 LUAD LUAD 0.988 0.012 143 | TCGA-55-6543-01A-11R-1755-07 LUAD LUAD 0.996 0.004 144 | TCGA-55-6642-01A-11R-1858-07 LUAD LUAD 1 0 145 | TCGA-55-6712-01A-11R-1858-07 LUAD LUAD 0.998 0.002 146 | TCGA-55-6968-01A-11R-1949-07 LUAD LUAD 0.982 0.018 147 | TCGA-55-6969-01A-11R-1949-07 LUAD LUAD 0.992 0.008 148 | TCGA-55-6970-01A-11R-1949-07 LUAD LUAD 1 0 149 | TCGA-55-6971-01A-11R-1949-07 LUAD LUAD 1 0 150 | TCGA-55-6972-01A-11R-1949-07 LUAD LUAD 0.996 0.004 151 | TCGA-55-6975-01A-11R-1949-07 LUAD LUAD 0.936 0.064 152 | TCGA-55-6978-01A-11R-1949-07 LUAD LUAD 1 0 153 | TCGA-55-6979-01A-11R-1949-07 LUAD LUAD 0.954 0.046 154 | TCGA-55-6980-01A-11R-1949-07 LUAD LUAD 1 0 155 | TCGA-55-6981-01A-11R-1949-07 LUAD LUAD 0.996 0.004 156 | TCGA-55-6982-01A-11R-1949-07 LUAD LUAD 0.96 0.04 157 | TCGA-55-6983-01A-11R-1949-07 LUAD LUAD 0.998 0.002 158 | TCGA-55-6984-01A-11R-1949-07 LUAD LUAD 0.996 0.004 159 | TCGA-55-6985-01A-11R-1949-07 LUAD LUAD 1 0 160 | TCGA-55-6986-01A-11R-1949-07 LUAD LUAD 0.998 0.002 161 | TCGA-55-6987-01A-11R-1949-07 LUAD LUAD 1 0 162 | TCGA-55-7227-01A-11R-2039-07 LUAD LUAD 1 0 163 | TCGA-55-7281-01A-11R-2039-07 LUAD LUAD 1 0 164 | TCGA-55-7283-01A-11R-2039-07 LUAD LUAD 1 0 165 | TCGA-55-7284-01B-11R-2241-07 LUAD LUAD 0.996 0.004 166 | TCGA-55-7570-01A-11R-2039-07 LUAD LUAD 0.962 0.038 167 | TCGA-55-7573-01A-11R-2039-07 LUAD LUAD 1 0 168 | TCGA-55-7574-01A-11R-2039-07 LUAD LUAD 1 0 169 | TCGA-55-7576-01A-11R-2066-07 LUAD LUAD 1 0 170 | TCGA-55-7724-01A-11R-2170-07 LUAD LUAD 0.724 0.276 171 | TCGA-55-7725-01A-11R-2170-07 LUAD LUAD 0.996 0.004 172 | TCGA-55-7726-01A-11R-2170-07 LUAD LUAD 0.936 0.064 173 | TCGA-55-7727-01A-11R-2170-07 LUAD LUAD 0.926 0.074 174 | TCGA-55-7728-01A-11R-2187-07 LUAD LUAD 1 0 175 | TCGA-55-7815-01A-11R-2170-07 LUAD LUAD 0.87 0.13 176 | TCGA-55-7903-01A-11R-2170-07 LUAD LUAD 1 0 177 | TCGA-55-7907-01A-11R-2170-07 LUAD LUAD 1 0 178 | TCGA-55-7910-01A-11R-2170-07 LUAD LUAD 0.994 0.006 179 | TCGA-55-7911-01A-11R-2170-07 LUAD LUAD 1 0 180 | TCGA-55-7913-01B-11R-2241-07 LUAD LUAD 0.988 0.012 181 | TCGA-55-7914-01A-11R-2170-07 LUAD LUAD 0.998 0.002 182 | TCGA-55-7994-01A-11R-2187-07 LUAD LUAD 0.99 0.01 183 | TCGA-55-7995-01A-11R-2187-07 LUAD LUAD 1 0 184 | TCGA-55-8085-01A-11R-2241-07 LUAD LUAD 1 0 185 | TCGA-55-8087-01A-11R-2241-07 LUAD LUAD 0.998 0.002 186 | TCGA-55-8089-01A-11R-2241-07 LUAD LUAD 1 0 187 | TCGA-55-8090-01A-11R-2241-07 LUAD LUAD 0.996 0.004 188 | TCGA-55-8091-01A-11R-2241-07 LUAD LUAD 0.996 0.004 189 | TCGA-55-8092-01A-11R-2241-07 LUAD LUAD 0.994 0.006 190 | TCGA-55-8094-01A-11R-2241-07 LUAD LUAD 1 0 191 | TCGA-55-8096-01A-11R-2241-07 LUAD LUAD 0.996 0.004 192 | TCGA-55-8097-01A-11R-2241-07 LUAD LUAD 1 0 193 | TCGA-55-8203-01A-11R-2241-07 LUAD LUAD 1 0 194 | TCGA-55-8204-01A-11R-2241-07 LUAD LUAD 0.666 0.334 195 | TCGA-55-8205-01A-11R-2241-07 LUAD LUAD 0.998 0.002 196 | TCGA-55-8206-01A-11R-2241-07 LUAD LUAD 1 0 197 | TCGA-55-8207-01A-11R-2241-07 LUAD LUAD 1 0 198 | TCGA-55-8208-01A-11R-2241-07 LUAD LUAD 1 0 199 | TCGA-55-8299-01A-11R-2287-07 LUAD LUAD 1 0 200 | TCGA-55-8301-01A-11R-2287-07 LUAD LUAD 1 0 201 | TCGA-64-1676-01A-01R-0946-07 LUAD LUAD 0.98 0.02 202 | TCGA-64-1677-01A-01R-0946-07 LUAD LUAD 0.998 0.002 203 | TCGA-64-1678-01A-01R-0946-07 LUAD LUAD 0.928 0.072 204 | TCGA-64-1679-01A-21R-2066-07 LUAD LUAD 1 0 205 | TCGA-64-1680-01A-02R-0946-07 LUAD LUAD 0.998 0.002 206 | TCGA-64-1681-01A-11R-2066-07 LUAD LUAD 1 0 207 | TCGA-64-5774-01A-01R-1628-07 LUAD LUAD 0.998 0.002 208 | TCGA-64-5775-01A-01R-1628-07 LUAD LUAD 0.942 0.058 209 | TCGA-64-5778-01A-01R-1628-07 LUAD LUAD 1 0 210 | TCGA-64-5779-01A-01R-1628-07 LUAD LUAD 1 0 211 | TCGA-64-5781-01A-01R-1628-07 LUAD LUAD 0.998 0.002 212 | TCGA-64-5815-01A-01R-1628-07 LUAD LUAD 1 0 213 | TCGA-67-3770-01A-01R-0946-07 LUAD LUAD 0.998 0.002 214 | TCGA-67-3771-01A-01R-0946-07 LUAD LUAD 1 0 215 | TCGA-67-3772-01A-01R-0946-07 LUAD LUAD 0.996 0.004 216 | TCGA-67-3773-01A-01R-0946-07 LUAD LUAD 0.998 0.002 217 | TCGA-67-3774-01A-01R-0946-07 LUAD LUAD 1 0 218 | TCGA-67-4679-01B-01R-1755-07 LUAD LUAD 1 0 219 | TCGA-67-6215-01A-11R-1755-07 LUAD LUAD 1 0 220 | TCGA-67-6216-01A-11R-1755-07 LUAD LUAD 0.998 0.002 221 | TCGA-67-6217-01A-11R-1755-07 LUAD LUAD 1 0 222 | TCGA-69-7760-01A-11R-2170-07 LUAD LUAD 0.994 0.006 223 | TCGA-69-7761-01A-11R-2170-07 LUAD LUAD 0.988 0.012 224 | TCGA-69-7763-01A-11R-2170-07 LUAD LUAD 1 0 225 | TCGA-69-7764-01A-11R-2170-07 LUAD LUAD 1 0 226 | TCGA-69-7765-01A-11R-2170-07 LUAD LUAD 1 0 227 | TCGA-69-7973-01A-11R-2187-07 LUAD LUAD 0.996 0.004 228 | TCGA-69-7974-01A-11R-2187-07 LUAD LUAD 1 0 229 | TCGA-69-7978-01A-11R-2187-07 LUAD LUAD 1 0 230 | TCGA-69-7979-01A-11R-2187-07 LUAD LUAD 0.992 0.008 231 | TCGA-69-7980-01A-11R-2187-07 LUAD LUAD 1 0 232 | TCGA-69-8253-01A-11R-2287-07 LUAD LUAD 1 0 233 | TCGA-69-8254-01A-11R-2287-07 LUAD LUAD 0.994 0.006 234 | TCGA-69-8255-01A-11R-2287-07 LUAD LUAD 0.988 0.012 235 | TCGA-71-6725-01A-11R-1858-07 LUAD LUAD 1 0 236 | TCGA-73-4658-01A-01R-1755-07 LUAD LUAD 1 0 237 | TCGA-73-4659-01A-01R-1206-07 LUAD LUAD 1 0 238 | TCGA-73-4662-01A-01R-1206-07 LUAD LUAD 0.998 0.002 239 | TCGA-73-4666-01A-01R-1206-07 LUAD LUAD 0.994 0.006 240 | TCGA-73-4668-01A-01R-1206-07 LUAD LUAD 1 0 241 | TCGA-73-4670-01A-01R-1206-07 LUAD LUAD 0.998 0.002 242 | TCGA-73-4675-01A-01R-1206-07 LUAD LUAD 1 0 243 | TCGA-73-4676-01A-01R-1755-07 LUAD LUAD 0.998 0.002 244 | TCGA-73-4677-01A-01R-1206-07 LUAD LUAD 1 0 245 | TCGA-73-7498-01A-12R-2187-07 LUAD LUAD 1 0 246 | TCGA-73-7499-01A-11R-2187-07 LUAD LUAD 1 0 247 | TCGA-75-5122-01A-01R-1755-07 LUAD LUAD 0.992 0.008 248 | TCGA-75-5125-01A-01R-1755-07 LUAD LUAD 1 0 249 | TCGA-75-5126-01A-01R-1755-07 LUAD LUAD 0.998 0.002 250 | TCGA-75-5146-01A-01R-1628-07 LUAD LUAD 0.994 0.006 251 | TCGA-75-5147-01A-01R-1628-07 LUAD LUAD 0.998 0.002 252 | TCGA-75-6203-01A-11R-1755-07 LUAD LUAD 1 0 253 | TCGA-75-6205-01A-11R-1755-07 LUAD LUAD 1 0 254 | TCGA-75-6206-01A-11R-1755-07 LUAD LUAD 1 0 255 | TCGA-75-6207-01A-11R-1755-07 LUAD LUAD 0.99 0.01 256 | TCGA-75-6211-01A-11R-1755-07 LUAD LUAD 0.99 0.01 257 | TCGA-75-6212-01A-11R-1755-07 LUAD LUAD 1 0 258 | TCGA-75-6214-01A-41R-1949-07 LUAD LUAD 0.956 0.044 259 | TCGA-75-7025-01A-12R-1949-07 LUAD LUAD 1 0 260 | TCGA-75-7027-01A-11R-1949-07 LUAD LUAD 0.996 0.004 261 | TCGA-75-7030-01A-11R-1949-07 LUAD LUAD 1 0 262 | TCGA-75-7031-01A-11R-1949-07 LUAD LUAD 1 0 263 | TCGA-78-7143-01A-11R-2039-07 LUAD LUAD 1 0 264 | TCGA-78-7145-01A-11R-2039-07 LUAD LUAD 0.998 0.002 265 | TCGA-78-7146-01A-11R-2039-07 LUAD LUAD 0.974 0.026 266 | TCGA-78-7147-01A-11R-2039-07 LUAD LUAD 1 0 267 | TCGA-78-7148-01A-11R-2039-07 LUAD LUAD 1 0 268 | TCGA-78-7149-01A-11R-2039-07 LUAD LUAD 1 0 269 | TCGA-78-7150-01A-21R-2039-07 LUAD LUAD 0.982 0.018 270 | TCGA-78-7152-01A-11R-2039-07 LUAD LUAD 1 0 271 | TCGA-78-7153-01A-11R-2039-07 LUAD LUAD 1 0 272 | TCGA-78-7154-01A-11R-2039-07 LUAD LUAD 0.974 0.026 273 | TCGA-78-7155-01A-11R-2039-07 LUAD LUAD 0.928 0.072 274 | TCGA-78-7156-01A-11R-2039-07 LUAD LUAD 0.998 0.002 275 | TCGA-78-7158-01A-11R-2039-07 LUAD LUAD 0.988 0.012 276 | TCGA-78-7159-01A-11R-2039-07 LUAD LUAD 1 0 277 | TCGA-78-7160-01A-11R-2039-07 LUAD LUAD 1 0 278 | TCGA-78-7161-01A-11R-2039-07 LUAD LUAD 1 0 279 | TCGA-78-7162-01A-21R-2066-07 LUAD LUAD 0.998 0.002 280 | TCGA-78-7163-01A-12R-2066-07 LUAD LUAD 0.982 0.018 281 | TCGA-78-7166-01A-12R-2066-07 LUAD LUAD 1 0 282 | TCGA-78-7167-01A-11R-2066-07 LUAD LUAD 0.998 0.002 283 | TCGA-78-7220-01A-11R-2039-07 LUAD LUAD 0.996 0.004 284 | TCGA-78-7535-01A-11R-2066-07 LUAD LUAD 1 0 285 | TCGA-78-7536-01A-11R-2066-07 LUAD LUAD 0.986 0.014 286 | TCGA-78-7537-01A-11R-2066-07 LUAD LUAD 1 0 287 | TCGA-78-7539-01A-11R-2066-07 LUAD LUAD 0.994 0.006 288 | TCGA-78-7540-01A-11R-2066-07 LUAD LUAD 0.998 0.002 289 | TCGA-78-7542-01A-21R-2066-07 LUAD LUAD 0.984 0.016 290 | TCGA-78-7633-01A-11R-2066-07 LUAD LUAD 0.998 0.002 291 | TCGA-80-5607-01A-31R-1949-07 LUAD LUAD 1 0 292 | TCGA-80-5608-01A-31R-1949-07 LUAD LUAD 1 0 293 | TCGA-80-5611-01A-01R-1628-07 LUAD LUAD 0.994 0.006 294 | TCGA-83-5908-01A-21R-2287-07 LUAD LUAD 0.998 0.002 295 | TCGA-86-6562-01A-11R-1755-07 LUAD LUAD 1 0 296 | TCGA-86-6851-01A-11R-1949-07 LUAD LUAD 0.998 0.002 297 | TCGA-86-7701-01A-11R-2170-07 LUAD LUAD 0.992 0.008 298 | TCGA-86-7711-01A-11R-2066-07 LUAD LUAD 0.982 0.018 299 | TCGA-86-7713-01A-11R-2066-07 LUAD LUAD 0.998 0.002 300 | TCGA-86-7714-01A-12R-2170-07 LUAD LUAD 1 0 301 | TCGA-86-7953-01A-11R-2187-07 LUAD LUAD 0.998 0.002 302 | TCGA-86-7954-01A-11R-2187-07 LUAD LUAD 0.998 0.002 303 | TCGA-86-7955-01A-11R-2187-07 LUAD LUAD 0.994 0.006 304 | TCGA-86-8054-01A-11R-2241-07 LUAD LUAD 0.992 0.008 305 | TCGA-86-8055-01A-11R-2241-07 LUAD LUAD 0.998 0.002 306 | TCGA-86-8056-01A-11R-2241-07 LUAD LUAD 0.998 0.002 307 | TCGA-86-8073-01A-11R-2241-07 LUAD LUAD 0.996 0.004 308 | TCGA-86-8074-01A-11R-2241-07 LUAD LUAD 1 0 309 | TCGA-86-8075-01A-11R-2241-07 LUAD LUAD 1 0 310 | TCGA-86-8076-01A-31R-2241-07 LUAD LUAD 1 0 311 | TCGA-86-8279-01A-11R-2287-07 LUAD LUAD 1 0 312 | TCGA-86-8280-01A-11R-2287-07 LUAD LUAD 1 0 313 | TCGA-86-8281-01A-11R-2287-07 LUAD LUAD 1 0 314 | TCGA-91-6828-01A-11R-1858-07 LUAD LUAD 0.998 0.002 315 | TCGA-91-6829-01A-21R-1858-07 LUAD LUAD 0.986 0.014 316 | TCGA-91-6830-01A-11R-1949-07 LUAD LUAD 1 0 317 | TCGA-91-6831-01A-11R-1858-07 LUAD LUAD 0.99 0.01 318 | TCGA-91-6835-01A-11R-1858-07 LUAD LUAD 1 0 319 | TCGA-91-6836-01A-21R-1858-07 LUAD LUAD 0.978 0.022 320 | TCGA-91-6840-01A-11R-1949-07 LUAD LUAD 0.99 0.01 321 | TCGA-91-6847-01A-11R-1949-07 LUAD LUAD 0.95 0.05 322 | TCGA-91-6848-01A-11R-1949-07 LUAD LUAD 0.924 0.076 323 | TCGA-91-6849-01A-11R-1949-07 LUAD LUAD 1 0 324 | TCGA-91-7771-01A-11R-2170-07 LUAD LUAD 0.998 0.002 325 | TCGA-93-7347-01A-11R-2187-07 LUAD LUAD 1 0 326 | TCGA-93-7348-01A-21R-2039-07 LUAD LUAD 1 0 327 | TCGA-93-8067-01A-11R-2287-07 LUAD LUAD 1 0 328 | TCGA-95-7039-01A-11R-1949-07 LUAD LUAD 1 0 329 | TCGA-95-7043-01A-11R-1949-07 LUAD LUAD 0.986 0.014 330 | TCGA-95-7562-01A-11R-2241-07 LUAD LUAD 0.99 0.01 331 | TCGA-95-7567-01A-11R-2066-07 LUAD LUAD 1 0 332 | TCGA-95-7944-01A-11R-2187-07 LUAD LUAD 0.998 0.002 333 | TCGA-95-7947-01A-11R-2187-07 LUAD LUAD 0.988 0.012 334 | TCGA-95-7948-01A-11R-2187-07 LUAD LUAD 0.994 0.006 335 | TCGA-95-8039-01A-11R-2241-07 LUAD LUAD 1 0 336 | TCGA-97-7546-01A-11R-2039-07 LUAD LUAD 0.998 0.002 337 | TCGA-97-7547-01A-11R-2039-07 LUAD LUAD 0.968 0.032 338 | TCGA-97-7552-01A-11R-2039-07 LUAD LUAD 0.996 0.004 339 | TCGA-97-7553-01A-21R-2039-07 LUAD LUAD 1 0 340 | TCGA-97-7554-01A-11R-2039-07 LUAD LUAD 0.998 0.002 341 | TCGA-97-7937-01A-11R-2170-07 LUAD LUAD 1 0 342 | TCGA-97-7938-01A-11R-2170-07 LUAD LUAD 1 0 343 | TCGA-97-7941-01A-11R-2187-07 LUAD LUAD 1 0 344 | TCGA-97-8171-01A-11R-2287-07 LUAD LUAD 0.994 0.006 345 | TCGA-97-8172-01A-11R-2287-07 LUAD LUAD 0.998 0.002 346 | TCGA-97-8174-01A-11R-2287-07 LUAD LUAD 0.998 0.002 347 | TCGA-97-8175-01A-11R-2287-07 LUAD LUAD 0.968 0.032 348 | TCGA-97-8177-01A-11R-2287-07 LUAD LUAD 1 0 349 | TCGA-97-8179-01A-11R-2287-07 LUAD LUAD 1 0 350 | TCGA-99-7458-01A-11R-2039-07 LUAD LUAD 0.998 0.002 351 | TCGA-99-8025-01A-11R-2241-07 LUAD LUAD 1 0 352 | TCGA-99-8028-01A-11R-2241-07 LUAD LUAD 1 0 353 | TCGA-99-8032-01A-11R-2241-07 LUAD LUAD 1 0 354 | TCGA-99-8033-01A-11R-2241-07 LUAD LUAD 0.874 0.126 355 | TCGA-J2-8192-01A-11R-2241-07 LUAD LUAD 0.762 0.238 356 | TCGA-J2-8194-01A-11R-2241-07 LUAD LUAD 1 0 357 | TCGA-18-3406-01A-01R-0980-07 LUSC LUAD 0.776 0.224 358 | TCGA-18-3407-01A-01R-0980-07 LUSC LUSC 0 1 359 | TCGA-18-3408-01A-01R-0980-07 LUSC LUAD 0.9 0.1 360 | TCGA-18-3409-01A-01R-0980-07 LUSC LUAD 0.832 0.168 361 | TCGA-18-3410-01A-01R-0980-07 LUSC LUSC 0.002 0.998 362 | TCGA-18-3411-01A-01R-0980-07 LUSC LUSC 0 1 363 | TCGA-18-3412-01A-01R-0980-07 LUSC LUSC 0.002 0.998 364 | TCGA-18-3414-01A-01R-0980-07 LUSC LUSC 0.062 0.938 365 | TCGA-18-3415-01A-01R-0980-07 LUSC LUSC 0.002 0.998 366 | TCGA-18-3416-01A-01R-0980-07 LUSC LUSC 0.006 0.994 367 | TCGA-18-3417-01A-01R-1443-07 LUSC LUSC 0.016 0.984 368 | TCGA-18-3419-01A-01R-0980-07 LUSC LUSC 0.002 0.998 369 | TCGA-18-3421-01A-01R-0980-07 LUSC LUSC 0.006 0.994 370 | TCGA-18-4083-01A-01R-1100-07 LUSC LUSC 0.006 0.994 371 | TCGA-18-4086-01A-01R-1100-07 LUSC LUSC 0 1 372 | TCGA-18-4721-01A-01R-1443-07 LUSC LUSC 0 1 373 | TCGA-18-5592-01A-01R-1635-07 LUSC LUSC 0.002 0.998 374 | TCGA-18-5595-01A-01R-1635-07 LUSC LUSC 0 1 375 | TCGA-21-1070-01A-01R-0692-07 LUSC LUSC 0.032 0.968 376 | TCGA-21-1071-01A-01R-0692-07 LUSC LUSC 0.006 0.994 377 | TCGA-21-1072-01A-01R-0692-07 LUSC LUSC 0.046 0.954 378 | TCGA-21-1075-01A-01R-0692-07 LUSC LUSC 0.12 0.88 379 | TCGA-21-1076-01A-02R-0692-07 LUSC LUSC 0.008 0.992 380 | TCGA-21-1077-01A-01R-0692-07 LUSC LUSC 0.002 0.998 381 | TCGA-21-1078-01A-01R-0692-07 LUSC LUSC 0.042 0.958 382 | TCGA-21-1079-01A-01R-0692-07 LUSC LUSC 0.014 0.986 383 | TCGA-21-1080-01A-01R-0692-07 LUSC LUSC 0.004 0.996 384 | TCGA-21-1081-01A-01R-0692-07 LUSC LUSC 0.004 0.996 385 | TCGA-21-1082-01A-01R-0692-07 LUSC LUSC 0.004 0.996 386 | TCGA-21-1083-01A-01R-0692-07 LUSC LUSC 0.002 0.998 387 | TCGA-21-5782-01A-01R-1635-07 LUSC LUSC 0.032 0.968 388 | TCGA-21-5784-01A-01R-1635-07 LUSC LUSC 0.01 0.99 389 | TCGA-21-5786-01A-01R-1635-07 LUSC LUSC 0.008 0.992 390 | TCGA-21-5787-01A-01R-1635-07 LUSC LUSC 0.008 0.992 391 | TCGA-22-0940-01A-01R-0692-07 LUSC LUSC 0.004 0.996 392 | TCGA-22-0944-01A-01R-0692-07 LUSC LUSC 0.002 0.998 393 | TCGA-22-1002-01A-01R-0692-07 LUSC LUSC 0 1 394 | TCGA-22-1011-01A-01R-0692-07 LUSC LUSC 0.004 0.996 395 | TCGA-22-1012-01A-01R-0692-07 LUSC LUSC 0.002 0.998 396 | TCGA-22-1016-01A-01R-0692-07 LUSC LUSC 0.006 0.994 397 | TCGA-22-1017-01A-01R-0692-07 LUSC LUSC 0.092 0.908 398 | TCGA-22-4591-01A-01R-1201-07 LUSC LUSC 0.018 0.982 399 | TCGA-22-4593-01A-21R-1820-07 LUSC LUSC 0.006 0.994 400 | TCGA-22-4594-01A-01R-1201-07 LUSC LUSC 0.152 0.848 401 | TCGA-22-4595-01A-01R-1201-07 LUSC LUSC 0.002 0.998 402 | TCGA-22-4596-01A-01R-1201-07 LUSC LUSC 0.172 0.828 403 | TCGA-22-4599-01A-01R-1443-07 LUSC LUSC 0.032 0.968 404 | TCGA-22-4601-01A-01R-1443-07 LUSC LUSC 0.002 0.998 405 | TCGA-22-4604-01A-01R-1201-07 LUSC LUSC 0 1 406 | TCGA-22-4607-01A-01R-1201-07 LUSC LUSC 0.002 0.998 407 | TCGA-22-4613-01A-01R-1443-07 LUSC LUSC 0.008 0.992 408 | TCGA-22-5471-01A-01R-1635-07 LUSC LUSC 0.002 0.998 409 | TCGA-22-5472-01A-01R-1635-07 LUSC LUSC 0 1 410 | TCGA-22-5473-01A-01R-1635-07 LUSC LUSC 0 1 411 | TCGA-22-5474-01A-01R-1635-07 LUSC LUSC 0 1 412 | TCGA-22-5477-01A-01R-1635-07 LUSC LUSC 0 1 413 | TCGA-22-5478-01A-01R-1635-07 LUSC LUSC 0.006 0.994 414 | TCGA-22-5479-01A-31R-1949-07 LUSC LUSC 0 1 415 | TCGA-22-5480-01A-01R-1635-07 LUSC LUSC 0.004 0.996 416 | TCGA-22-5481-01A-31R-1949-07 LUSC LUSC 0.068 0.932 417 | TCGA-22-5482-01A-01R-1635-07 LUSC LUSC 0 1 418 | TCGA-22-5483-01A-01R-1820-07 LUSC LUSC 0.002 0.998 419 | TCGA-22-5485-01A-01R-1635-07 LUSC LUSC 0.002 0.998 420 | TCGA-22-5489-01A-01R-1635-07 LUSC LUSC 0.004 0.996 421 | TCGA-22-5491-01A-01R-1635-07 LUSC LUSC 0.002 0.998 422 | TCGA-22-5492-01A-01R-1635-07 LUSC LUSC 0.008 0.992 423 | TCGA-33-4532-01A-01R-1201-07 LUSC LUSC 0.002 0.998 424 | TCGA-33-4533-01A-01R-1201-07 LUSC LUSC 0.022 0.978 425 | TCGA-33-4538-01A-01R-1201-07 LUSC LUSC 0.016 0.984 426 | TCGA-33-4547-01A-01R-1201-07 LUSC LUSC 0 1 427 | TCGA-33-4566-01A-01R-1443-07 LUSC LUSC 0.054 0.946 428 | TCGA-33-4582-01A-01R-1443-07 LUSC LUSC 0 1 429 | TCGA-33-4583-01A-01R-1443-07 LUSC LUSC 0.012 0.988 430 | TCGA-33-4586-01A-01R-1443-07 LUSC LUSC 0.012 0.988 431 | TCGA-33-6737-01A-11R-1820-07 LUSC LUSC 0.006 0.994 432 | TCGA-33-6738-01A-11R-1949-07 LUSC LUSC 0.004 0.996 433 | TCGA-34-2596-01A-01R-0851-07 LUSC LUSC 0 1 434 | TCGA-34-2600-01A-01R-0851-07 LUSC LUSC 0.002 0.998 435 | TCGA-34-2608-01A-02R-0851-07 LUSC LUSC 0 1 436 | TCGA-34-5231-01A-21R-1820-07 LUSC LUSC 0.008 0.992 437 | TCGA-34-5232-01A-21R-1820-07 LUSC LUSC 0 1 438 | TCGA-34-5234-01A-01R-1635-07 LUSC LUSC 0.108 0.892 439 | TCGA-34-5236-01A-21R-1820-07 LUSC LUSC 0 1 440 | TCGA-34-5239-01A-21R-1820-07 LUSC LUSC 0 1 441 | TCGA-34-5240-01A-01R-1443-07 LUSC LUSC 0.022 0.978 442 | TCGA-34-5241-01A-01R-1443-07 LUSC LUSC 0.004 0.996 443 | TCGA-34-5927-01A-11R-1820-07 LUSC LUSC 0.006 0.994 444 | TCGA-34-5928-01A-11R-1820-07 LUSC LUSC 0.002 0.998 445 | TCGA-34-5929-01A-11R-1820-07 LUSC LUSC 0.002 0.998 446 | TCGA-34-7107-01A-11R-1949-07 LUSC LUSC 0 1 447 | TCGA-37-3783-01A-01R-1201-07 LUSC LUSC 0.008 0.992 448 | TCGA-37-3789-01A-01R-0980-07 LUSC LUSC 0.002 0.998 449 | TCGA-37-3792-01A-01R-0980-07 LUSC LUSC 0.12 0.88 450 | TCGA-37-4129-01A-01R-1100-07 LUSC LUSC 0.082 0.918 451 | TCGA-37-4130-01A-01R-1100-07 LUSC LUSC 0.116 0.884 452 | TCGA-37-4132-01A-01R-1100-07 LUSC LUSC 0.012 0.988 453 | TCGA-37-4133-01A-01R-1100-07 LUSC LUSC 0.034 0.966 454 | TCGA-37-4135-01A-01R-1100-07 LUSC LUSC 0.024 0.976 455 | TCGA-37-4141-01A-02R-1100-07 LUSC LUSC 0.008 0.992 456 | TCGA-37-5819-01A-01R-1635-07 LUSC LUSC 0.032 0.968 457 | TCGA-39-5011-01A-01R-1443-07 LUSC LUSC 0.032 0.968 458 | TCGA-39-5016-01A-01R-1443-07 LUSC LUSC 0 1 459 | TCGA-39-5019-01A-01R-1820-07 LUSC LUSC 0.122 0.878 460 | TCGA-39-5021-01A-01R-1443-07 LUSC LUSC 0 1 461 | TCGA-39-5024-01A-21R-1820-07 LUSC LUSC 0.05 0.95 462 | TCGA-39-5027-01A-21R-1820-07 LUSC LUSC 0 1 463 | TCGA-39-5028-01A-01R-1443-07 LUSC LUSC 0.002 0.998 464 | TCGA-39-5029-01A-01R-1443-07 LUSC LUSC 0 1 465 | TCGA-39-5030-01A-01R-1443-07 LUSC LUSC 0.002 0.998 466 | TCGA-39-5031-01A-01R-1443-07 LUSC LUSC 0 1 467 | TCGA-39-5034-01A-01R-1443-07 LUSC LUSC 0.026 0.974 468 | TCGA-39-5035-01A-01R-1443-07 LUSC LUSC 0.01 0.99 469 | TCGA-39-5036-01A-01R-1443-07 LUSC LUSC 0 1 470 | TCGA-39-5037-01A-01R-1443-07 LUSC LUSC 0.002 0.998 471 | TCGA-39-5039-01A-01R-1443-07 LUSC LUSC 0.006 0.994 472 | TCGA-43-2578-01A-01R-0851-07 LUSC LUSC 0.002 0.998 473 | TCGA-43-2581-01A-01R-0851-07 LUSC LUSC 0.104 0.896 474 | TCGA-43-3394-01A-01R-0980-07 LUSC LUSC 0 1 475 | TCGA-43-3920-01A-01R-0980-07 LUSC LUSC 0.026 0.974 476 | TCGA-43-5668-01A-01R-1635-07 LUSC LUSC 0.016 0.984 477 | TCGA-43-6143-01A-11R-1820-07 LUSC LUSC 0.01 0.99 478 | TCGA-43-6647-01A-11R-1820-07 LUSC LUSC 0.006 0.994 479 | TCGA-43-6770-01A-11R-1820-07 LUSC LUSC 0 1 480 | TCGA-43-6771-01A-11R-1820-07 LUSC LUSC 0.066 0.934 481 | TCGA-46-3765-01A-01R-0980-07 LUSC LUSC 0 1 482 | TCGA-46-3766-01A-01R-0980-07 LUSC LUSC 0.026 0.974 483 | TCGA-46-3767-01A-01R-0980-07 LUSC LUSC 0.006 0.994 484 | TCGA-46-3768-01A-01R-0980-07 LUSC LUSC 0 1 485 | TCGA-46-3769-01A-01R-0980-07 LUSC LUSC 0.034 0.966 486 | TCGA-46-6025-01A-11R-1820-07 LUSC LUSC 0.004 0.996 487 | TCGA-46-6026-01A-11R-1820-07 LUSC LUSC 0.052 0.948 488 | TCGA-51-4079-01A-01R-1100-07 LUSC LUSC 0 1 489 | TCGA-51-4080-01A-01R-1100-07 LUSC LUSC 0.002 0.998 490 | TCGA-51-4081-01A-01R-1100-07 LUSC LUSC 0 1 491 | TCGA-56-1622-01A-01R-0692-07 LUSC LUSC 0.008 0.992 492 | TCGA-56-5897-01A-11R-1635-07 LUSC LUSC 0 1 493 | TCGA-56-5898-01A-11R-1635-07 LUSC LUSC 0.008 0.992 494 | TCGA-56-6545-01A-11R-1820-07 LUSC LUSC 0.036 0.964 495 | TCGA-56-6546-01A-11R-1820-07 LUSC LUSC 0.004 0.996 496 | TCGA-60-2695-01A-01R-0851-07 LUSC LUSC 0.004 0.996 497 | TCGA-60-2696-01A-01R-0851-07 LUSC LUSC 0.006 0.994 498 | TCGA-60-2698-01A-01R-0851-07 LUSC LUSC 0.038 0.962 499 | TCGA-60-2706-01A-01R-0851-07 LUSC LUSC 0.032 0.968 500 | TCGA-60-2707-01A-01R-0851-07 LUSC LUSC 0.002 0.998 501 | TCGA-60-2708-01A-01R-0851-07 LUSC LUSC 0 1 502 | TCGA-60-2709-01A-21R-1820-07 LUSC LUSC 0.004 0.996 503 | TCGA-60-2710-01A-01R-0851-07 LUSC LUSC 0.004 0.996 504 | TCGA-60-2711-01A-01R-0851-07 LUSC LUSC 0 1 505 | TCGA-60-2712-01A-01R-0851-07 LUSC LUSC 0 1 506 | TCGA-60-2713-01A-01R-0851-07 LUSC LUSC 0 1 507 | TCGA-60-2714-01A-01R-0851-07 LUSC LUSC 0.258 0.742 508 | TCGA-60-2715-01A-01R-0851-07 LUSC LUSC 0.002 0.998 509 | TCGA-60-2716-01A-01R-0851-07 LUSC LUSC 0.026 0.974 510 | TCGA-60-2719-01A-01R-0851-07 LUSC LUSC 0 1 511 | TCGA-60-2720-01A-01R-0851-07 LUSC LUSC 0.016 0.984 512 | TCGA-60-2721-01A-01R-0851-07 LUSC LUSC 0 1 513 | TCGA-60-2722-01A-01R-0851-07 LUSC LUSC 0 1 514 | TCGA-60-2723-01A-01R-0851-07 LUSC LUSC 0 1 515 | TCGA-60-2724-01A-01R-0851-07 LUSC LUSC 0.008 0.992 516 | TCGA-60-2725-01A-01R-1201-07 LUSC LUSC 0.002 0.998 517 | TCGA-60-2726-01A-01R-0851-07 LUSC LUSC 0.008 0.992 518 | TCGA-63-5128-01A-01R-1443-07 LUSC LUSC 0.014 0.986 519 | TCGA-63-5131-01A-01R-1443-07 LUSC LUSC 0.012 0.988 520 | TCGA-63-6202-01A-11R-1820-07 LUSC LUSC 0.068 0.932 521 | TCGA-63-7020-01A-11R-1949-07 LUSC LUSC 0.006 0.994 522 | TCGA-63-7021-01A-11R-1949-07 LUSC LUSC 0.028 0.972 523 | TCGA-63-7022-01A-11R-1949-07 LUSC LUSC 0 1 524 | TCGA-63-7023-01A-11R-1949-07 LUSC LUSC 0 1 525 | TCGA-66-2727-01A-01R-0980-07 LUSC LUSC 0 1 526 | TCGA-66-2734-01A-01R-0980-07 LUSC LUSC 0 1 527 | TCGA-66-2737-01A-01R-0980-07 LUSC LUSC 0 1 528 | TCGA-66-2742-01A-01R-0980-07 LUSC LUSC 0 1 529 | TCGA-66-2744-01A-01R-0980-07 LUSC LUSC 0.004 0.996 530 | TCGA-66-2753-01A-01R-0980-07 LUSC LUSC 0.01 0.99 531 | TCGA-66-2754-01A-01R-0980-07 LUSC LUSC 0.02 0.98 532 | TCGA-66-2755-01A-01R-0851-07 LUSC LUSC 0.002 0.998 533 | TCGA-66-2756-01A-01R-0851-07 LUSC LUSC 0.088 0.912 534 | TCGA-66-2757-01A-01R-0851-07 LUSC LUSC 0.004 0.996 535 | TCGA-66-2758-01A-02R-0851-07 LUSC LUSC 0 1 536 | TCGA-66-2759-01A-01R-0851-07 LUSC LUSC 0 1 537 | TCGA-66-2763-01A-01R-0851-07 LUSC LUSC 0.006 0.994 538 | TCGA-66-2765-01A-01R-0851-07 LUSC LUSC 0.02 0.98 539 | TCGA-66-2766-01A-01R-0851-07 LUSC LUSC 0.01 0.99 540 | TCGA-66-2767-01A-01R-0851-07 LUSC LUSC 0 1 541 | TCGA-66-2768-01A-01R-0851-07 LUSC LUSC 0 1 542 | TCGA-66-2769-01A-02R-0851-07 LUSC LUSC 0.002 0.998 543 | TCGA-66-2770-01A-01R-0851-07 LUSC LUSC 0 1 544 | TCGA-66-2771-01A-01R-0980-07 LUSC LUSC 0.002 0.998 545 | TCGA-66-2773-01A-01R-1201-07 LUSC LUSC 0.002 0.998 546 | TCGA-66-2777-01A-01R-1201-07 LUSC LUSC 0 1 547 | TCGA-66-2778-01A-02R-0851-07 LUSC LUSC 0.014 0.986 548 | TCGA-66-2780-01A-01R-0851-07 LUSC LUSC 0.002 0.998 549 | TCGA-66-2781-01A-01R-0851-07 LUSC LUSC 0 1 550 | TCGA-66-2782-01A-01R-0851-07 LUSC LUSC 0 1 551 | TCGA-66-2783-01A-01R-1201-07 LUSC LUSC 0 1 552 | TCGA-66-2785-01A-01R-0851-07 LUSC LUSC 0.024 0.976 553 | TCGA-66-2786-01A-01R-0851-07 LUSC LUSC 0 1 554 | TCGA-66-2787-01A-01R-0980-07 LUSC LUSC 0 1 555 | TCGA-66-2788-01A-01R-0980-07 LUSC LUSC 0.002 0.998 556 | TCGA-66-2789-01A-01R-0980-07 LUSC LUSC 0.004 0.996 557 | TCGA-66-2790-01A-01R-0980-07 LUSC LUSC 0.008 0.992 558 | TCGA-66-2791-01A-01R-0980-07 LUSC LUSC 0 1 559 | TCGA-66-2792-01A-01R-0980-07 LUSC LUSC 0 1 560 | TCGA-66-2793-01A-01R-1201-07 LUSC LUSC 0.016 0.984 561 | TCGA-66-2794-01A-01R-1201-07 LUSC LUSC 0 1 562 | TCGA-66-2795-01A-02R-0980-07 LUSC LUSC 0.008 0.992 563 | TCGA-66-2800-01A-01R-1201-07 LUSC LUSC 0.002 0.998 564 | TCGA-70-6722-01A-11R-1820-07 LUSC LUSC 0.012 0.988 565 | TCGA-70-6723-01A-11R-1820-07 LUSC LUSC 0.026 0.974 566 | TCGA-77-6842-01A-11R-1949-07 LUSC LUSC 0.002 0.998 567 | TCGA-77-6843-01A-11R-1949-07 LUSC LUSC 0.006 0.994 568 | TCGA-77-6844-01A-11R-1949-07 LUSC LUSC 0.004 0.996 569 | TCGA-77-6845-01A-11R-1949-07 LUSC LUSC 0 1 570 | TCGA-79-5596-01A-31R-1949-07 LUSC LUSC 0 1 571 | TCGA-85-6175-01A-11R-1820-07 LUSC LUSC 0.138 0.862 572 | TCGA-85-6560-01A-11R-1820-07 LUSC LUAD 0.882 0.118 573 | TCGA-85-6561-01A-11R-1820-07 LUSC LUSC 0.02 0.98 574 | TCGA-85-6798-01A-11R-1949-07 LUSC LUSC 0 1 575 | TCGA-90-6837-01A-11R-1949-07 LUSC LUSC 0 1 576 | TCGA-94-7033-01A-11R-1949-07 LUSC LUSC 0.008 0.992 577 | -------------------------------------------------------------------------------- /Codes/BuildMatrixFile.py: -------------------------------------------------------------------------------- 1 | import os, sys, glob 2 | from utilities import * 3 | 4 | inFilePattern = sys.argv[1] 5 | outFilePath = sys.argv[2] 6 | 7 | inFilePaths = sorted(glob.glob(inFilePattern)) 8 | sampleIDs = [os.path.basename(x) for x in inFilePaths] 9 | 10 | features = set() 11 | for inFilePath in inFilePaths: 12 | print "Identifying features in %s" % inFilePath 13 | for line in file(inFilePath): 14 | features.add(line.rstrip().split("\t")[0]) 15 | features = sorted(list(features)) 16 | 17 | outData = [[""] + features] 18 | for inFilePath in inFilePaths: 19 | print "Parsing and saving values for %s" % inFilePath 20 | sampleID = os.path.basename(inFilePath) 21 | 22 | valueDict = {} 23 | for line in file(inFilePath): 24 | lineItems = line.rstrip().split("\t") 25 | valueDict[lineItems[0]] = lineItems[1] 26 | 27 | values = [valueDict[feature] for feature in features] 28 | outData.append([sampleID] + values) 29 | 30 | print "Transposing and saving to %s" % outFilePath 31 | writeMatrixToFile(transposeMatrix(outData), outFilePath) 32 | -------------------------------------------------------------------------------- /Codes/CalcAUC.R: -------------------------------------------------------------------------------- 1 | library(pROC) 2 | 3 | inFilePath = commandArgs()[7] 4 | actualColumnName = commandArgs()[8] 5 | probabilitiesColumnName = commandArgs()[9] 6 | outFilePath = commandArgs()[10] 7 | main = commandArgs()[11] 8 | 9 | data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F) 10 | 11 | actual = as.factor(data[,actualColumnName]) 12 | probabilities = as.numeric(data[,probabilitiesColumnName]) 13 | 14 | pdf(outFilePath) 15 | par(mar=c(4.5, 4.7, 0.0, 0.5), lwd=4) 16 | 17 | roc_result = roc(actual ~ probabilities, ci=TRUE, plot=TRUE, print.auc=FALSE) 18 | lowerBoundAuc = format(roc_result$ci[1], digits=3) 19 | midAuc = format(roc_result$ci[2], digits=3) 20 | upperBoundAuc = format(roc_result$ci[3], digits=3) 21 | 22 | ci(roc_result) 23 | sens.ci <- ci.se(roc_result) 24 | plot(sens.ci, type="shape", col="gray95") 25 | plot(sens.ci, type="bars") 26 | plot(roc_result, add=TRUE) 27 | 28 | text(0.5, 0.00, labels=paste("AUC: ", midAuc, " (", lowerBoundAuc, "-", upperBoundAuc, ")", sep="")) 29 | title(main) 30 | 31 | par(mar=c(5.1, 4.1, 2.1, 2.1)) 32 | graphics.off() 33 | 34 | print(c(lowerBoundAuc, midAuc, upperBoundAuc)) 35 | -------------------------------------------------------------------------------- /Codes/CalcAccuracy.R: -------------------------------------------------------------------------------- 1 | library(pROC) 2 | 3 | inFilePath = commandArgs()[7] 4 | actualColumnName = commandArgs()[8] 5 | predColumnName = commandArgs()[9] 6 | 7 | data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F) 8 | 9 | actual = data[,actualColumnName] 10 | pred = data[,predColumnName] 11 | 12 | accuracy = sum(actual == pred) / nrow(data) 13 | 14 | print(accuracy) 15 | -------------------------------------------------------------------------------- /Codes/Classify_luad_vs_lusc.R: -------------------------------------------------------------------------------- 1 | library(caret) 2 | 3 | outFilePath12 = "Classification_12_LUAD_LUSC_Predictions.txt" 4 | outFilePath20 = "Classification_20_LUAD_LUSC_Predictions.txt" 5 | 6 | # Read data from file 7 | setwd("Analysis_datasets") 8 | luad12 = read.table("12_LUAD_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F) 9 | lusc12 = read.table("12_LUSC_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F) 10 | lu12 = cbind(luad12,lusc12) 11 | luad20 = read.table("20_LUAD_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F) 12 | lusc20 = read.table("20_LUSC_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F) 13 | lu20 = cbind(luad20,lusc20) 14 | 15 | # Only keep the same samples in TCGA processed versus Rsubread processed data 16 | lu20_f = lu20[,colnames(lu20)%in%colnames(lu12)] 17 | 18 | # Remove class values from data frame "LGG"==rownames(data)[9752] 19 | classes12 = as.factor(as.character(lu12[nrow(lu12),])) 20 | data12 = t(data.matrix(lu12[-nrow(lu12),])) 21 | classes20 = as.factor(as.character(lu20_f[nrow(lu20_f),])) 22 | data20 = t(data.matrix(lu20_f[-nrow(lu20_f),])) 23 | 24 | # Retain features that do not have zero variance 25 | data12 = data12[,which(apply(data12, 2, var) > 0)] 26 | data20 = data20[,which(apply(data20, 2, var) > 0)] 27 | 28 | # Set random seed so results are same each time 29 | set.seed(0) 30 | 31 | # Build the classification model 32 | mod12 <- train(classes12~., data=data12, method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T)) 33 | set.seed(0) 34 | mod20 <- train(classes20~., data=data20, method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T)) 35 | 36 | # Determine which mtry parameter value performed best 37 | tuneResults12 = mod12$results[order(mod12$results$Accuracy, decreasing=TRUE),] 38 | bestMtry12 = tuneResults12[1,]$mtry 39 | tuneResults20 = mod20$results[order(mod20$results$Accuracy, decreasing=TRUE),] 40 | bestMtry20 = tuneResults20[1,]$mtry 41 | 42 | # Select predictions that coincide with best mtry parameter 43 | predictions12 = mod12$pred[which(mod12$pred$mtry == bestMtry12),] 44 | predictions20 = mod20$pred[which(mod20$pred$mtry == bestMtry20),] 45 | 46 | # Sort predictions by the original order 47 | predictions12 = predictions12[order(predictions12$rowIndex),] 48 | predictions20 = predictions20[order(predictions20$rowIndex),] 49 | 50 | # Build output matrix 51 | output12 = cbind(rownames(data12), predictions12[,2], predictions12[,1], predictions12[,3:(ncol(predictions12) - 3)]) 52 | colnames(output12) = c("SampleID", "ActualClass", "PredictedClass", paste(colnames(output12)[4:ncol(output12)], "Probability", sep="_")) 53 | output20 = cbind(rownames(data20), predictions20[,2], predictions20[,1], predictions20[,3:(ncol(predictions20) - 3)]) 54 | colnames(output20) = c("SampleID", "ActualClass", "PredictedClass", paste(colnames(output20)[4:ncol(output20)], "Probability", sep="_")) 55 | 56 | # Save predictions to output file 57 | write.table(output12, outFilePath12, sep="\t", col.names=T, row.names=F, quote=F) 58 | write.table(output20, outFilePath20, sep="\t", col.names=T, row.names=F, quote=F) 59 | -------------------------------------------------------------------------------- /Codes/CombineScalarValues.py: -------------------------------------------------------------------------------- 1 | import os, sys, glob 2 | from utilities import * 3 | 4 | inFilePattern = sys.argv[1] 5 | outFilePath = sys.argv[2] 6 | 7 | outFile = open(outFilePath, 'w') 8 | 9 | for inFilePath in glob.glob(inFilePattern): 10 | outFile.write("%s\t%s\n" % (os.path.basename(inFilePath), readScalarFromFile(inFilePath))) 11 | 12 | outFile.close() 13 | -------------------------------------------------------------------------------- /Codes/FileContainsText.py: -------------------------------------------------------------------------------- 1 | import os, sys, glob 2 | from utilities import * 3 | 4 | inFilePath = sys.argv[1] 5 | searchPattern = sys.argv[2].decode('string-escape') 6 | 7 | print searchPattern in readTextFromFile(inFilePath) 8 | -------------------------------------------------------------------------------- /Codes/GetFileExtension.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | inFilePath = sys.argv[1] 4 | 5 | file, ext = os.path.splitext(inFilePath) 6 | 7 | print ext 8 | -------------------------------------------------------------------------------- /Codes/IdentifyDiscordantPredictions.R: -------------------------------------------------------------------------------- 1 | inFilePath = commandArgs()[7] 2 | actualColumnName = commandArgs()[8] 3 | predictedColumnName = commandArgs()[9] 4 | potentiallyDiscordantFilePath = commandArgs()[10] 5 | 6 | data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F) 7 | 8 | incorrect = data[which(data[,actualColumnName]!=data[,predictedColumnName]),] 9 | 10 | potentiallyDiscordantSamples = scan("Potentially_Discordant_LUSC_Samples.txt", what=character(), quiet=TRUE) 11 | 12 | print("Samples predicted incorrectly:") 13 | print(nrow(incorrect)) 14 | 15 | print("Samples predicted incorrectly that were identified previously as potentially discordant:") 16 | print(nrow(incorrect[which(incorrect$row.names %in% potentiallyDiscordantSamples),])) 17 | -------------------------------------------------------------------------------- /Codes/IdentifyInconsistentPredictions.R: -------------------------------------------------------------------------------- 1 | inFilePath1 = commandArgs()[7] 2 | inFilePath2 = commandArgs()[8] 3 | actualColumnName = commandArgs()[9] 4 | predictedColumnName = commandArgs()[10] 5 | 6 | data1 = read.table(inFilePath1, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F) 7 | data2 = read.table(inFilePath2, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F) 8 | 9 | incorrect1 = data1[which(data1[,actualColumnName]!=data1[,predictedColumnName]),] 10 | incorrect2 = data2[which(data2[,actualColumnName]!=data2[,predictedColumnName]),] 11 | 12 | print(nrow(incorrect1)) 13 | print(nrow(incorrect2)) 14 | 15 | diff12 = setdiff(incorrect1$row.names, incorrect2$row.names) 16 | diff21 = setdiff(incorrect2$row.names, incorrect1$row.names) 17 | diffs = c(diff12, diff21) 18 | 19 | print("Samples predicted inconsistently between two data sets:") 20 | data = merge(data1, data2, by=1) 21 | print(data[which(data$row.names %in% diffs),]) 22 | -------------------------------------------------------------------------------- /Codes/LUSC_vs_LUAD.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(stringr) 3 | library(heatmap3) 4 | library(caret) 5 | library(pROC) 6 | 7 | readData = function(filePath, logTransform=FALSE) 8 | { 9 | data = fread(filePath) 10 | 11 | data = data.frame(data[-nrow(data),]) 12 | rownames(data) = data[,1] 13 | data = data[,-1] 14 | data = data.matrix(data) 15 | 16 | if (logTransform) 17 | data = log2(data + 1) 18 | 19 | return(data) 20 | } 21 | 22 | mergeData = function(data1, data2) 23 | { 24 | merged = merge(data1, data2, by=0, sort=FALSE) 25 | rownames(merged) = merged[,1] 26 | merged = merged[,-1] 27 | } 28 | 29 | crossValidate = function(data, outPrefix) 30 | { 31 | # Remove any genes with no variance 32 | data = data[which(apply(data, 1, var) > 0),] 33 | write.table(dim(data), paste(outPrefix, "_Dimensions.txt", sep="")) 34 | 35 | library(doParallel) 36 | registerDoParallel(cores=12) 37 | 38 | # From http://stackoverflow.com/questions/13403427/fully-reproducible-parallel-models-using-caret 39 | # Unfortunately, it doesn't seem to ensure that the results are the same for multiple iterations 40 | set.seed(0) 41 | seeds <- vector(mode = "list", length = 11) # length is = (n_repeats*nresampling)+1 42 | for(i in 1:10) seeds[[i]] <- sample.int(n=1000, 3) #(3 is the number of tuning parameter, mtry for rf, here equal to ncol(iris)-2) 43 | seeds[[11]]<-sample.int(1000, 1)#for the last model 44 | 45 | model <- train(classes~., data=t(data), method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T), seeds=seeds) 46 | 47 | tuneResults = model$results[order(model$results$Accuracy, decreasing=TRUE),] 48 | bestMtry = tuneResults[1,]$mtry 49 | 50 | # Select predictions that coincide with best mtry parameter 51 | predictions = model$pred[which(model$pred$mtry == bestMtry),] 52 | 53 | # Sort predictions by the original order 54 | predictions = predictions[order(predictions$rowIndex),] 55 | 56 | rownames(predictions) = gsub("\\.", "-", colnames(data)) 57 | 58 | write.table(predictions, paste(outPrefix, "_Predictions.txt", sep=""), sep="\t", quote=F, row.names=T, col.names=T) 59 | 60 | featureImportance <- varImp(model, scale = TRUE)$importance 61 | featureImportance <- featureImportance[order(featureImportance$Overall, decreasing=TRUE),,drop=FALSE] 62 | write.table(featureImportance, paste(outPrefix, "_FeatureImportance.txt", sep=""), quote=FALSE, row.names=T, col.names=NA, sep="\t") 63 | } 64 | 65 | identifyDiffExpressedGenes = function(data1, data2, n) 66 | { 67 | data1Mean = apply(data1, 1, mean) 68 | data2Mean = apply(data2, 1, mean) 69 | ratios = (data1Mean + 1) / (data2Mean + 1) 70 | ratios = sort(ratios, decreasing=TRUE) 71 | genesToPlot = c(names(head(ratios, n=n)), names(tail(ratios, n=n))) 72 | 73 | return(genesToPlot) 74 | } 75 | 76 | tcgaLuad = readData("12_LUAD_t.txt", logTransform=TRUE) 77 | tcgaLusc = readData("12_LUSC_t.txt", logTransform=TRUE) 78 | rsubreadLuad = readData("20_LUAD_t.txt") 79 | rsubreadLusc = readData("20_LUSC_t.txt") 80 | 81 | # Extract gene symbols from row names 82 | rownames(tcgaLuad) = sapply(rownames(tcgaLuad), function(x) { str_split(x, "\\|")[[1]][1] }) 83 | rownames(tcgaLusc) = sapply(rownames(tcgaLusc), function(x) { str_split(x, "\\|")[[1]][1] }) 84 | 85 | # Find genes that are common across both data sets 86 | commonTcgaGenes = intersect(rownames(tcgaLuad), rownames(tcgaLusc)) 87 | commonRsubreadGenes = intersect(rownames(rsubreadLuad), rownames(rsubreadLusc)) 88 | commonGenes = intersect(commonTcgaGenes, commonRsubreadGenes) 89 | nonOverlappingGenes = setdiff(commonRsubreadGenes, commonTcgaGenes) 90 | 91 | # Find samples that are common across both data sets 92 | commonLuadSamples = intersect(colnames(tcgaLuad), colnames(rsubreadLuad)) 93 | commonLuscSamples = intersect(colnames(tcgaLusc), colnames(rsubreadLusc)) 94 | 95 | # Select common genes, samples of interest 96 | tcgaLuad = tcgaLuad[commonTcgaGenes,commonLuadSamples] 97 | tcgaLusc = tcgaLusc[commonTcgaGenes,commonLuscSamples] 98 | rsubreadLuad = rsubreadLuad[commonRsubreadGenes,commonLuadSamples] 99 | rsubreadLusc = rsubreadLusc[commonRsubreadGenes,commonLuscSamples] 100 | 101 | classesLuad = rep("LUAD", ncol(tcgaLuad)) 102 | classesLusc = rep("LUSC", ncol(tcgaLusc)) 103 | classes = as.factor(c(classesLuad, classesLusc)) 104 | 105 | tcga = mergeData(tcgaLuad, tcgaLusc) 106 | rsubread = mergeData(rsubreadLuad, rsubreadLusc) 107 | 108 | # Remove any genes with no variance 109 | tcga = tcga[which(apply(tcga, 1, var) > 0),] 110 | rsubread = rsubread[which(apply(rsubread, 1, var) > 0),] 111 | 112 | crossValidate(tcga, "TCGA_AllGenes") 113 | crossValidate(rsubread, "RSubread_AllGenes") 114 | crossValidate(tcga[commonGenes,], "TCGA_CommonGenes") 115 | crossValidate(rsubread[commonGenes,], "RSubread_CommonGenes") 116 | crossValidate(rsubread[nonOverlappingGenes,], "RSubread_NonOverlappingGenes") 117 | 118 | # Identify top differentially expressed genes 119 | tcgaDiffExpressedGenes = identifyDiffExpressedGenes(tcgaLuad, tcgaLusc, 100) 120 | rsubreadNonOverlappingDiffExpressedGenes = identifyDiffExpressedGenes(rsubreadLuad[nonOverlappingGenes,], rsubreadLusc[nonOverlappingGenes,], 100) 121 | 122 | # Get potentially discordant samples 123 | luscDiscordantSamples = scan("Potentially_Discordant_LUSC_Samples.txt", what=character(), quiet=TRUE) 124 | luscDiscordantSamples = str_replace_all(luscDiscordantSamples, "\\-", ".") 125 | luscDiscordantSamples = intersect(luscDiscordantSamples, commonLuscSamples) 126 | 127 | tcgaLuscDiscordant = tcgaLusc[,luscDiscordantSamples] 128 | tcgaLusc = tcgaLusc[,setdiff(colnames(tcgaLusc), luscDiscordantSamples)] 129 | tcga = mergeData(tcgaLuad, tcgaLusc) 130 | tcga = mergeData(tcga, tcgaLuscDiscordant) 131 | 132 | rsubreadLuscDiscordant = rsubreadLusc[,luscDiscordantSamples] 133 | rsubreadLusc = rsubreadLusc[,setdiff(colnames(rsubreadLusc), luscDiscordantSamples)] 134 | rsubread = mergeData(rsubreadLuad, rsubreadLusc) 135 | rsubread = mergeData(rsubread, rsubreadLuscDiscordant) 136 | 137 | #discordantDiffExpressedGenes = identifyDiffExpressedGenes(rsubreadLuad[nonOverlappingGenes,], rsubreadLuscDiscordant[nonOverlappingGenes,], 5) 138 | discordantDiffExpressedGenes = c("MIR320A", "MIR1234", "MIR4461", "MIR186") 139 | 140 | colnames(rsubread) = str_replace_all(colnames(rsubread), "\\.", "-") 141 | write.table(rsubread[discordantDiffExpressedGenes,], "RSubread_Discordant_DiffExpressedGenes_Data.txt", sep="\t", quote=F, col.names=NA, row.names=T) 142 | 143 | classes = c(classesLuad, rep("LUSC", ncol(rsubreadLusc)), rep("Discordant LUSC", ncol(rsubreadLuscDiscordant))) 144 | classes = cbind(colnames(rsubread), classes) 145 | write.table(classes, "RSubread_Discordant_Classes.txt", sep="\t", quote=F, col.names=F, row.names=F) 146 | -------------------------------------------------------------------------------- /Codes/ParseCgHubQueryResults.py: -------------------------------------------------------------------------------- 1 | import os, sys, glob 2 | 3 | inFilePath = sys.argv[1] 4 | sampleFilePath = sys.argv[2] 5 | outDownloadSamplesDirPath = sys.argv[3] 6 | outCancerTypesDirPath = sys.argv[4] 7 | 8 | def parseTagValue(lines, key): 9 | for line in lines: 10 | line = line.strip() 11 | 12 | if line.startswith("<%s>" % key): 13 | return line.replace("/", "").replace("<%s>" % key, "") 14 | 15 | return None 16 | 17 | def saveOutput(outLines): 18 | legacyID = parseTagValue(outLines, "legacy_sample_id") 19 | 20 | if sampleFilePath == "" or legacyID in samplesToKeep: 21 | analysisID = parseTagValue(outLines, "analysis_id") 22 | 23 | if analysisID != None: 24 | outFilePath = "%s/%s" % (outDownloadSamplesDirPath, legacyID) 25 | if os.path.exists(outFilePath): 26 | print "%s already exists" % outFilePath 27 | outFile = open(outFilePath, 'w') 28 | outFile.write("%s\n" % analysisID) 29 | outFile.close() 30 | 31 | cancerType = parseTagValue(outLines, "disease_abbr") 32 | if cancerType == None: 33 | print "Cancer type was not specified for %s." % analysisID 34 | exit(1) 35 | outFile = open("%s/%s" % (outCancerTypesDirPath, legacyID), 'w') 36 | outFile.write("%s\n" % cancerType) 37 | outFile.close() 38 | 39 | return legacyID 40 | 41 | return None 42 | 43 | inFileLines = [line for line in file(inFilePath)] 44 | 45 | headerLine1 = inFileLines.pop(0) 46 | headerLine2 = inFileLines.pop(0) 47 | 48 | if "Query" in inFileLines[0]: 49 | inFileLines.pop(0) 50 | inFileLines.pop(0) 51 | 52 | footerLine = inFileLines.pop(len(inFileLines)-1) 53 | 54 | if sampleFilePath != "": 55 | samplesToKeep = set([line.rstrip() for line in file(sampleFilePath)]) 56 | 57 | samplesSaved = set() 58 | 59 | outLines = [] 60 | 61 | for line in inFileLines: 62 | if " 0),] 29 | 30 | accent = brewer.pal(8, "Accent") 31 | set3 = brewer.pal(12, "Set3") 32 | cols = c(accent[1], set3[12], accent[7]) 33 | 34 | ColSideColors = as.character(classes[,1]) 35 | ColSideColors[ColSideColors=="LUAD"] = cols[1] 36 | ColSideColors[ColSideColors=="LUSC"] = cols[2] 37 | ColSideColors[ColSideColors=="Discordant LUSC"] = cols[3] 38 | par(lwd=4) 39 | if (nrow(data) <= 10) 40 | for (gene in rownames(data)) 41 | plotHist(gene, data, classes[,1], paste("RSubread_", gene, "_Histogram.pdf", sep="")) 42 | 43 | pdf("RSubread_Discordant_Heatmap.pdf") 44 | colnames(data) = rep("", ncol(data)) 45 | if (nrow(data) > 20) 46 | rownames(data) = rep("", nrow(data)) 47 | heatmap3(data, Colv=NA, Rowv=TRUE, showRowDendro=T, showColDendro=F, cexRow=3, margins=c(5, 12), ColSideColors=ColSideColors, ColSideLabs="", cex=1.5) 48 | legend("top", legend=c("LUAD", "LUSC", "Discordant LUSC"), col=cols, cex=1.1, lty=1, lwd=4, inset=-0.07, xpd=TRUE, box.lwd=0, box.lty=0, horiz=F) 49 | graphics.off() 50 | -------------------------------------------------------------------------------- /Codes/PrintMatrixDimensions.py: -------------------------------------------------------------------------------- 1 | import os, sys, glob 2 | import utilities 3 | 4 | inFilePath = sys.argv[1] 5 | 6 | inFile = open(inFilePath) 7 | numCols = len(inFile.readline().rstrip().split("\t")) 8 | numRows = 1 9 | for line in inFile: 10 | numRows += 1 11 | inFile.close() 12 | 13 | print "Number Rows: %i" % numRows 14 | print "Number Columns: %i" % numCols 15 | -------------------------------------------------------------------------------- /Codes/ProcessClinicalData.R: -------------------------------------------------------------------------------- 1 | if (!require("plyr")) { 2 | install.packages("plyr", dependencies = TRUE) 3 | library(plyr) 4 | } 5 | 6 | data==identifiers=tmp_data=tmp_identifier=NULL 7 | dirname='.' 8 | setwd(dirname)#Set the directory where the clinical data is located for each cancer in separate folder 9 | filenames<-system("ls */nationwidechildrens.org_clinical_patient*", intern=T) 10 | for(i in 1:length(filenames)){#####iterating through each of the clinical files to create new matrix files with ALL clinical variables 11 | print(i) 12 | f<-(read.delim(paste(c(dirname,filenames[i]), collapse=''))) ###reading in the filess one at a time 13 | tmp_data<-f[3:nrow(f),] 14 | tmp_identifier<-f[1:2,] 15 | if(i==1){ 16 | data<-tmp_data 17 | identifier<-tmp_identifier 18 | }else{ 19 | identifier<-list(identifier,tmp_identifier) 20 | identifier<-rbind.fill.matrix(identifier) 21 | for(j in 1:ncol(identifier)){ 22 | if(!is.na(identifier[3,j])){ 23 | identifier[1,j]<-identifier[3,j] 24 | identifier[2,j]<-identifier[4,j] 25 | } 26 | } 27 | identifier<-identifier[1:2,] 28 | data<-list(data,tmp_data) 29 | data<-rbind.fill.matrix(data) 30 | #data<-merge(data,f) 31 | } 32 | } 33 | rownames(data)<-data[,2] 34 | 35 | #Now, converting short TCGA ids reported in clinical data to long TCGA ids reported in RNA-Seq dataset using R codes 36 | 37 | sample_names<-rownames(as.matrix(read.table("PANCAN24_CancerType_Samples.txt", row.names=1, sep='\t', check.names = F))) #getting the long TCGA IDs used in RNA-Seq dataset 38 | partial_sample_names<-rownames(data) 39 | counter=0##to check how many replacement has been done 40 | for (j in 1:length(partial_sample_names)){ 41 | if(!is.na(pmatch(partial_sample_names[j],sample_names))){ 42 | partial_sample_names[j]<-sample_names[pmatch(partial_sample_names[j],sample_names, duplicates.ok=F)] 43 | counter=counter+1 44 | } 45 | } 46 | 47 | rownames(data)<-partial_sample_names 48 | clinical_data<-matrix(NA, nrow=9264,ncol=548) ###instantiating an NA matrix 49 | rownames(clinical_data)<-sample_names 50 | colnames(clinical_data)<-colnames(data) 51 | for(i in 1:length(rownames(clinical_data))){ 52 | sample_id<-rownames(clinical_data)[i] 53 | if(sample_id%in%rownames(data)){ 54 | clinical_data[sample_id,]<-data[sample_id,] 55 | } 56 | } 57 | clinical_data_identifier<-cbind(t(identifier),t(clinical_data)) 58 | write.table(clinical_data_identifier,file="TCGA_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt", sep='\t',col.names=NA, quote=F) 59 | 60 | -------------------------------------------------------------------------------- /Codes/ProcessRnaSeqFeatureCounts.R: -------------------------------------------------------------------------------- 1 | library(Rsubread) 2 | library(limma) 3 | library(edgeR) 4 | library(tools) 5 | options(digits=2) 6 | 7 | referenceGenomeFastaFilePath = commandArgs()[7] 8 | inFilePath1 = commandArgs()[8] 9 | inFilePath2 = commandArgs()[9] # NULL for single-end analyses or when a BAM file has been specified 10 | gtfFilePath = commandArgs()[10] 11 | tempFilePrefix = commandArgs()[11] 12 | outFpkmFilePath = commandArgs()[12] 13 | outTpmFilePath = commandArgs()[13] 14 | outCountsFilePath = commandArgs()[14] 15 | outStatsFilePath = commandArgs()[15] 16 | 17 | memory = 4000 18 | nthreads = 1 19 | 20 | input_format = "gzFASTQ" 21 | if (file_ext(inFilePath1) == "bam") 22 | input_format = "BAM" 23 | if (file_ext(inFilePath1) %in% c("fastq", "fq")) 24 | input_format = "FASTQ" 25 | 26 | outBamFilePath = paste(tempFilePrefix, "bam", sep=".") 27 | 28 | referenceGenomeIndexFilePrefix = paste(referenceGenomeFastaFilePath, "__reference_index", sep="") 29 | 30 | if (!file.exists(paste(referenceGenomeIndexFilePrefix, ".reads", sep=""))) 31 | buildindex(basename=referenceGenomeIndexFilePrefix, reference=referenceGenomeFastaFilePath, memory=memory) 32 | 33 | if (inFilePath2 == "NULL") 34 | inFilePath2 = NULL 35 | 36 | if (!file.exists(outBamFilePath)) 37 | align(index=referenceGenomeIndexFilePrefix, readfile1=inFilePath1, readfile2=inFilePath2, output_file=outBamFilePath, nthreads=nthreads, input_format=input_format, tieBreakHamming=TRUE, unique=TRUE, indels=5) 38 | 39 | fCountsList = featureCounts(outBamFilePath, annot.ext=gtfFilePath, isGTFAnnotationFile=TRUE, nthreads=nthreads, isPairedEnd=!is.null(inFilePath2)) 40 | dgeList = DGEList(counts=fCountsList$counts, genes=fCountsList$annotation) 41 | fpkm = rpkm(dgeList, dgeList$genes$Length) 42 | tpm = exp(log(fpkm) - log(sum(fpkm)) + log(1e6)) 43 | 44 | write.table(fCountsList$stat, outStatsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) 45 | 46 | featureCounts = cbind(fCountsList$annotation[,1], fCountsList$counts) 47 | write.table(featureCounts, outCountsFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) 48 | 49 | write.table(cbind(fCountsList$annotation[,1], fpkm), outFpkmFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) 50 | #write.table(cbind(fCountsList$annotation[,1], log2(fpkm + 1)), outFpkmLogFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) 51 | write.table(cbind(fCountsList$annotation[,1], tpm), outTpmFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) 52 | #write.table(cbind(fCountsList$annotation[,1], log2(tpm + 1)), outTpmLogFilePath, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE) 53 | 54 | unlink(outBamFilePath) 55 | unlink(paste(outBamFilePath, ".indel", sep="")) 56 | -------------------------------------------------------------------------------- /Codes/Split.py: -------------------------------------------------------------------------------- 1 | import os, sys, glob 2 | 3 | inFilePath = sys.argv[1] 4 | outDirPath = sys.argv[2] 5 | 6 | inFile = open(inFilePath) 7 | sampleIDs = inFile.readline().rstrip().split("\t")[1:] 8 | 9 | lineCount = 0 10 | 11 | for line in inFile: 12 | lineItems = line.rstrip().split("\t") 13 | gene = lineItems.pop(0) 14 | 15 | for sampleID in sampleIDs: 16 | outFile = open(outDirPath + "/" + sampleID, 'a') 17 | outFile.write("%s\t%s\n" % (gene, lineItems.pop(0))) 18 | outFile.close() 19 | 20 | lineCount += 1 21 | if lineCount % 1000 == 0: 22 | print lineCount 23 | 24 | inFile.close() 25 | -------------------------------------------------------------------------------- /Codes/TransposeData.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import utilities 3 | 4 | inFilePath = sys.argv[1] 5 | outFilePath = sys.argv[2] 6 | 7 | data = utilities.readMatrixFromFile(inFilePath) 8 | 9 | if len(data) > 1 and len(data[0]) == len(data[1]) - 1: 10 | data[0].insert(0, " ") 11 | 12 | utilities.writeMatrixToFile(utilities.transposeMatrix(data), outFilePath) 13 | -------------------------------------------------------------------------------- /Codes/biological_rep.R: -------------------------------------------------------------------------------- 1 | find_biological_replicate<-function(matrix){ 2 | s=NULL 3 | samples=colnames(matrix) 4 | for(i in 1:ncol(matrix)){ 5 | s[i]=paste(strsplit(samples,'-')[[i]][1:3],sep='',collapse = '-') 6 | } 7 | sum(duplicated(s)) 8 | s=s[duplicated(s)] 9 | dupsamples=NULL 10 | counter=0 11 | for(i in 1:length(samples)){ 12 | tmp=paste(strsplit(samples[i],'-')[[1]][1:3],sep='',collapse="-") 13 | print(tmp) 14 | if(tmp%in%s){ 15 | print("biological replicate found!!") 16 | print(rownames(samples))[i] 17 | dupsamples=c(dupsamples,samples[i]) 18 | counter=counter+1 19 | } 20 | } 21 | 22 | print(paste(counter,"samples are duplicated for biological replicates")) 23 | return (matrix[,colnames(matrix)%in%dupsamples]) 24 | 25 | } 26 | 27 | 28 | 29 | library(data.table) 30 | 31 | samples<-read.table("~/Downloads/GSE62944_TCGA_20_CancerType_Samples.txt",row.names=1) 32 | # tcga20<-data.frame(fread("~/Desktop/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_FeatureCounts.txt"),row.names=1,check.names = F) 33 | # dim(tcga20) 34 | # filt_20<-find_biological_replicate(rownames(samples),tcga20) 35 | # dim(filt_20) 36 | # tcga20_zero<-apply(tcga20==0,2,sum) 37 | tcga24_tpm<-data.frame(fread("~/Desktop/PANCAN24/PANCAN24.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_TPM.txt"),row.names=1,check.names = F) 38 | dim(tcga20_tpm) 39 | 40 | normal_tpm<-data.frame(fread("~/Desktop/PANCAN24/TCGA24.IlluminaHiSeq_RNASeqV2.normal_Rsubread_TPM.txt"),row.names=1,check.names = F) 41 | dim(normal_tpm) 42 | colnames(normal_tpm) 43 | s=NULL 44 | samples=colnames(normal_tpm) 45 | for(i in 1:ncol(normal_tpm)){ 46 | s[i]=paste(strsplit(samples,'-')[[i]][1:3],sep='',collapse = '-') 47 | } 48 | tumor_s=NULL 49 | samples=colnames(tcga24_tpm) 50 | for(i in 1:ncol(tcga24_tpm)){ 51 | tumor_s[i]=paste(strsplit(samples,'-')[[i]][1:3],sep='',collapse = '-') 52 | } 53 | 54 | 55 | 56 | tcga20_tpm<-data.frame(fread("~/Desktop/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_TPM_10_9.txt"),row.names=1,check.names = F) 57 | dim(tcga20_tpm) 58 | filt_20_tpm<-find_biological_replicate(tcga20_tpm) 59 | dim(filt_20_tpm) 60 | tcga20_zero<-apply(tcga20_tpm==0,2,sum) 61 | 62 | 63 | rsem<-data.frame(fread("~/Desktop/PANCAN12.IlluminaHiSeq_RNASeqV2.geneExp.tumor_whitelist"),row.names=1,check.names = F) 64 | filt_12<-find_biological_replicate(rsem) 65 | dim(filt_12)##16 samples with 2 replicates 66 | rsem_zero<-apply(rsem==0,2,sum) 67 | filt_12<-filt_12[30:nrow(filt_12),] 68 | biological_rep_12<-subset(filt_12,select=colnames(filt_12)%in%colnames(filt_20_tpm)) 69 | rownames_biological_rep_12<-gsub("[|].*","",rownames(biological_rep_12)) 70 | 71 | biological_rep_12_o<-biological_rep_12[rownames_biological_rep_12%in%rownames(biological_rep_20),order(colnames(biological_rep_12))] 72 | dim(biological_rep_12_o) 73 | 74 | #biological_rep_20<-subset(filt_20,select=colnames(filt_20)%in%colnames(filt_12)) 75 | #biological_rep_20_o<-biological_rep_20[,order(colnames(biological_rep_20))] 76 | biological_rep_20<-subset(filt_20_tpm,select=colnames(filt_20_tpm)%in%colnames(filt_12)) 77 | biological_rep_20_o<-biological_rep_20[rownames(biological_rep_20)%in%rownames_biological_rep_12,order(colnames(biological_rep_20))] 78 | dim(biological_rep_20_o) 79 | 80 | 81 | total_20<-log2(apply(biological_rep_20_o,2,sum)) 82 | total_12<-log2(apply(biological_rep_12_o,2,sum)) 83 | 84 | 85 | #plot(total_12,ylim=c(22.75,26.5),main="PANCAN12 Level 3",ylab="log2(Gene Counts)") 86 | cor_res_12=cor_res_20=NULL 87 | pdf("~/Dropbox/Bioinformatics submission/Resubmission/scatter_plot.pdf") 88 | 89 | cor_res_12=cor_res_20=NULL 90 | for(i in 1:13){ 91 | #points((i*2-1):(i*2),total_12[(i*2-1):(i*2)],col=i,lwd = 4,pch=i) 92 | plot(log2(biological_rep_12_o[,(i*2-1)]+1),log2(biological_rep_12_o[,(i*2)]+1),xlim=c(0,20),ylim=c(0,20),xlab=paste(colnames(biological_rep_12_o)[(i*2-1)],"log2(Normalized gene counts)",sep='\n'),ylab=paste(colnames(biological_rep_12_o)[(i*2)],"log2(Normalized gene counts)",sep=' ')) 93 | c=cor.test(biological_rep_12_o[,(i*2-1)],biological_rep_12_o[,(i*2)])#,method="spearman") 94 | cor_res_12=rbind(cor_res_12,c(colnames(biological_rep_12_o)[(i*2-1)],round(total_12[(i*2-1)],digits = 3),colnames(biological_rep_12_o)[(i*2)],round(total_12[(i*2)],digits = 3),round(c$estimate,digits = 3))) 95 | #title(paste(paste(strsplit(colnames(biological_rep_12_o)[(i*2)],"-")[[1]][1:3],sep="",collapse = "-")," \nrho=",round(c$estimate,digits = 3),sep="")) 96 | title(paste("TCGA Level 3 \nPearson's correlation=",round(c$estimate,digits = 2),sep="")) 97 | print(i) 98 | print(colnames(biological_rep_12_o)[i]) 99 | plot(log2(biological_rep_20_o[,(i*2-1)]+1),log2(biological_rep_20_o[,(i*2)]+1),xlim=c(0,20),ylim=c(0,20),xlab=paste(colnames(biological_rep_20_o)[(i*2-1)],"log2(TPM)",sep='\n'),ylab=paste(colnames(biological_rep_20_o)[(i*2)],"log2(TPM)",sep='')) 100 | c<-cor.test(biological_rep_20_o[,(i*2-1)],biological_rep_20_o[,(i*2)])#,method="spearman") 101 | cor_res_20=rbind(cor_res_20,c(colnames(biological_rep_20_o)[(i*2-1)],round(total_20[(i*2-1)],digits = 3),colnames(biological_rep_20_o)[(i*2)],round(total_20[(i*2)],digits=3),round(c$estimate,digits = 3))) 102 | #title(paste(paste(strsplit(colnames(biological_rep_20_o)[(i*2)],"-")[[1]][1:3],sep="",collapse = "-")," \nrho=",round(c$estimate,digits = 3),sep="")) 103 | title(paste("Rsubread TPM \nPearson's correlation=",round(c$estimate,digits = 2),sep="")) 104 | 105 | } 106 | colnames(cor_res_12)=c("Replicate_1","log2 Level 3 gene counts","Replicate_2","log2 Level 3 gene counts","Pearson's correlation between replicates(Level 3)") 107 | colnames(cor_res_20)=c("Replicate_1","log2 Rsubread gene counts","Replicate_2","log2 Rsubread gene counts","Pearson's correlation between replicates(Rsubread)") 108 | 109 | par( mfrow = c(2, 1 ) ,lwd=4) 110 | hist(as.numeric(cor_res_12[,5]),main = "TCGA Level 3 Two Replicates\n Each for 13 Samples",xlab = "Pearson's Correlation ", xlim=c(0.88,1),breaks = 5) 111 | abline(v=mean(as.numeric(cor_res_12[,5])),col="red") 112 | abline(v=median(as.numeric(cor_res_12[,5])),col="blue") 113 | hist(as.numeric(cor_res_20[,5]),main = "Rsubread Replicates Two Replicates\n Each for 13 Samples",xlab = "Pearson's Correlation ", xlim=c(0.88,1),breaks = 5) 114 | abline(v=mean(as.numeric(cor_res_20[,5])),col="red") 115 | abline(v=median(as.numeric(cor_res_20[,5])),col="blue") 116 | 117 | write.table(cbind(cor_res_12,cor_res_20),"~/Desktop/correlations.txt",sep='\t',col.names=NA,quote=1) 118 | #************************************************************************ 119 | 120 | 121 | 122 | #### 123 | ecdf_all_ex<-apply(log2(biological_rep_12_o[,c("TCGA-50-5066-01A-01R-1628-07","TCGA-50-5066-02A-11R-2090-07")]+1),2,ecdf) 124 | plot(ecdf_all_ex[[1]],xlab="log2 Level 3 reads", ylab = NA,xlim=c(0,20),col="blue",main="TCGA Level 3",ylim=c(0,1),cex.axis=1.5, cex.lab=1.5) 125 | lines(ecdf_all_ex[[2]],xlab=NA, ylab = NA,col="brown") 126 | 127 | ###using Rsubread pipeline aligned data 128 | # ecdf_all<-apply(rsub_fpkmlog,2,ecdf) 129 | # plot(ecdf_all[[1]],col="blue",main="Rsubread FPKM",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion") 130 | # for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")} 131 | # for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")} 132 | 133 | ecdf_all_ex<-apply(log2(biological_rep_20_o[,c("TCGA-50-5066-01A-01R-1628-07","TCGA-50-5066-02A-11R-2090-07")]+1),2,ecdf) 134 | plot(ecdf_all_ex[[1]],xlab="log2TPM reads", ylab = NA,xlim=c(0,20),col="blue",main="Rsubread",ylim=c(0,1),cex.axis=1.5, cex.lab=1.5,) 135 | lines(ecdf_all_ex[[2]],xlab=NA, ylab = NA,col="brown") 136 | 137 | 138 | 139 | 140 | ############zero 141 | setwd("~/Dropbox/TCGA_RNASeq_Clinical/Analysis_datasets/") 142 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) 143 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) 144 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) 145 | rsub_tpm<-log2(read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F)+1) 146 | TCGA_her2_filtered<-TCGA_her2[!duplicated(TCGA_her2$Gene),] 147 | rownames(TCGA_her2_filtered)<-TCGA_her2_filtered$Gene 148 | TCGA_her2<-subset(TCGA_her2_filtered,select=-Gene) 149 | TCGA_her2_log2<-log2(subset(TCGA_her2_filtered,select=-Gene)+1) 150 | 151 | com_genes_TCGA<-TCGA_her2[rownames(TCGA_her2)%in%rownames(rsub_tpm),] 152 | com_genes_TCGA<-com_genes_TCGA[order(rownames(com_genes_TCGA)),] 153 | com_genes_tpm<-rsub_tpm[rownames(rsub_tpm)%in%rownames(com_genes_TCGA),] 154 | com_genes_tpm<-com_genes_tpm[order(rownames(com_genes_tpm)),] 155 | zero_genes_rsem<-com_genes_TCGA[apply(com_genes_TCGA[,1:12]==0,1,mean)!=0,1:12]#atleast one zero in 12 GFP replicates 156 | sum_zero_genes_rsem<-mean(apply(zero_genes_rsem==0,1,sum))##average of how many replicates have same zero expression 157 | 158 | 159 | nrow(zero_genes_rsem) 160 | nrow(zero_genes_rsem)/nrow(com_genes_TCGA) 161 | mean(apply(zero_genes_rsem,1,mean))#228.859 if TCGA counts are used 162 | zero_genes_f<-com_genes_tpm[apply(com_genes_tpm[,1:12]==0,1,mean)!=0,1:12]##at least one zero in 12 GFP replicates 163 | sum_zero_genes_feature<-mean(apply(zero_genes_f==0,1,sum))##average of how many replicates have same zero expression 164 | 165 | nrow(zero_genes_f) 166 | nrow(zero_genes_f)/nrow(com_genes_tpm) 167 | mean(apply(zero_genes_f,1,mean))#0.55 if Rsubread counts are used. 168 | par( mfrow = c(1, 2 ) ,lwd=4) 169 | hist(apply(zero_genes_rsem[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngene counts per sample",main="TCGA Level 3",breaks=12) 170 | abline(v=median(apply(zero_genes_rsem[,1:12]==0,2,sum)),col="red",lty=2) 171 | 172 | hist(apply(zero_genes_f[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngene counts per sample",main="Rsubread TPM",breaks=2) 173 | abline(v=median(apply(zero_genes_f[,1:12]==0,2,sum)),col="red",lty=2) 174 | pro_t<-zero_sum/nrow(rsem_f) 175 | prop<-cbind(pro_t,pro_r) 176 | colnames(prop)<-c("TCGA Level 3","Rsubread TPM") 177 | write.table(prop,"~/Dropbox/Bioinformatics submission/Resubmission/zero_prop.txt",sep='\t',col.names = NA,quote=F) 178 | 179 | print(paste("Total number of nonzero rsubread but zero expressing TCGA genes:",nrow(com_genes_tpm[apply(com_genes_TCGA[,1:12]==0,1,mean)==1&apply(com_genes_tpm[,1:12]>0,1,mean)==1,]),sep=" ")) 180 | print(paste("Total number of 1-100 reads in rsubread but zero expressing TCGA genes:",nrow(com_genes_tpm[apply(com_genes_TCGA[,1:12]==0,1,mean)==1&apply(com_genes_tpm[,1:12]>0,1,mean)==1&apply(com_genes_tpm[,1:12]<=100,1,mean)==1,]),sep=" ")) 181 | print(paste("Total number of 101-1000 rsubreads but zero expressing TCGA genes:",nrow(com_genes_tpm[apply(com_genes_TCGA[,1:12]==0,1,mean)==1&apply(com_genes_tpm[,1:12]>100,1,mean)==1&apply(com_genes_tpm[,1:12]<=1000,1,mean)==1,]),sep=" ")) 182 | print(paste("Total number of 1001-10000 rsubreads but zero expressing TCGA genes:",nrow(com_genes_tpm[apply(com_genes_TCGA[,1:12]==0,1,mean)==1&apply(com_genes_tpm[,1:12]>1000,1,mean)==1&apply(com_genes_tpm[,1:12]<=10000,1,mean)==1,]),sep=" ")) 183 | print(paste("Total number of 10000+ rsubreads but zero expressing TCGA genes:",nrow(com_genes_tpm[apply(com_genes_TCGA[,1:12]==0,1,mean)==1&apply(com_genes_tpm[,1:12]>10000,1,mean)==1,]),sep=" ")) 184 | 185 | print(paste("Total number of nonzero TCGA reads but zero expressing Rsubread genes:",nrow(com_genes_TCGA[apply(com_genes_tpm[,1:12]==0,1,mean)==1&apply(com_genes_TCGA[,1:12]>0,1,mean)==1,]),sep=" ")) 186 | print(paste("Total number of 1-100 reads in TCGA but zero expressing Rsubread genes:",nrow(com_genes_TCGA[apply(com_genes_tpm[,1:12]==0,1,mean)==1&apply(com_genes_TCGA[,1:12]>0,1,mean)==1&apply(com_genes_TCGA[,1:12]<=100,1,mean)==1,]),sep=" ")) 187 | print(paste("Total number of 101-1000 TCGA but zero expressing Rsubread genes:",nrow(com_genes_TCGA[apply(com_genes_tpm[,1:12]==0,1,mean)==1&apply(com_genes_TCGA[,1:12]>100,1,mean)==1&apply(com_genes_TCGA[,1:12]<=1000,1,mean)==1,]),sep=" ")) 188 | print(paste("Total number of 1001-10000 TCGA but zero expressing Rsubread genes:",nrow(com_genes_TCGA[apply(com_genes_tpm[,1:12]==0,1,mean)==1&apply(com_genes_TCGA[,1:12]>1000,1,mean)==1&apply(com_genes_TCGA[,1:12]<=10000,1,mean)==1,]),sep=" ")) 189 | print(paste("Total number of 10000+ TCGA but zero expressing Rsubread genes:",nrow(com_genes_TCGA[apply(com_genes_tpm[,1:12]==0,1,mean)==1&apply(com_genes_TCGA[,1:12]>10000,1,mean)==1,]),sep=" ")) 190 | 191 | 192 | 193 | 194 | 195 | # feature_f<-feature[rownames(feature)%in%rownames(rsem_her2_expected_counts),] 196 | # rsem_f<-rsem_her2_expected_counts[rownames(rsem_her2_expected_counts)%in%rownames(feature),] 197 | # zero_sum_r<-apply(feature_f==0,2,sum) 198 | # pro_r<-zero_sum_r/nrow(feature_f) 199 | # pro_r<-pro_r[order(names(pro_r))] 200 | # zero_sum_tcga<-apply(rsem_f==0,2,sum)/nrow(rsem_f) 201 | # print(paste("Total number of zero expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]==0,1,mean)==1),]),sep=" ")) 202 | # print(paste("Total number of zero expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]==0,1,mean)==1),]),sep=" ")) 203 | # # print(paste("Total number of 1-100 expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]<100&feature_f[,1:17]<0,1,mean)==1),]),sep=" ")) 204 | # # print(paste("Total number of 1-100 expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]<100&rsem_f[,1:17]!=0,1,mean)==1),]),sep=" ")) 205 | # # print(paste("Total number of 1-100 expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]<100&feature_f[,1:17]<0,1,mean)==1),]),sep=" ")) 206 | # # print(paste("Total number of 1-100 expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]<100&rsem_f[,1:17]!=0,1,mean)==1),]),sep=" ")) 207 | # 208 | # dim(rsem_f) 209 | # dim(feature_f) 210 | # rsem_f_o<-rsem_f[order(rownames(rsem_f)),] 211 | # feature_f_o<-feature_f[order(rownames(feature_f)),] 212 | # head(rownames(rsem_f_o)) 213 | # head(rownames(feature_f_o)) 214 | # zero_genes_rsem<-rsem_f_o[apply(rsem_f_o[,1:17]==0,1,mean)==1&apply(feature_f_o[,1:17]==0,1,mean)==1,]#common zero expressing genes 215 | # nrow(zero_genes_rsem) 216 | # nrow(feature_f_o[apply(feature_f_o[,1:17]==0,1,mean)==1&apply(rsem_f_o[,1:17]==0,1,mean)==1,]) 217 | # zero_genes_f<-feature_f_o[apply(rsem_f_o[,1:17]==0,1,mean)!=1&apply(feature_f_o[,1:17]==0,1,mean)==1,]##gene that are zero expressing in feature counts but nonzero in TCGA 218 | # nrow(zero_genes_f) 219 | # zero_genes_r<-rsem_f_o[apply(rsem_f_o[,1:17]==0,1,mean)==1&apply(feature_f_o[,1:17]==0,1,mean)!=1,]##gene that are zero expressing in Level 3 but nonzero in feature 220 | # nrow(zero_genes_r) 221 | # ------- 222 | # zero_genes_rsem<-rsem_f_o[apply(rsem_f_o[,1:12]==0,1,mean)!=0,1:12]#atleast one zero in 12 GFP replicates 223 | # nrow(zero_genes_rsem) 224 | # nrow(zero_genes_rsem)/nrow(rsem_f_o) 225 | # mean(apply(zero_genes_rsem,1,mean)) 226 | # zero_genes_f<-feature_f_o[apply(feature_f_o[,1:12]==0,1,mean)!=0,1:12]##at least one zero in 12 GFP replicates 227 | # nrow(zero_genes_f) 228 | # nrow(zero_genes_f)/nrow(feature_f_o) 229 | # mean(apply(zero_genes_f,1,mean)) 230 | 231 | print(paste("Total number of nonzero rsubread but zero expressing TCGA genes:",nrow(feature_f_o[apply(rsem_f_o[,1:12]==0,1,mean)==1&apply(feature_f_o[,1:12]>0,1,mean)==1,]),sep=" ")) 232 | print(paste("Total number of 1-100 reads in rsubread but zero expressing TCGA genes:",nrow(feature_f_o[apply(rsem_f_o[,1:12]==0,1,mean)==1&apply(feature_f_o[,1:12]>0,1,mean)==1&apply(feature_f_o[,1:12]<=100,1,mean)==1,]),sep=" ")) 233 | print(paste("Total number of 101-1000 rsubreads but zero expressing TCGA genes:",nrow(feature_f_o[apply(rsem_f_o[,1:12]==0,1,mean)==1&apply(feature_f_o[,1:12]>100,1,mean)==1&apply(feature_f_o[,1:12]<=1000,1,mean)==1,]),sep=" ")) 234 | print(paste("Total number of 1001-10000 rsubreads but zero expressing TCGA genes:",nrow(feature_f_o[apply(rsem_f_o[,1:12]==0,1,mean)==1&apply(feature_f_o[,1:12]>1000,1,mean)==1&apply(feature_f_o[,1:12]<=10000,1,mean)==1,]),sep=" ")) 235 | print(paste("Total number of 10000+ rsubreads but zero expressing TCGA genes:",nrow(feature_f_o[apply(rsem_f_o[,1:12]==0,1,mean)==1&apply(feature_f_o[,1:12]>10000,1,mean)==1,]),sep=" ")) 236 | 237 | print(paste("Total number of nonzero TCGA reads but zero expressing Rsubread genes:",nrow(rsem_f_o[apply(feature_f_o[,1:12]==0,1,mean)==1&apply(rsem_f_o[,1:12]>0,1,mean)==1,]),sep=" ")) 238 | print(paste("Total number of 1-100 reads in TCGA but zero expressing Rsubread genes:",nrow(rsem_f_o[apply(feature_f_o[,1:12]==0,1,mean)==1&apply(rsem_f_o[,1:12]>0,1,mean)==1&apply(rsem_f_o[,1:12]<=100,1,mean)==1,]),sep=" ")) 239 | print(paste("Total number of 101-1000 TCGA but zero expressing Rsubread genes:",nrow(rsem_f_o[apply(feature_f_o[,1:12]==0,1,mean)==1&apply(rsem_f_o[,1:12]>100,1,mean)==1&apply(rsem_f_o[,1:12]<=1000,1,mean)==1,]),sep=" ")) 240 | print(paste("Total number of 1001-10000 TCGA but zero expressing Rsubread genes:",nrow(rsem_f_o[apply(feature_f_o[,1:12]==0,1,mean)==1&apply(rsem_f_o[,1:12]>1000,1,mean)==1&apply(rsem_f_o[,1:12]<=10000,1,mean)==1,]),sep=" ")) 241 | print(paste("Total number of 10000+ TCGA but zero expressing Rsubread genes:",nrow(rsem_f_o[apply(feature_f_o[,1:12]==0,1,mean)==1&apply(rsem_f_o[,1:12]>10000,1,mean)==1,]),sep=" ")) 242 | 243 | 244 | 245 | 246 | 247 | #interesting_genes<-feature_f[rownames(feature_f)%in%rownames(zero_genes),] 248 | #zero_genes_feature<-interesting_genes[apply(interesting_genes[,1:17]==0,1,mean)==1,]# 249 | par( mfrow = c(2, 1 ) ,lwd=4) 250 | hist(apply(zero_genes_rsem[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngenes counts per sample",main="TCGA",breaks=12) 251 | abline(v=median(apply(zero_genes_rsem[,1:12]==0,2,sum)),col="red",lty=2) 252 | 253 | hist(apply(zero_genes_f[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab="Total number of zero expressed \ngenes counts per sample",main="Rsubread",breaks=2) 254 | abline(v=median(apply(zero_genes_f[,1:12]==0,2,sum)),col="red",lty=2) 255 | pro_t<-zero_sum/nrow(rsem_f) 256 | prop<-cbind(pro_t,pro_r) 257 | colnames(prop)<-c("TCGA","Rsubread") 258 | write.table(prop,"~/Dropbox/Bioinformatics submission/Resubmission/zero_prop.txt",sep='\t',col.names = NA,quote=F) 259 | #########LUSC but LUAD-like analysis## 260 | class_12<-read.table("~/Desktop/TCGA_RNASeq_Clinical/Analysis_datasets/Classification_12_LUAD_LUSC_Predictions.txt", header=1,row.names=1) 261 | class_20<-read.table("~/Desktop/TCGA_RNASeq_Clinical/Analysis_datasets/Classification_20_LUAD_LUSC_Predictions.txt", header=1,row.names=1) 262 | mismatch12<-class_12[class_12[,1]!=class_12[,2],] 263 | mismatch20<-class_20[class_20[,1]!=class_20[,2],] 264 | mismatches_all<-merge(class_12[rownames(class_12)%in%rownames(mismatch12)|rownames(class_12)%in%rownames(mismatch20),],class_12[rownames(class_12)%in%rownames(mismatch12)|rownames(class_12)%in%rownames(mismatch20),],by=0) 265 | rownames(mismatches_all)<-gsub("01A-.*-07","01",mismatches_all$Row.names) 266 | mismatches_all<-mismatches_all[,2:ncol(mismatches_all)] 267 | colnames(mismatches_all)<-c("ActualClass.TCGA","PredictedClass.TCGA","LUAD_Probability.TCGA", "LUSC_Probability.TCGA", "ActualClass.Rsubread","PredictedClass.Rsubread", 268 | "LUAD_Probability.Rsubread","LUSC_Probability.Rsubread") 269 | lusc_but_luad<-read.table("~/Dropbox/TCGA_RNASeq_Clinical/Analysis_datasets/LUSC_but_LUAD_like.txt",sep='\t', header=1) 270 | discord<-merge(mismatches_all,lusc_but_luad,by.x=0,by.y=1,all.y=T) 271 | mismatches_all[gsub("01A-.*-07","01",mismatches_all$Row.names)%in%lusc_but_luad$sample,]#identifies the missclassified LUSC, but LUAD-like samples identified by 272 | lusc_but_luad[!lusc_but_luad$sample%in%gsub("01A-.*-07","01",mismatches_all$Row.names),] 273 | lusc_but_luad[lusc_but_luad$sample%in%gsub("01A-.*-07","01",mismatches_all$Row.names),] 274 | lusc_but_luad[lusc_but_luad$sample%in%gsub("01A-.*-07","01",rownames(mismatch20)),] 275 | 276 | -------------------------------------------------------------------------------- /Codes/numZero.R: -------------------------------------------------------------------------------- 1 | 2 | ##Manually download Pancan12 RNA_Seq dataset from https://www.synapse.org/#!Synapse:syn1695324 and filtered for gene symbols.Additionally, download Rsubread TPM RNA_Seq data from GEO accession number GSM1536837. 3 | pan12<-read.table("PANCAN12.IlluminaHiSeq_RNASeqV2.geneExp.tumor_whitelist", header=1,row.names=1) 4 | pan20<-read.table("GSM1536837_TCGA_20.Illumina.tumor_Rsubread_TPM.txt",header=1,row.names=1) 5 | 6 | pan12_f<-pan12[rownames(pan12)%in%rownames(pan20),colnames(pan12)%in%colnames(pan20)] 7 | pan20_f<-pan20[rownames(pan20)%in%rownames(pan12),colnames(pan20)%in%colnames(pan12)] 8 | 9 | 10 | 11 | write.table(apply((pan12_f==0),2,sum),"PANCAN12_19583_by_3380_numZeroes.txt",sep='\t',col.names=F,quote=F) 12 | write.table(apply((pan20_f==0),2,sum),"PANCAN20_19583_by_3380_numZeroes.txt",sep='\t',col.names=F,quote=F) 13 | -------------------------------------------------------------------------------- /Codes/utilities.py: -------------------------------------------------------------------------------- 1 | import glob, os, posix, sys, math, collections, json, difflib 2 | #import scipy 3 | #from scipy.stats import * 4 | from operator import itemgetter, attrgetter 5 | import itertools 6 | from random import uniform, sample 7 | #import numpy 8 | from collections import defaultdict 9 | #from fisher import * 10 | #from transcendental import stdtr 11 | 12 | def printFlush(text, outFilePath=None): 13 | print text 14 | sys.stdout.flush() 15 | 16 | if outFilePath != None: 17 | outFile = open(outFilePath, 'a') 18 | outFile.write(text + "\n") 19 | outFile.close() 20 | 21 | def printMatrix(data): 22 | for x in data: 23 | print x 24 | print "" 25 | 26 | def smartDivide(numerator, denominator): 27 | if float(denominator) == 0.0: 28 | return float('nan') 29 | 30 | return float(numerator) / float(denominator) 31 | 32 | def getProbes(probeTabFilePath): 33 | probes = [] 34 | 35 | probeTabFile = open(probeTabFilePath) 36 | headerItems = [x.lower() for x in probeTabFile.readline().rstrip().split("\t")] 37 | 38 | for line in probeTabFile: 39 | lineItems = line.rstrip().split("\t") 40 | if headerItems.count("probe set name") > 0: 41 | probeset = lineItems[headerItems.index("probe set name")] 42 | else: 43 | if headerItems.count("probe set id") > 0: 44 | probeset = lineItems[headerItems.index("probe set id")] 45 | else: 46 | print "No probe set name or probe set id column in %s" % probeTabFilePath 47 | 48 | probeX = lineItems[headerItems.index("probe x")] 49 | probeY = lineItems[headerItems.index("probe y")] 50 | probe = probeset + "#" + probeX + "_" + probeY 51 | probes.append(probe) 52 | 53 | return probes 54 | 55 | def getProbesetProbesDict(probes): 56 | probesetProbesDict = {} 57 | 58 | for probe in probes: 59 | probeset = probe[:probe.find("#")] 60 | probesetProbesDict[probeset] = probesetProbesDict.setdefault(probeset, []) + [probe] 61 | 62 | return probesetProbesDict 63 | 64 | def getPatientIDs(normDirPath, normFileSuffix): 65 | ids = [] 66 | 67 | #print normDirPath + "*" + normFileSuffix 68 | #sys.exit(0) 69 | for filePath in glob.glob(normDirPath + "*" + normFileSuffix): 70 | ids.append(filePath.replace(normDirPath, "").replace(normFileSuffix, "")) 71 | 72 | ids.sort() 73 | return ids 74 | 75 | def readScalarFromFile(filePath): 76 | return readMatrixFromFile(filePath)[0][0] 77 | 78 | def writeScalarToFile(x, filePath): 79 | outFile = open(filePath, 'w') 80 | outFile.write(x) 81 | outFile.close() 82 | 83 | def readVectorFromFile(filePath): 84 | return [line.rstrip() for line in file(filePath)] 85 | 86 | def writeVectorToFile(data, filePath): 87 | outFile = open(filePath, 'w') 88 | for x in data: 89 | outFile.write(str(x) + "\n") 90 | outFile.close() 91 | 92 | def readMatrixFromFile(filePath, numLines=None): 93 | matrix = [] 94 | for line in file(filePath): 95 | if numLines != None and len(matrix) >= numLines: 96 | break 97 | 98 | matrix.append(line.rstrip().split("\t")) 99 | 100 | if len(matrix) % 100000 == 0: 101 | print len(matrix) 102 | 103 | return matrix 104 | 105 | def writeMatrixToFile(x, filePath, writeMode='w'): 106 | outFile = open(filePath, writeMode) 107 | writeMatrixToOpenFile(x, outFile) 108 | outFile.close() 109 | 110 | def writeMatrixToOpenFile(x, outFile): 111 | for y in x: 112 | outFile.write("\t".join([str(z) for z in y]) + "\n") 113 | 114 | def appendMatrixToFile(x, filePath): 115 | writeMatrixToFile(x, filePath, writeMode='a') 116 | 117 | def readTextFromFile(filePath): 118 | text = "" 119 | 120 | for line in file(filePath): 121 | text += line 122 | 123 | return text 124 | 125 | def writeDictToFile(dictionary, filePath): 126 | writeScalarToFile(json.dumps(dictionary), filePath) 127 | 128 | def readDictFromFile(filePath): 129 | txt = readTextFromFile(filePath) 130 | dictionary = json.loads(txt) 131 | 132 | dictionary2 = {} 133 | 134 | for key in dictionary: 135 | value = dictionary[key] 136 | 137 | if isNumeric(key): 138 | key = int(key) 139 | 140 | dictionary2[key] = value 141 | 142 | return dictionary2 143 | 144 | def calculateMean(values): 145 | if len(values) == 0: 146 | return float('nan') 147 | 148 | return sum(values) / len(values) 149 | 150 | def calculateVarianceMean(values): 151 | mu = calculateMean(values) 152 | diffValues = [(x - mu)**2 for x in values] 153 | return calculateMean(diffValues) / (len(diffValues) - 1) 154 | 155 | def calculateWeightedMean(values, weights): 156 | if len(values) != len(weights): 157 | print "When calculating a weighted mean, the values must be the same length as the weights." 158 | raise 159 | 160 | def calculateStandardDeviation(values): 161 | xbar = calculateMean(values) 162 | residuals = [x - xbar for x in values] 163 | residualsSquared = [x**2 for x in residuals] 164 | return math.sqrt(sum(residualsSquared) / (len(values) - 1)) 165 | 166 | def calculateZscore(x): 167 | mean = calculateMean(x) 168 | standardDeviation = calculateStandardDeviation(x) 169 | return [(y - mean) / standardDeviation for y in x] 170 | 171 | def calculateTrimmedMean(values, trimProportion=0.10): 172 | if values == None or len(values) == 0: 173 | return None 174 | 175 | values = sorted([float(x) for x in values]) 176 | 177 | if len(values) < 3: 178 | return calculateMean(values) 179 | elif len(values) == 3: 180 | return values[1] 181 | elif len(values) == 4: 182 | return calculateMean(values[1:3]) 183 | elif len(values) == 5: 184 | return calculateMean(values[1:4]) 185 | 186 | values = scipy.stats.trimboth(values, trimProportion) 187 | 188 | return float(calculateMean(values)) 189 | 190 | def calculateEuclideanDistance(xList, yList): 191 | zSum = 0.0 192 | 193 | for i in range(len(xList)): 194 | x = xList[i] 195 | y = yList[i] 196 | z = math.pow(x - y, 2) 197 | zSum += z 198 | 199 | return math.sqrt(zSum) 200 | 201 | def calculateCorrelationCoefficient(xList, yList): 202 | return numpy.corrcoef(xList, yList)[0,1] 203 | 204 | def calculatePearsonCoefficient(xList, yList): 205 | return stats.pearsonr(xList, yList)[0] 206 | 207 | def calculateSpearmanCoefficient(xList, yList): 208 | return stats.spearmanr(xList, yList)[0] 209 | 210 | def calculateTTest(xList, yList): 211 | xList = numpy.array([x for x in xList if not math.isnan(x)]) 212 | yList = numpy.array([y for y in yList if not math.isnan(y)]) 213 | 214 | if len(xList) == 1 and len(yList) > 1: 215 | return calculateOneSampleTTest(xList[0], yList) 216 | if len(xList) > 1 and len(yList) == 1: 217 | return calculateOneSampleTTest(yList[0], xList) 218 | 219 | return ttest_ind(xList, yList, 0)[1] 220 | 221 | # From http://stackoverflow.com/questions/10038543/tracking-down-the-assumptions-made-by-scipys-ttest-ind-function 222 | def calculateWelchTTest(pop1, pop2): 223 | num1 = numpy.array(pop1).shape[0] 224 | num2 = numpy.array(pop2).shape[0] 225 | 226 | t_stat = (numpy.mean(pop1) - numpy.mean(pop2))/numpy.sqrt( numpy.var(pop1)/num1 + numpy.var(pop2)/num2) 227 | df = ((numpy.var(pop1)/num1 + numpy.var(pop2)/num2)**(2.0)) / ((numpy.var(pop1)/num1)**(2.0)/(num1-1) + (numpy.var(pop2)/num2) ** (2.0) / (num2-1)) 228 | 229 | #one_tailed_p_value = 1.0 - scipy.stats.t.cdf(t_stat,df) 230 | two_tailed_p_value = 1.0 - (scipy.stats.t.cdf(numpy.abs(t_stat),df) - scipy.stats.t.cdf(-numpy.abs(t_stat), df)) 231 | 232 | return two_tailed_p_value 233 | 234 | def calculateOneSampleTTest(x, yList): 235 | return stats.ttest_1samp(yList, x)[1] 236 | 237 | def isValueAberrant(x, yList, numStandardDeviations): 238 | std = calculateStandardDeviation(yList) 239 | lowerLimit = calculateMean(yList) - float(numStandardDeviations) * std 240 | upperLimit = calculateMean(yList) + float(numStandardDeviations) * std 241 | 242 | return x < lowerLimit or x > upperLimit 243 | 244 | def calculateMedian(values): 245 | sortedValues = sorted(values) 246 | 247 | if len(sortedValues) % 2 == 1: 248 | return sortedValues[(len(sortedValues)+1)/2-1] 249 | else: 250 | lower = sortedValues[len(sortedValues)/2-1] 251 | upper = sortedValues[len(sortedValues)/2] 252 | return (float(lower + upper)) / 2 253 | 254 | def calculateFoldChange(values1, values2): 255 | overallMin = min(min(values1), min(values2)) 256 | 257 | values1 = [x - overallMin + 1 for x in values1] 258 | values2 = [x - overallMin + 1 for x in values2] 259 | 260 | mean1 = calculateMean(values1) 261 | mean2 = calculateMean(values2) 262 | 263 | return mean1 / mean2 264 | 265 | def calculateAbsoluteFoldChange(values1, values2): 266 | overallMin = min(min(values1), min(values2)) 267 | 268 | values1 = [x - overallMin + 1 for x in values1] 269 | values2 = [x - overallMin + 1 for x in values2] 270 | 271 | mean1 = calculateMean(values1) 272 | mean2 = calculateMean(values2) 273 | 274 | ratioA = mean1 / mean2 275 | ratioB = mean2 / mean1 276 | 277 | return min(ratioA, ratioB) 278 | 279 | def getNormalizedProbes(normFilePath): 280 | print "Getting normalized probes" 281 | return [line.split(" ")[0] for line in file(normFilePath)] 282 | 283 | def getKeyProbeDict(filePath, probesToKeep=None, minProbesPerKey=1): 284 | probesToKeepSet = set(probesToKeep) 285 | keyProbeDict = {} 286 | 287 | for line in file(filePath): 288 | lineItems = line.rstrip().split("\t") 289 | key = lineItems[0] 290 | 291 | if len(lineItems) > 1: 292 | fileProbes = [x for x in lineItems[1].split(",") if x != ""] 293 | 294 | if len(fileProbes) >= 0: 295 | keyProbeDict[key] = keyProbeDict.setdefault(key, []) + fileProbes 296 | 297 | return keyProbeDict 298 | 299 | def getTranscriptProbeDict(filePath, normFilePath): 300 | normalizedProbes = set(getNormalizedProbes(normFilePath)) 301 | 302 | print "Getting transcript-probe dictionary" 303 | transcriptProbeDict = {} 304 | for line in file(filePath): 305 | lineItems = line.rstrip().split("\t") 306 | transcript = lineItems[0] 307 | probes = lineItems[1].split(",") 308 | probes = list(set(probes) & normalizedProbes) 309 | 310 | transcriptProbeDict[transcript] = probes 311 | 312 | return transcriptProbeDict 313 | 314 | def getPatientsKeyValuesDict(sourceDir, patientIDs, fileSuffix, dataValueIndex, keys=None): 315 | patientsKeyValuesDict = collections.defaultdict(dict) 316 | 317 | if len(patientIDs) == 0: 318 | return patientsKeyValuesDict 319 | 320 | keyLineIndicesDict = {} 321 | lineCount = 0 322 | 323 | for line in file(sourceDir + patientIDs[0] + fileSuffix): 324 | key = line.rstrip().split("\t")[0] 325 | keyLineIndicesDict[key] = lineCount 326 | 327 | lineCount += 1 328 | #if lineCount % 100000 == 0: 329 | # print "Parsing file line indices: %i" % lineCount 330 | 331 | #print "Creating key line indices list from dict" 332 | if keys == None: 333 | keyLineIndices = [(key, keyLineIndicesDict[key]) for key in keyLineIndicesDict.keys()] 334 | else: 335 | keyLineIndices = [(key, keyLineIndicesDict[key]) for key in keys if key in keyLineIndicesDict.keys()] 336 | 337 | #print "Sorting key line indices" 338 | keyLineIndices.sort(key=itemgetter(1)) 339 | 340 | patientFileHandles = {} 341 | for patientID in patientIDs: 342 | patientFileHandles[patientID] = open(checkDirPath(sourceDir) + patientID + fileSuffix) 343 | 344 | for patientID in patientIDs: 345 | #print patientID 346 | patientFile = open(checkDirPath(sourceDir) + patientID + fileSuffix) 347 | 348 | previousLineIndex = 0 349 | for keyLineIndex in keyLineIndices: 350 | for i in range(previousLineIndex, keyLineIndex[1]): 351 | patientFile.readline() 352 | previousLineIndex = keyLineIndex[1] + 1 353 | 354 | lineItems = patientFile.readline().rstrip().split("\t") 355 | patientsKeyValuesDict[patientID][lineItems[0]] = lineItems[dataValueIndex] 356 | 357 | patientFile.close() 358 | 359 | return patientsKeyValuesDict 360 | 361 | def getPatientKeyValuesDict(filePath, dataColumnIndex, probes=None): 362 | probeValues = {} 363 | 364 | for line in file(filePath): 365 | lineItems = line.rstrip().split("\t") 366 | probe = lineItems[0] 367 | value = lineItems[dataColumnIndex] 368 | 369 | probeValues[probe] = value 370 | 371 | if not probes: 372 | return probeValues 373 | else: 374 | modProbeValues = {} 375 | for probe in probes: 376 | modProbeValues[probe] = probeValues[probe] 377 | return modProbeValues 378 | 379 | def savePatientKeyValuesDict(patientDict, outFilePath): 380 | outFile = open(outFilePath, 'w') 381 | 382 | for key in sorted(patientDict.keys()): 383 | outFile.write("%s\t%s\n" % (key, patientDict[key])) 384 | 385 | outFile.close() 386 | 387 | def checkDirPath(dirPath): 388 | if not os.path.exists(dirPath): 389 | posix.mkdir(dirPath) 390 | 391 | if not dirPath.endswith("/"): 392 | dirPath = dirPath + "/" 393 | 394 | return dirPath 395 | 396 | def lastIndexOf(theList, value): 397 | return len(theList) - 1 - theList[::-1].index(value) 398 | 399 | def getTranscriptGeneDict(filePath): 400 | transcriptGeneDict = {} 401 | 402 | for line in file(filePath): 403 | lineItems = line.rstrip().split("\t") 404 | transcript = lineItems[0] 405 | 406 | gene = lineItems[1] 407 | if len(lineItems) == 3: 408 | gene = lineItems[2] 409 | 410 | transcriptGeneDict[transcript] = gene 411 | 412 | return transcriptGeneDict 413 | 414 | def getGeneTranscriptDict(filePath): 415 | geneTranscriptDict = {} 416 | 417 | for line in file(filePath): 418 | lineItems = line.rstrip().split("\t") 419 | transcript = lineItems[0] 420 | 421 | gene = lineItems[1] 422 | if len(lineItems) == 3: 423 | gene = lineItems[2] 424 | 425 | geneTranscriptDict[gene] = geneTranscriptDict.setdefault(gene, []) + [transcript] 426 | 427 | return geneTranscriptDict 428 | 429 | def transposeMatrix(x): 430 | transposed = zip(*x) 431 | 432 | for i in range(len(transposed)): 433 | transposed[i] = list(transposed[i]) 434 | 435 | return transposed 436 | 437 | # Copied from: http://code.activestate.com/recipes/491268-ordering-and-ranking-for-lists/ 438 | def order(x, NoneIsLast = True, decreasing = False): 439 | """ 440 | Returns the ordering of the elements of x. The list 441 | [ x[j] for j in order(x) ] is a sorted version of x. 442 | 443 | Missing values in x are indicated by None. If NoneIsLast is true, 444 | then missing values are ordered to be at the end. 445 | Otherwise, they are ordered at the beginning. 446 | """ 447 | omitNone = False 448 | if NoneIsLast == None: 449 | NoneIsLast = True 450 | omitNone = True 451 | 452 | n = len(x) 453 | ix = range(n) 454 | if None not in x: 455 | ix.sort(reverse = decreasing, key = lambda j : x[j]) 456 | else: 457 | # Handle None values properly. 458 | def key(i, x = x): 459 | elem = x[i] 460 | # Valid values are True or False only. 461 | if decreasing == NoneIsLast: 462 | return not(elem is None), elem 463 | else: 464 | return elem is None, elem 465 | ix = range(n) 466 | ix.sort(key=key, reverse=decreasing) 467 | 468 | if omitNone: 469 | n = len(x) 470 | for i in range(n-1, -1, -1): 471 | if x[ix[i]] == None: 472 | n -= 1 473 | return ix[:n] 474 | return ix 475 | 476 | # Copied from: http://code.activestate.com/recipes/491268-ordering-and-ranking-for-lists/ 477 | def rankSmart(x, NoneIsLast=True, decreasing = False, ties = "first"): 478 | """ 479 | Returns the ranking of the elements of x. The position of the first 480 | element in the original vector is rank[0] in the sorted vector. 481 | 482 | Missing values are indicated by None. Calls the order() function. 483 | Ties are NOT averaged by default. Choices are: 484 | "first" "average" "min" "max" "random" "average" 485 | """ 486 | omitNone = False 487 | if NoneIsLast == None: 488 | NoneIsLast = True 489 | omitNone = True 490 | O = order(x, NoneIsLast = NoneIsLast, decreasing = decreasing) 491 | R = O[:] 492 | n = len(O) 493 | for i in range(n): 494 | R[O[i]] = i 495 | if ties == "first" or ties not in ["first", "average", "min", "max", "random"]: 496 | return R 497 | 498 | blocks = [] 499 | isnewblock = True 500 | newblock = [] 501 | for i in range(1,n) : 502 | if x[O[i]] == x[O[i-1]]: 503 | if i-1 not in newblock: 504 | newblock.append(i-1) 505 | newblock.append(i) 506 | else: 507 | if len(newblock) > 0: 508 | blocks.append(newblock) 509 | newblock = [] 510 | if len(newblock) > 0: 511 | blocks.append(newblock) 512 | 513 | for i, block in enumerate(blocks): 514 | # Don't process blocks of None values. 515 | if x[O[block[0]]] == None: 516 | continue 517 | if ties == "average": 518 | s = 0.0 519 | for j in block: 520 | s += j 521 | s /= float(len(block)) 522 | for j in block: 523 | R[O[j]] = s 524 | elif ties == "min": 525 | s = min(block) 526 | for j in block: 527 | R[O[j]] = s 528 | elif ties == "max": 529 | s =max(block) 530 | for j in block: 531 | R[O[j]] = s 532 | elif ties == "random": 533 | s = sample([O[i] for i in block], len(block)) 534 | for i,j in enumerate(block): 535 | R[O[j]] = s[i] 536 | else: 537 | for i,j in enumerate(block): 538 | R[O[j]] = j 539 | if omitNone: 540 | R = [ R[j] for j in range(n) if x[j] != None] 541 | return R 542 | 543 | # The following function came from http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python 544 | def rank2(a): 545 | n = len(a) 546 | ivec=rank_simple(a) 547 | svec=[a[rank] for rank in ivec] 548 | sumranks = 0 549 | dupcount = 0 550 | newarray = [0]*n 551 | for i in xrange(n): 552 | sumranks += i 553 | dupcount += 1 554 | if i==n-1 or svec[i] != svec[i+1]: 555 | averank = sumranks / float(dupcount) + 1 556 | for j in xrange(i-dupcount+1,i+1): 557 | newarray[ivec[j]] = averank 558 | sumranks = 0 559 | dupcount = 0 560 | 561 | return newarray 562 | 563 | def globFilesSortedByModTime(pattern): 564 | def getModifiedTime(filename): 565 | return os.stat(filename).st_mtime 566 | 567 | return sorted(glob.glob(pattern), key=getModifiedTime) 568 | 569 | ## From http://stackoverflow.com/questions/34518/natural-sorting-algorithm 570 | def naturalSort(x, reverse=False): 571 | def natural_key(s): 572 | return tuple( 573 | int(''.join(chars)) if isdigit else ''.join(chars) 574 | for isdigit, chars in itertools.groupby(s, str.isdigit) 575 | ) 576 | 577 | return sorted(x, key=natural_key, reverse=reverse) 578 | 579 | def getItemFrequencyMap(x): 580 | d = defaultdict(int) 581 | for item in x: 582 | d[item] += 1 583 | 584 | return d 585 | 586 | from math import modf, floor 587 | 588 | def quantile(x, q, qtype = 7, issorted = False): 589 | """ 590 | Args: 591 | x - input data 592 | q - quantile 593 | qtype - algorithm 594 | issorted- True if x already sorted. 595 | 596 | Compute quantiles from input array x given q.For median, 597 | specify q=0.5. 598 | 599 | References: 600 | http://reference.wolfram.com/mathematica/ref/Quantile.html 601 | http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile 602 | 603 | Author: 604 | Ernesto P.Adorio Ph.D. 605 | UP Extension Program in Pampanga, Clark Field. 606 | """ 607 | if not issorted: 608 | y = sorted(x) 609 | else: 610 | y = x 611 | if not (1 <= qtype <= 9): 612 | return None # error! 613 | 614 | # Parameters for the Hyndman and Fan algorithm 615 | abcd = [(0, 0, 1, 0), # inverse empirical distrib.function., R type 1 616 | (0.5, 0, 1, 0), # similar to type 1, averaged, R type 2 617 | (0.5, 0, 0, 0), # nearest order statistic,(SAS) R type 3 618 | 619 | (0, 0, 0, 1), # California linear interpolation, R type 4 620 | (0.5, 0, 0, 1), # hydrologists method, R type 5 621 | (0, 1, 0, 1), # mean-based estimate(Weibull method), (SPSS,Minitab), type 6 622 | (1, -1, 0, 1), # mode-based method,(S, S-Plus), R type 7 623 | (1.0/3, 1.0/3, 0, 1), # median-unbiased , R type 8 624 | (3/8.0, 0.25, 0, 1) # normal-unbiased, R type 9. 625 | ] 626 | 627 | a, b, c, d = abcd[qtype-1] 628 | n = len(x) 629 | g, j = modf( a + (n+b) * q -1) 630 | if j < 0: 631 | return y[0] 632 | elif j >= n: 633 | return y[n-1] # oct. 8, 2010 y[n]???!! uncaught off by 1 error!!! 634 | 635 | j = int(floor(j)) 636 | if g == 0: 637 | return y[j] 638 | else: 639 | return y[j] + (y[j+1]- y[j])* (c + d * g) 640 | 641 | def calculateInterquartileRange(x): 642 | firstQ = quantile(x, 0.25) 643 | thirdQ = quantile(x, 0.75) 644 | 645 | return thirdQ - firstQ 646 | 647 | def isNumeric(x): 648 | return str(x).replace(".", "").replace("-", "").isdigit() 649 | 650 | def getUniqueMatrixColumnValues(filePath, columnIndex): 651 | uniqueValues = set() 652 | 653 | for line in file(filePath): 654 | uniqueValues.add(line.rstrip().split("\t")[columnIndex]) 655 | 656 | return sorted(list(uniqueValues)) 657 | 658 | def fisherExactTest(x): 659 | return FishersExactTest.probability_of_table(x) 660 | 661 | def complementGenomicSequence(sequence): 662 | mod = "" 663 | 664 | for base in sequence: 665 | mod += complementGenomicBase(base) 666 | 667 | return mod 668 | 669 | def complementGenomicBase(base): 670 | base = base.upper() 671 | 672 | if base == "A": 673 | return "T" 674 | if base == "T": 675 | return "A" 676 | if base == "C": 677 | return "G" 678 | return "C" 679 | 680 | def reverseComplementGenomicSequence(dnaSequence): 681 | return reverseString(complementGenomicSequence(dnaSequence)) 682 | 683 | def reverseString(string): 684 | return string[::-1] 685 | 686 | def getDictValue(dictionary, key, default=""): 687 | if key in dictionary: 688 | return dictionary[key] 689 | return default 690 | 691 | def getDiffPositions(string1, string2): 692 | matcher = difflib.SequenceMatcher(a=string1, b=string2) 693 | blocks = matcher.get_matching_blocks() 694 | 695 | diffPositions = [] 696 | for block in blocks: 697 | if block[2] == 0 or block[2] == len(string1): 698 | continue 699 | 700 | if len(diffPositions) == 0: 701 | diffPositions.append(block[2]) 702 | else: 703 | diffPositions.append(block[2] + diffPositions[-1]) 704 | 705 | return diffPositions 706 | 707 | def getSimilarityPercent(string1, string2): 708 | blocks = difflib.SequenceMatcher(None, a=string1, b=string2).get_matching_blocks() 709 | 710 | totalMatching = 0.0 711 | for block in blocks: 712 | totalMatching += block[2] 713 | 714 | return (totalMatching / float(len(string1))) * 100.0 715 | 716 | def getLineItems(line, separator="\t"): 717 | return line.rstrip().split(separator) 718 | 719 | def sortMatrix(data, columnIndex, reverse=False): 720 | data.sort(key=itemgetter(columnIndex), reverse=reverse) 721 | return data 722 | 723 | def uniqueSort(values): 724 | # Slow but keeps values in order and uniquifies 725 | out = [] 726 | 727 | for value in values: 728 | if value not in out: 729 | out.append(value) 730 | 731 | return out 732 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 mumtahena 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This repository includes code for processing RNA-Seq FASTQ files and clinical data from The Cancer Genome Atlas. In addition, we have included the code used for analyzing data in our manuscript, "Alternative preprocessing of RNA-Sequencing data in The Cancer Genome Atlas leads to improved analysis results" (Rahman, Mumtahena, et al. _Bioinformatics_ [2015:10.1093/bioinformatics/btv377](http://bioinformatics.oxfordjournals.org/content/early/2015/08/14/bioinformatics.btv377.full). 2 | 3 | ## What is this repository for? 4 | 5 | * We used the 'Rsubread' R package to align and summarize reads at the gene level for 9264 tumor and 741 normal TCGA RNA-Seq samples. The R scripts we provide here can also be used to process samples that did not come from TCGA. We have also included the code for compiling clinical data available for these tumors into a matrix format and matching the clinical IDs with the RNA-Seq IDs. 6 | * We have provided the code and various intermediate data files that we produced in performing the analyses we describe in the manuscript. 7 | 8 | ## How to normalize raw RNA-Seq data and process clinical data from TCGA 9 | 10 | This pipeline is designed to be executed on Unix-based systems. Most of the code is written in the R programming language. But it also requires "bash" scripts to be executed at the command line. 11 | 12 | 1. Install the [R statistical package](http://r-project.org). We used version 3.1.0. 13 | 14 | 2. Install the following R packages, which can be obtained using either the ```install.packages``` function in R or via the [Bioconductor framework](http://www.bioconductor.org): 15 | * Rsubread 16 | * limma 17 | * edgeR 18 | * tools 19 | 20 | 3. Clone this git repository to your local computer. 21 | 22 | 4. Via [dbGAP](http://www.ncbi.nlm.nih.gov/gap), obtain access to the raw TCGA data. Then obtain a private key that allows you download raw data via the [Cancer Genomics Hub](https://cghub.ucsc.edu/access/get_access.html). Store this key file as ```cghub.key``` in the current directory. 23 | 24 | 5. In the ```Genome``` directory, store the reference genome file and GTF file that can be obtained from [here](http://support.illumina.com/sequencing/sequencing_software/igenome.html). We used version hg19. After extracting these files, you will find the reference genome in Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa and the GTF file in Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2012-03-09-03-24-41/Genes/genes.gtf. Move these directly to the local Genome directory. **Update [08/04/2020]: These files are no longer available. You can find a copy of them [here](https://osf.io/cqkfp/). You will need to decompress the files using the `gunzip` utility. However, if you are going to run this pipeline now, you might consider using a newer version of the human reference genome.** 25 | 26 | 6. Execute Scripts/process_tcga_rsubread at the command line to begin downloading and normalizing samples. 27 | 28 | All the RNA-Seq and clinical data files that we have processed are available from Gene Expression Omnibus (accession numbers: GSE62820 and GSE62944). 29 | 30 | For informational purposes, we have also provided a bash script (Scripts/process_tcga_level_3) that contains the steps for producing "Level 3" values using the same steps that are performed by the TCGA consortium. These steps are described in more detail here: https://cghub.ucsc.edu/docs/tcga/UNC_mRNAseq_summary.pdf. 31 | 32 | ### Process clinical data 33 | 34 | 1. Install R package 'plyr' using the ```install.packages``` function in R. 35 | 36 | 2. Download the Clinical data for individual cancer type from [TCGA Data Portal] (https://tcga-data.nci.nih.gov/tcga/dataAccessMatrix.htm) in Biotab format. 37 | 38 | 3. Download [GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt.gz](http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE62944&format=file&file=GSE62944%5F06%5F01%5F15%5FTCGA%5F24%5FCancerType%5FSamples%2Etxt%2Egz) from GEO (Accession number GSM1536837) and save the unzipped file to 'Datasets' folder. 39 | 40 | 4. Set directory to where all the clinical data folders for each cancer type is located. 41 | 42 | 5. Run the R script at Codes/ProcessClinicalData.R. 43 | 44 | ## How to reanalyze our findings 45 | 46 | We also provide an R Markdown file (Analysis/TCGA_24_manuscript_analysis.Rmd) that contains the analysis code that we used for our manuscript. If you desire to reexecute this analysis, please complete the following steps: 47 | 48 | 1. Install the [R statistical package](http://r-project.org). We used version 3.1.0. 49 | 50 | 2. Install the following R packages, which can be obtained using either the ```install.packages``` function in R or via the [Bioconductor framework](http://www.bioconductor.org): 51 | * stats 52 | * ROCR 53 | * pROC 54 | * caret 55 | * knitr 56 | * data.table 57 | * heatmap3 58 | * RColorBrewer 59 | 60 | 3. We used the [BinReg 2](http://www.biomedcentral.com/1471-2105/12/443) algorithm to make HER2 signature predictions on TCGA breast cancer samples. BinReg 2 runs on the MatLab platform. We used our HER2 signature datasets as training samples and the TCGA breast cancer datasets as test samples. We used the following parameters: 200 genes, 2 metagenes, quantile normalization (-g 200 -m 2 -q) to minimize the batch effects between training and test samples. The original outputs from BinReg2 are located within the ```Analysis_datasets/10_14_predictions_raw``` directory. Rerun of the HER2 pathway prediction excluding the two less consistent HER2 training samples is located at ``Analysis_datasets/5_01_predictions_raw``` .These output predictions are summarized in the Analysis_datasets directory folder for further evaluation. 61 | 62 | 4. The code we used to classify TCGA lung adenocarcinoma and squamous carcinoma samples is in Code/Classify_luad_vs_lusc.R. The outputs of this analysis are located in the ```Analysis_datasets``` directory. The bash script describing additional analysis to identify discordant LUAD samples and differentially expressed gene is located at Code/LUSC_LUAD_discordant_analysis. 63 | 64 | 5. Use the ```knitr``` package to compile Analysis/TCGA_24_manuscript_analysis.Rmd. (It is convenient to complete this step within the [RStudio environment](http://www.rstudio.com/).) Also be sure to set the working directory to ```Analysis_datasets```. Our results are stored in the TCGA_24_manuscript_analysis.html file. 65 | 66 | 6. Our analysis datasets and outputs are available [here] (https://www.dropbox.com/sh/4e0c8u7jke694tu/AADEQnB5LbCWihb3A5f04O9va?dl=0). 67 | 68 | ## Contact information 69 | 70 | * Mumtahena Rahman. [moom.rahman@utah.edu](mailto:moom.rahman@utah.edu) 71 | * Stephen R Piccolo. [https://piccolo.byu.edu](https://piccolo.byu.edu) 72 | -------------------------------------------------------------------------------- /Scripts/LUSC_LUAD_discordant_analysis: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | Rscript --vanilla --max-ppsize=500000 ../Codes/LUSC_vs_LUAD.R 5 | 6 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt obs LUAD ../Analysis_datasets/TCGA_AllGenes_ROC.pdf "TCGA Level 3 - All Genes" 7 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_AllGenes_ROC.pdf "RSubread - All Genes" 8 | 9 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt obs LUAD ../Analysis_datasets/TCGA_CommonGenes_ROC.pdf "TCGA Level 3 - Common Genes" 10 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_CommonGenes_ROC.pdf "RSubread - Common Genes" 11 | 12 | Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_NonOverlappingGenes_Predictions.txt obs ../Analysis_datasets/LUAD RSubread_NonOverlappingGenes_ROC.pdf "RSubread - Non-Overlapping Genes" 13 | 14 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt 15 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt 16 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt 17 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt 18 | Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_NonOverlappingGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt 19 | 20 | Rscript --vanilla ../Codes/IdentifyInconsistentPredictions.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs pred 21 | Rscript --vanilla ../Codes/IdentifyInconsistentPredictions.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs pred 22 | 23 | Rscript --vanilla ../Codes/PlotDiscordant.R 24 | -------------------------------------------------------------------------------- /Scripts/normalize_tcga_rsubread: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | 5 | sampleIDFile=$1 6 | 7 | tcgaID=$(basename $sampleIDFile) 8 | analysisID=$(cat $sampleIDFile) 9 | 10 | currentDir=$(pwd) 11 | fastqDir=$currentDir/Temp/FASTQ 12 | outFpkmDir=$currentDir/FPKM 13 | outTpmDir=$currentDir/TPM 14 | outFeatureCountsDir=$currentDir/FeatureCounts 15 | outStatsDir=$currentDir/Stats 16 | inProgressFile=$currentDir/InProgress/$tcgaID 17 | 18 | rm -fv $inProgressFile 19 | touch $inProgressFile 20 | 21 | function cleanup { 22 | rm -rfv $fastqDir/${analysisID}* 23 | rm -rfv $fastqDir/${tcgaID}* 24 | rm -fv $inProgressFile 25 | } 26 | 27 | trap 'cleanup' TERM INT EXIT 28 | 29 | mkdir -pv $fastqDir/$tcgaID $outFpkmDir $outFpkmLogDir $outTpmDir $outTpmLogDir $outFeatureCountsDir $outStatsDir 30 | 31 | echo Downloading $tcgaID 32 | mkdir -p $currentDir/XmlFiles 33 | cgquery -o $currentDir/XmlFiles/$tcgaID.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&analysis_id=${analysisID}" 34 | gtdownload -vv -d $currentDir/XmlFiles/$tcgaID.xml -c $currentDir/cghub.key --max-children 1 -p $fastqDir 35 | 36 | echo Rename and extract files $tcgaID 37 | if [ -f $fastqDir/$analysisID/*.tar.gz ] 38 | then 39 | mv -v $fastqDir/$analysisID/*.tar.gz $fastqDir/$tcgaID.tar.gz 40 | tar -zxvf $fastqDir/$tcgaID.tar.gz -C $fastqDir/$tcgaID 41 | rm -fv $fastqDir/$tcgaID.tar.gz 42 | else 43 | mv -v $fastqDir/$analysisID/*.tar $fastqDir/$tcgaID.tar 44 | tar -xvf $fastqDir/$tcgaID.tar -C $fastqDir/$tcgaID 45 | rm -fv $fastqDir/$tcgaID.tar 46 | fi 47 | 48 | fastqFileNamesFile=$fastqDir/$tcgaID/FASTQFiles 49 | for f in $fastqDir/$tcgaID/*fastq* NULL 50 | do 51 | echo $f >> $fastqFileNamesFile 52 | done 53 | 54 | fastqFilePath1=$(head -n 1 $fastqFileNamesFile) 55 | fastqFilePath2=$(head -n 2 $fastqFileNamesFile | tail -n 1) 56 | 57 | Rscript --vanilla $currentDir/Codes/ProcessRnaSeqFeatureCounts.R $currentDir/Genome/genome.fa $fastqFilePath1 $fastqFilePath2 $currentDir/Genome/genes.gtf $fastqDir/$tcgaID $outFpkmDir/$tcgaID $outTpmDir/$tcgaID $outFeatureCountsDir/$tcgaID $outStatsDir/$tcgaID 58 | 59 | rm -fv $currentDir/XmlFiles/$tcgaID.xml 60 | 61 | -------------------------------------------------------------------------------- /Scripts/process_tcga_level_3: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | fastqFile=$1 4 | 5 | softwareDir=Software/TCGA_RNA_Seq_Pipeline 6 | samTools=$softwareDir/rsem-1.1.13/sam/samtools 7 | bwaDir=$softwareDir/MapSplice_multithreads_12_07/bowtie-0.12.7_fusion 8 | mapspliceDir=$softwareDir/MapSplice_multithreads_12_07/bin 9 | picardDir=$softwareDir/picard-tools-1.82 10 | ubu=$softwareDir/ubu-1.2-jar-with-dependencies.jar 11 | rsemDir=$softwareDir/rsem-1.2.12 12 | bedToolsDir=$softwareDir/bedtools-2.17.0/bin 13 | referenceGenomeRef=Genomes/hg19_M_rCRS_ref 14 | referenceGenomeFile=Genomes/hg19_M_rCRS.fa 15 | referenceGenomeIndexFile=Genomes/hg19_M_rCRS/chromosomes 16 | referenceChromosomesDir=Genomes/hg19_M_rCRS/ebwt 17 | referenceBedFile=Genomes/unc_hg19.bed 18 | referenceTranscriptsFile=Genomes/hg19_M_rCRS_ref.transcripts.fa 19 | 20 | workingDir=Level_3_Temp 21 | 22 | mkdir -p $workingDir 23 | 24 | 25 | sampleID=`basename $fastqFile` 26 | sampleID=${sampleID/\.fastq/} 27 | outDir=${fastqFile/\.fastq/_rsem} 28 | mkdir -p $outDir 29 | mkdir -p $outDir/working 30 | tmpFastqFile=$outDir/`basename $fastqFile` 31 | outBamFile1=$outDir/alignments.bam 32 | outBamFile2=$outDir/rg_alignments.bam 33 | outBamFile3=$outDir/phred33_alignments.bam 34 | outBamFile4=$outDir/sorted_genome_alignments 35 | echo processing $sampleID 36 | 37 | #1. Format fastq 1 for Mapsplice 38 | java -Xmx512M -jar $ubu fastq-format --phred33to64 --strip --suffix /1 --in $fastqFile --out $tmpFastqFile> $outDir/working/mapsplice_prep.log 39 | echo preprocessing is done 40 | 41 | #2.Mapsplice 42 | python $mapspliceDir/mapsplice_multi_thread.py --fusion --all-chromosomes-files $referenceGenomeFile -X 8 -Q fq --chromosome-files-dir $referenceChromsomesFile --Bowtieidx $referenceGenomeIndexFile -1 $tmpFastqFile -o $outDir 43 | #echo initial bam file is created now.. deleting the processed FASTQ file 44 | rm $tmpFastqFile 45 | 46 | #3.Add read groups 47 | java -Xmx2G -jar $picardDir/AddOrReplaceReadGroups.jar INPUT=$outBamFile1 OUTPUT=$outBamFile2 RGSM=$sampleID RGID=$sampleID RGLB=TruSeq RGPL=illumina RGPU=barcode VALIDATION_STRINGENCY=SILENT TMP_DIR=$outDir/working/add_rg_tag_tmp > $outDir/working/add_rg_tag.log 48 | echo read groups added or replaced now! 49 | 50 | #4.Convert back to phred33 51 | java -Xmx512M -jar $ubu sam-convert --phred64to33 --in $outBamFile2 --out $outBamFile3 > $outDir/working/sam_convert.log 52 | echo bam file converted back to phred33 53 | 54 | #5.Sort by coordinate 55 | $samTools sort $outBamFile3 $outBamFile4 56 | echo converted Bam file is sorted now 57 | 58 | #6.Flagstat 59 | $samTools flagstat ${outBamFile4}.bam > ${outBamFile4}.flagstat 60 | echo flagstat file created now! 61 | 62 | #7.Index 63 | $samTools index ${outBamFile4}.bam 64 | echo Bam file is sorted now 65 | 66 | #8. Sort By chromosome, then read id 67 | echo using perl script from $softwareDir 68 | perl $softwareDir/sort_bam_by_reference_and_name.pl --input ${outBamFile4}.bam --output $outDir/sorted_by_chr_read.bam --temp-dir ${outDir}.tmp --samtools $samTools > $outDir/working/sorted_by_chr_read.log 69 | echo sorted by chromosome then id 70 | 71 | #9. Translate to transcriptome coors 72 | echo in directory $outDir 73 | java -Xmx3G -jar $ubu sam-xlate --single --bed $referenceBedFile --in $outDir/sorted_by_chr_read.bam --out $outDir/transcriptome_alignments.bam --order $referenceTranscriptsFile --xgtags --reverse > $outDir/working/genome_to_transcriptome.log 74 | echo translation to transcriptome coors done! 75 | 76 | #10. Filter indels, large inserts, zero mapping quality from transcriptome bam $ubu 1.2 version needed for this step to use '--single' parameter 77 | java -Xmx512M -jar $ubu sam-filter --single --in $outDir/transcriptome_alignments.bam --out $outDir/transcriptome_alignments_filtered.bam --strip-indels --max-insert 10000 --mapq 1 > $outDir/working/sam_filter.log 78 | echo Filtered indels, large inserts, zero mapping quality from transcriptome bam 79 | 80 | #11. RSEM 81 | echo starting rsem normalization in $outDir for $sampleID 82 | 83 | $rsemDir/rsem-calculate-expression --bam -p 8 --estimate-rspd --temporary-folder ${outDir}.temp_rsem --no-bam-output $outDir/transcriptome_alignments_filtered.bam $referenceGenomeRef $sampleID > $outDir/working/rsem.log 84 | 85 | 86 | echo data is RSEM normalized 87 | 88 | #12. Strip trailing tabs from rsem.isoforms.results 89 | echo moving output files for $sampleID for final processing... 90 | mv ${sampleID}* $workingDir/ 91 | 92 | perl $softwareDir/strip_trailing_tabs.pl --input $workingDir/${sampleID}.isoforms.results --temp $outDir/working/${sampleID}.orig.isoforms.results 93 | 94 | #13. Prune isoforms from gene quant file 95 | mv $workingDir/${sampleID}.genes.results $outDir/working/${sampleID}.orig.genes.results; sed /^uc0/d $outDir/working/${sampleID}.orig.genes.results >$workingDir/${sampleID}.genes.results 96 | 97 | #14. Normalize gene quant 98 | perl $softwareDir/quartile_norm.pl -c 5 -q 75 -t 1000 -o $workingDir/${sampleID}.rsem.genes.normalized_results $workingDir/${sampleID}.genes.results 99 | 100 | #16. Normalize isoform quant 101 | perl $softwareDir/quartile_norm.pl -c 5 -q 75 -t 300 -o $workingDir/${sampleID}.rsem.isoforms.normalized_results $workingDir/${sampleID}.isoforms.results 102 | 103 | #******************************************************** 104 | #outDir=/data2/u01_hmec_batch01/fastq/f1/FASTQ/f1 105 | #******************************************************** 106 | #17. Junction counts 107 | #java -Xmx512M -jar $ubu sam-junc --junctions $softwareDir/splice_junctions.txt --in $outDir/$outDir/sorted_genome_alignments.bam --out $workingDir/${sampleID}.junction_quantification.txt > $outDir/working/${sampleID}_junction_quantification.log 108 | 109 | #18. Exon counts 110 | #$bedToolsDir/coverageBed -split -abam $outDir/sorted_genome_alignments.bam -b $softwareDir/composite_exons.bed | perl $softwareDir/normalizeBedToolsExonQuant.pl $softwareDir/composite_exons.bed > $outDir/${sampleID}.bt.exon_quantification.txt 111 | 112 | #19. Cleanup large intermediate output 113 | #rm alignments.bam logs/* working/phred33_alignments.bam working/rg_alignments.bam working/sorted_by_chr_read.bam working/transcriptome_alignments.bam working/transcriptome_alignments_filtered.bam working/prep_1.fastq working/prep_2.fastq > working/cleanup.log 114 | -------------------------------------------------------------------------------- /Scripts/process_tcga_rsubread: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | 5 | mkdir -p DownloadSamples CancerTypes Temp FeatureCounts InProgress 6 | 7 | cgquery -o Query.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&sample_type=0*&study=phs000178" 8 | #cgquery -o Query.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&sample_type=0*&study=phs000178&disease_abbr=DLBC" 9 | 10 | rm -rfv DownloadSamples/* CancerTypes/* 11 | python Codes/ParseCgHubQueryResults.py Query.xml "" DownloadSamples CancerTypes 12 | 13 | rm -rf Temp/* 14 | 15 | for f in $(pwd)/DownloadSamples/* 16 | do 17 | sampleID=$(basename $f) 18 | sampleID=${sampleID/\.xml/} 19 | 20 | if [ -f FeatureCounts/$sampleID ] 21 | then 22 | echo $sampleID already processed 23 | continue 24 | fi 25 | 26 | if [ -f InProgress/$sampleID ] 27 | then 28 | echo $sampleID currently being processed 29 | continue 30 | fi 31 | 32 | $(pwd)/Scripts/normalize_tcga_rsubread $f 33 | done 34 | -------------------------------------------------------------------------------- /Scripts/summarize_tcga_rsubread: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function buildCancerTypesFile { 4 | rm -rf Temp/CancerTypes 5 | mkdir -p Temp/CancerTypes 6 | 7 | for f in FeatureCounts/* 8 | do 9 | sampleID=$(basename $f) 10 | sampleCancerType=$(cat CancerTypes/$sampleID) 11 | cancerTypeMatch=$(grep $sampleCancerType TCGA_CancerTypes_Publishable.txt) 12 | 13 | # Make sure we can publish on this sample 14 | if [ "$cancerTypeMatch" == "$sampleCancerType" ] 15 | then 16 | cp -v CancerTypes/$sampleID Temp/CancerTypes/ 17 | fi 18 | done 19 | 20 | python Codes/CombineScalarValues.py "Temp/CancerTypes/*" PANCAN20_CancerType_Samples.txt 21 | 22 | rm -rf Temp/CancerTypes 23 | } 24 | 25 | function matricize { 26 | subDir=$1 27 | 28 | tempSummDir=Temp/Summarize_${subDir} 29 | rm -rf $tempSummDir 30 | mkdir -p $tempSummDir 31 | 32 | for f in $subDir/* 33 | do 34 | sampleID=$(basename $f) 35 | sampleCancerType=$(cat CancerTypes/$sampleID) 36 | cancerTypeMatch=$(grep $sampleCancerType TCGA_CancerTypes_Publishable.txt) 37 | 38 | # Make sure we can publish on this sample 39 | if [ "$cancerTypeMatch" == "$sampleCancerType" ] 40 | then 41 | cp -v $f $tempSummDir/ 42 | fi 43 | done 44 | 45 | outFile=matrices/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_${subDir}.txt 46 | 47 | python Codes/BuildMatrixFile.py "$tempSummDir/*" $outFile 48 | python Codes/PrintMatrixDimensions.py $outFile 49 | 50 | rm -f $outFile.gz 51 | 52 | echo Zipping $outFile 53 | gzip -v $outFile 54 | 55 | rm -rf $tempSummDir 56 | } 57 | 58 | buildCancerTypesFile 59 | 60 | matricize RPKMlog & 61 | matricize RPKM & 62 | matricize FeatureCounts & 63 | wait 64 | -------------------------------------------------------------------------------- /TCGA_CancerType_Abbreviations.txt: -------------------------------------------------------------------------------- 1 | LAML Acute Myeloid Leukemia 2 | ACC Adrenocortical carcinoma 3 | BLCA Bladder Urothelial Carcinoma 4 | LGG Brain Lower Grade Glioma 5 | BRCA Breast invasive carcinoma 6 | CESC Cervical squamous cell carcinoma and endocervical adenocarcinoma 7 | CHOL Cholangiocarcinoma 8 | LCML Chronic Myelogenous Leukemia 9 | COAD Colon adenocarcinoma 10 | CNTL Controls 11 | ESCA Esophageal carcinoma 12 | GBM Glioblastoma multiforme 13 | HNSC Head and Neck squamous cell carcinoma 14 | KICH Kidney Chromophobe 15 | KIRC Kidney renal clear cell carcinoma 16 | KIRP Kidney renal papillary cell carcinoma 17 | LIHC Liver hepatocellular carcinoma 18 | LUAD Lung adenocarcinoma 19 | LUSC Lung squamous cell carcinoma 20 | DLBC Lymphoid Neoplasm Diffuse Large B-cell Lymphoma 21 | MESO Mesothelioma 22 | MISC Miscellaneous 23 | OV Ovarian serous cystadenocarcinoma 24 | PAAD Pancreatic adenocarcinoma 25 | PCPG Pheochromocytoma and Paraganglioma 26 | PRAD Prostate adenocarcinoma 27 | READ Rectum adenocarcinoma 28 | SARC Sarcoma 29 | SKCM Skin Cutaneous Melanoma 30 | STAD Stomach adenocarcinoma 31 | TGCT Testicular Germ Cell Tumors 32 | THYM Thymoma 33 | THCA Thyroid carcinoma 34 | UCS Uterine Carcinosarcoma 35 | UCEC Uterine Corpus Endometrial Carcinoma 36 | UVM Uveal Melanoma 37 | -------------------------------------------------------------------------------- /TCGA_CancerType_Publishable.txt: -------------------------------------------------------------------------------- 1 | ACC 2 | BLCA 3 | BRCA 4 | CESC 5 | COAD 6 | DLBC 7 | GBM 8 | HNSC 9 | KICH 10 | KIRC 11 | KIRP 12 | LAML 13 | LGG 14 | LIHC 15 | LUAD 16 | LUSC 17 | OV 18 | PRAD 19 | READ 20 | SKCM 21 | STAD 22 | THCA 23 | UCEC 24 | UCS 25 | --------------------------------------------------------------------------------