├── .gitignore
├── Analysis
├── TCGA_24_manuscript_analysis.Rmd
└── TCGA_24_manuscript_analysis.html
├── Analysis_datasets
├── .RData
├── .Rhistory
├── 10_14_predictions_raw
│ ├── RSEM_q_log_200_f
│ │ ├── REPORT.html
│ │ ├── model.txt
│ │ ├── parameters.txt
│ │ ├── predictions.png
│ │ ├── probabilities.txt
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
│ ├── TPM_q_log_200_f
│ │ ├── REPORT.html
│ │ ├── model.txt
│ │ ├── parameters.txt
│ │ ├── predictions.png
│ │ ├── probabilities.txt
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
│ └── fpkm_q_log_200_f
│ │ ├── REPORT.html
│ │ ├── model.txt
│ │ ├── parameters.txt
│ │ ├── predictions.png
│ │ ├── probabilities.txt
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
├── 5_01_predictions_raw
│ ├── fpkmlog_no
│ │ ├── REPORT.html
│ │ ├── predictions.png
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
│ ├── rsem
│ │ ├── REPORT.html
│ │ ├── predictions.png
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
│ ├── rsem_no
│ │ ├── REPORT.html
│ │ ├── predictions.png
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
│ └── tpmlog_no
│ │ ├── REPORT.html
│ │ ├── predictions.png
│ │ ├── signature.cdt
│ │ ├── signature.png
│ │ └── signature_s.cdt
├── Classification_12_LUAD_LUSC_Predictions.txt
├── Classification_20_LUAD_LUSC_Predictions.txt
├── GFP18_HER2_Rsubread_FPKM.txt
├── GFP18_HER2_Rsubread_TPM.txt
├── GFP18_HER2_Rsubread_geneCounts.txt
├── GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt
├── GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt
├── PANCAN12_19583_by_3380_numZeroes.txt
├── PANCAN20_19583_by_3380_numZeroes.txt
├── Rsem_10_14.txt
├── TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt
└── rsubread_10_14.txt
├── Codes
├── BuildMatrixFile.py
├── CalcAUC.R
├── CalcAccuracy.R
├── Classify_luad_vs_lusc.R
├── CombineScalarValues.py
├── FileContainsText.py
├── GetFileExtension.py
├── IdentifyDiscordantPredictions.R
├── IdentifyInconsistentPredictions.R
├── LUSC_vs_LUAD.R
├── ParseCgHubQueryResults.py
├── ParseSampleTypes.py
├── PeekMatrix.py
├── PlotDiscordant.R
├── PrintMatrixDimensions.py
├── ProcessClinicalData.R
├── ProcessRnaSeqFeatureCounts.R
├── Split.py
├── TransposeData.py
├── biological_rep.R
├── numZero.R
└── utilities.py
├── LICENSE
├── README.md
├── Scripts
├── LUSC_LUAD_discordant_analysis
├── normalize_tcga_rsubread
├── process_tcga_level_3
├── process_tcga_rsubread
└── summarize_tcga_rsubread
├── TCGA_CancerType_Abbreviations.txt
└── TCGA_CancerType_Publishable.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt
2 | FeatureCounts
3 | FPKM
4 | FPKMlog
5 | TPM
6 | TPMlog
7 | DownloadSamples
8 | Temp
9 | CancerTypes
10 | InProgress
11 | go
12 | Query.xml
13 | *.key
14 | XmlFiles
15 | Genome
16 | *.jar
17 | temp*
18 | Scripts/*_rsubread2
19 | Codes/ProcessRnaSeqFeatureCounts2.R
20 | nohup*
21 | Stats
22 | update_git
23 | commit_git
24 | Analysis/*_cache
25 | Analysis/*_cache/*
26 | Analysis/*_files
27 | Analysis/*_files/*
28 | Codes/ForMoom
29 | Codes/ForMoom/*
30 | Analysis/*20*
31 |
--------------------------------------------------------------------------------
/Analysis_datasets/.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/.RData
--------------------------------------------------------------------------------
/Analysis_datasets/.Rhistory:
--------------------------------------------------------------------------------
1 | for (i in 1:4)
2 | + {
3 | + hist(iris[,i], main=colnames(iris)[i])
4 | + }
5 | dskasd
6 | for (i in 1:4)
7 | hist(iris[,i], main=colnames(iris)[i])
8 | par(mfrow=c(2,2))
9 | for (i in 1:4)
10 | hist(iris[,i], main=colnames(iris)[i])
11 | library("knitr", lib.loc="~/Library/R/3.1/library")
12 | install.packages(c("car", "colorspace", "manipulate", "Rcpp", "RcppArmadillo"))
13 | install.packages("mgcv", lib="/Library/Frameworks/R.framework/Versions/3.1/Resources/library")
14 | source('~/.active-rstudio-document')
15 | date: March 9, 2015
16 | date: 3/9/2015
17 | date:
18 | #date:
19 | date: 03-19-2015
20 | legend("topleft", legend=levels(iris$Species), col=levels(iris$Species))
21 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
22 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)))
23 | print(1:length(levels(iris$Species)))
24 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
25 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lty=1, lwd=1)
26 | print(1:length(levels(iris$Species)))
27 | par(mfrow=c(2,2))
28 | for (i in 1:(ncol(iris) - 1))
29 | hist(iris[,i], main=colnames(iris[i]), xlab="centimeters")
30 | par(mfrow=c(1,1))
31 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
32 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lty=1, lwd=1)
33 | print(1:length(levels(iris$Species)))
34 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
35 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), lwd=1)
36 | print(1:length(levels(iris$Species)))
37 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot")
38 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20)
39 | print(1:length(levels(iris$Species)))
40 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20)
41 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20)
42 | print(1:length(levels(iris$Species)))
43 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20)
44 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.5)
45 | print(1:length(levels(iris$Species)))
46 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.5)
47 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20)
48 | print(1:length(levels(iris$Species)))
49 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
50 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
51 | print(1:length(levels(iris$Species)))
52 | library("lattice", lib.loc="/Library/Frameworks/R.framework/Versions/3.1/Resources/library")
53 | install.packages("lattice")
54 | print(head(irisData))
55 | irisData = select(iris, -Species)
56 | print(head(irisData))
57 | irisData = select(iris, -Species)
58 | #print(head(irisData))
59 | library(dplyr)
60 | irisData = select(iris, -Species)
61 | #print(head(irisData))
62 | library(dplyr)
63 | install.packages("dplyr")
64 | library(dplyr)
65 | irisData = select(iris, -Species)
66 | print(head(irisData))
67 | library(dplyr)
68 | irisData = select(iris, -Species)
69 | head(irisData)
70 | irisData = scale(irisData)
71 | head(irisData)
72 | boxplot(irisData)
73 | irisData = scale(irisData)
74 | head(irisData)
75 | boxplot(irisData)
76 | boxplot(irisData$Petal.Width~iris$Species)
77 | for (i in 1:(ncol(iris) - 1))
78 | boxplot(irisData[,i] ~ iris$Species, main=colnames(iris)[i], ylab="centimeters")
79 | pcIrisData = prcomp(irisData)
80 | pcIrisData
81 | pcIrisData = prcomp(t(irisData))
82 | pcIrisData
83 | pcIrisData = prcomp(irisData)
84 | pcIrisData
85 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st Principal Component", ylab="2nd Principal Component", pch=20, cex=1.25)
86 | #ggplot(pcIrisData, aes(x=PC1, y=PC2, color=Species)) + geom_point()
87 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st principal component", ylab="2nd principal component", pch=20, cex=1.25, main="Principal Components for iris data")
88 | legend("topleft", legend=levels(iris$Species), col=iris$Species, pch=20, cex=1.25)
89 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
90 | legend("topleft", legend=levels(iris$Species), col=iris$Species, pch=20, cex=1.25)
91 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
92 | legend("topleft", legend=levels(iris$Species), col=levels(iris$Species), pch=20, cex=1.25)
93 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
94 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
95 | #ggplot(pcIrisData, aes(x=PC1, y=PC2, color=Species)) + geom_point()
96 | plot(pcIrisData$x[,1], pcIrisData$x[,2], col=iris$Species, xlab="1st principal component", ylab="2nd principal component", pch=20, cex=1.25, main="Principal Components for iris data")
97 | legend("topleft", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
98 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Basic scatterplot", pch=20, cex=1.25)
99 | legend("topright", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
100 | plot(iris$Petal.Length, iris$Petal.Width)
101 | plot(iris$Petal.Length, iris$Petal.Width, main="Basic scatterplot")
102 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=10)
103 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=12)
104 | ```
105 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=111)
106 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=18)
107 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15)
108 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15, col="green")
109 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=15, col="red")
110 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", xlab="Petal Length", ylab="Petal Width")
111 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=3, xlab="Petal Length", ylab="Petal Width")
112 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
113 | plot(iris$Petal.Length, iris$Petal.Width, main="Basic scatterplot of petal features")
114 | plot(iris$Sepal.Length, iris$Sepal.Width, col=iris$Species, xlab="Sepal length", ylab="Sepal width", main="Sepal characteristics among species", pch=20, cex=1.25)
115 | legend("topright", legend=levels(iris$Species), col=1:length(levels(iris$Species)), pch=20, cex=1.25)
116 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
117 | model <- lm(iris$Petal.Length ~ iris$Petal.Width)
118 | abline(model, lwd = 2)
119 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
120 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
121 | abline(model, lwd = 2)
122 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
123 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
124 | abline(model, lwd = 2)
125 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
126 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
127 | abline(model, lwd = 4)
128 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=15, col="red", cex=2, xlab="Petal Length", ylab="Petal Width")
129 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
130 | abline(model, lwd = 4, lty=2)
131 | plot(iris$Petal.Length, iris$Petal.Width, main="Change plotting character", pch=18)
132 | plot(iris$Petal.Length, iris$Petal.Width, main="Change axis labels", pch=18, col="red", cex=1.5, xlab="Petal Length", ylab="Petal Width")
133 | plot(iris$Petal.Length, iris$Petal.Width, main="Plot regression line", pch=18, col="red", cex=1.5, xlab="Petal Length", ylab="Petal Width")
134 | model <- lm(iris$Petal.Width ~ iris$Petal.Length)
135 | abline(model, lwd = 3, lty=2, col="gray")
136 | abline(model, lwd = 3, lty=2, col="darkgray")
137 | pca$rotation
138 | pcIrisData$rotation
139 | percent <- 100 * pcIrisData$sdev^2 / sum(pca$sdev^2)
140 | percent
141 | percent <- 100 * pcIrisData$sdev^2 / sum(pcIrisData$sdev^2)
142 | percent
143 | barplot(percent)
144 | barplot(percent, names.arg=1:4, xlab="Principal Component", ylab="% variance explained")
145 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
146 | xlim(-1.,1.25) +
147 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
148 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
149 | ggplot(rotation_data) +
150 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
151 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
152 | xlim(-1.,1.25) +
153 | ylim(-1.,1.) +
154 | coord_fixed() # fix aspect ratio to 1:1
155 | library(ggplot)
156 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
157 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
158 | ggplot(rotation_data) +
159 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
160 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
161 | xlim(-1.,1.25) +
162 | ylim(-1.,1.) +
163 | coord_fixed() # fix aspect ratio to 1:1
164 | library(ggplot2)
165 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
166 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
167 | ggplot(rotation_data) +
168 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
169 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
170 | xlim(-1.,1.25) +
171 | ylim(-1.,1.) +
172 | coord_fixed() # fix aspect ratio to 1:1
173 | library(graphics)
174 | library(ggplot2)
175 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
176 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
177 | ggplot(rotation_data) +
178 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
179 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
180 | xlim(-1.,1.25) +
181 | ylim(-1.,1.) +
182 | coord_fixed() # fix aspect ratio to 1:1
183 | library(graphics)
184 | library(ggplot2)
185 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
186 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
187 | #ggplot(rotation_data) +
188 | # geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
189 | # geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
190 | # xlim(-1.,1.25) +
191 | # ylim(-1.,1.) +
192 | # coord_fixed() # fix aspect ratio to 1:1
193 | library(ggplot2)
194 | library(grid)
195 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
196 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
197 | #ggplot(rotation_data) +
198 | # geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
199 | # geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
200 | # xlim(-1.,1.25) +
201 | # ylim(-1.,1.) +
202 | # coord_fixed() # fix aspect ratio to 1:1
203 | library(ggplot2)
204 | library(grid)
205 | rotation_data <- data.frame(pcIrisData$rotation, variable=row.names(pcIrisData$rotation))
206 | arrow_style <- arrow(length = unit(0.05, "inches"), type = "closed")
207 | ggplot(rotation_data) +
208 | geom_segment(aes(xend=PC1, yend=PC2), x=0, y=0, arrow=arrow_style) +
209 | geom_text(aes(x=PC1, y=PC2, label=variable), hjust=0, size=3, color='red') +
210 | xlim(-1.,1.25) +
211 | ylim(-1.,1.) +
212 | coord_fixed() # fix aspect ratio to 1:1
213 | ?prcomp
214 | iris
215 | ?kmeans
216 | install.packages("useful")
217 | plot(k1, data=iris)
218 | source('~/.active-rstudio-document')
219 | k1 <- kmeans(x=iris[, 1:4], centers=3)
220 | plot(k1)
221 | plot(k1, data=iris)
222 | k1 <- kmeans(x=iris[, 1:4], centers=3)
223 | library(useful)
224 | plot(k1)
225 | irisData = iris[,-5] # Negative sign excludes the specified column
226 | head(irisData)
227 | irisData = scale(irisData)
228 | head(irisData)
229 | boxplot(irisData)
230 | ?subset
231 | ?c
232 | x = rep(1, 1000)
233 | x
234 | hist(x)
235 | plot(density(x))
236 | ls()
237 | library(dplyr)
238 | ?inner_join
239 | setwd("~/GitRepos/TCGA_RNASeq_clinical/Analysis_datasets")
240 | setwd("TCGA_RNASeq_clinical/Analysis_datasets")
241 | setwd("~/GitRepos/TCGA_RNASeq_clinical/Analysis_datasets")
242 | getwd()
243 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
244 | # Rsubread pipeline, gene counts
245 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
246 | # TCGA pipeline, normalized expression files
247 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
248 | # Rsubread pipeline, FPKM values
249 | rsub_fpkm<-read.table("GFP18_HER2_Rsubread_FPKM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
250 | rsub_fpkmlog<-log2(rsub_fpkm+1)
251 | # Rsubread pipeline, TPM values
252 | rsub_tpm<-read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
253 | rsub_tpmlog<-log2(rsub_tpm+1)
254 | # Clinical data
255 | clinicals<-t(read.delim('TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt',sep='\t',header=1, row.names=1,check.names=F)) # This was downloaded from GEO Accession # GSE62820 and unzipped
256 | ```
257 | # TCGA pipeline, expected counts
258 | rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
259 | # Rsubread pipeline, gene counts
260 | feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
261 | # TCGA pipeline, normalized expression files
262 | TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F) # This was downloaded from GEO Accession # GSE62820 and unzipped
263 | # Rsubread pipeline, FPKM values
264 | rsub_fpkm<-read.table("GFP18_HER2_Rsubread_FPKM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
265 | rsub_fpkmlog<-log2(rsub_fpkm+1)
266 | # Rsubread pipeline, TPM values
267 | rsub_tpm<-read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F) # This was downloaded from GEO Accession # GSE62820 and unzipped
268 | rsub_tpmlog<-log2(rsub_tpm+1)
269 | # Clinical data
270 | clinicals<-t(read.delim('TCGA20_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt',sep='\t',header=1, row.names=1,check.names=F)) # This was downloaded from GEO Accession # GSE62820 and unzipped
271 | rsub_preds<-read.table("rsubread_10_14.txt", sep='\t', header=1, row.names=1)
272 | tcga_preds<-read.table("Rsem_10_14.txt", sep='\t', header=1, row.names=1)
273 | pancan12_zero<-read.table("PANCAN12_19583_by_3380_numZeroes.txt",row.names=1,sep='\t')# File is at Analysis_datasets
274 | pancan20_tpm_zero<-read.table("PANCAN20_19583_by_3380_numZeroes.txt",sep='\t',row.names=1)# File is at Analysis_datasets
275 | data12 = read.table("Classification_12_LUAD_LUSC_Predictions.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE, row.names=1) # File is at Analysis_datasets
276 | data20 = read.table("Classification_20_LUAD_LUSC_Predictions.txt", sep="\t", stringsAsFactors=FALSE, header=TRUE, row.names=1)# File is at Analysis_datasets
# Standardized mean difference (effect size) between two groups.
#
# The difference between the two group means is divided by the pooled standard
# deviation, in which each group's variance is weighted by its degrees of
# freedom (n - 1). No small-sample correction factor is applied here.
#
# Arguments:
#   m.1, sd.1, n.1  mean, standard deviation and sample size of group 1
#   m.2, sd.2, n.2  mean, standard deviation and sample size of group 2
# Returns:
#   (m.1 - m.2) divided by the pooled standard deviation.
standardized_mean <- function(m.1, sd.1, n.1, m.2, sd.2, n.2) {
  weighted_var_sum <- (n.1 - 1) * sd.1^2 + (n.2 - 1) * sd.2^2
  pooled_df <- n.1 + n.2 - 2
  (m.1 - m.2) / sqrt(weighted_var_sum / pooled_df)
}
# Merge two tables and return the result keyed by the shared identifiers.
#
# x and y are passed to merge() with the given `by` (default 0, which merge()
# treats as "join on row names"). The `Row.names` column that merge() creates
# is copied into the row names of the result, and then the first column is
# left out of the returned table.
#
# Arguments:
#   x, y  the two matrices / data frames to merge
#   by    forwarded unchanged to merge(); defaults to 0
# Returns:
#   columns 2..ncol of the merged table, with row names taken from Row.names.
merge_drop <- function(x, y, by = 0)
{
  joined <- merge(x, y, by = by)
  rownames(joined) <- joined$Row.names
  kept_cols <- 2:ncol(joined)
  joined[, kept_cols]
}
# Draw a ROC curve for predicted class probabilities and annotate it with the
# AUC and its confidence interval.
#
# Arguments:
#   actual         observed classes (response side of the roc() formula)
#   probabilities  predicted scores (predictor side of the roc() formula)
#   plotCI         when TRUE, also overlay the ci.se() sensitivity band and
#                  bars, then redraw the ROC curve on top of them
#
# Side effects: sets par(mar, lwd = 4) before plotting and finishes by setting
# mar to c(5.1, 4.1, 2.1, 2.1); lwd is left at 4. Attaches the pROC package.
plotROC = function(actual, probabilities, plotCI=FALSE)
{
  # mar order: bottom, left, top, right
  par(mar=c(4.5, 4.7, 0.0, 0.5), lwd=4)
  library(pROC)

  fit = roc(actual ~ probabilities, ci=TRUE, plot=TRUE, print.auc=FALSE)

  # The three entries of fit$ci are used below as lower bound, AUC, upper bound.
  auc_lo = format(fit$ci[1], digits=3)
  auc_mid = format(fit$ci[2], digits=3)
  auc_hi = format(fit$ci[3], digits=3)

  if (plotCI)
  {
    ci(fit)
    se_band <- ci.se(fit)
    plot(se_band, type="shape", col="gray95")
    plot(se_band, type="bars")
    plot(fit, add=TRUE)
  }

  # Label is written at the bottom of the panel: "AUC: mid (lower-upper)".
  auc_label = paste0("AUC: ", auc_mid, " (", auc_lo, "-", auc_hi, ")")
  text(0.5, 0.00, labels=auc_label)

  par(mar=c(5.1, 4.1, 2.1, 2.1))
}
########## Empirical cumulative distribution of log2 counts, one curve per sample, overlaid on the same graph ##########
### TCGA pipeline aligned data (expected counts)
# One ecdf() per column of the log2(count + 1) matrix.
ecdf_all_ex<-apply(log2(rsem_her2_expected_counts+1),2,ecdf)
par( mfrow = c( 1, 2 ) )
plot(ecdf_all_ex[[1]],xlab="log2(Total mapped reads)",ylab="Cumulative proportion",col="blue",main="TCGA pipeline",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5)
# Anchor the legend by keyword: explicit coordinates such as (10, 10) lie above
# ylim = c(0, 1) and are clipped out of the plot. lty is required for the
# colour key lines to be drawn next to the labels.
legend("bottomright",c("GFP", "HER2"), col = c("blue","brown"), lty=1)
# Columns 2-12 are drawn in blue and 13-17 in brown (GFP and HER2 in the legend).
for(i in 2:12){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="blue")}
for(i in 13:17){lines(ecdf_all_ex[[i]],xlab=NA, ylab = NA,col="brown")}
318 | ###using Rsubread pipeline aligned data
319 | ecdf_all<-apply(log2(feature+1),2,ecdf)
320 | plot(ecdf_all[[1]],xlab="log2(Total mapped reads)",ylab="Cumulative proportion",col="blue",main="Rsubread pipeline",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5)
321 | for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")}
322 | for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")}
323 | ############computing total number of read counts per samples and plotting them as dot plots####
324 | expected_counts<-apply(rsem_her2_expected_counts,2,sum)
325 | feature_counts<-apply(feature,2,sum)
326 | # Creating a plot showing total mapped reads per sample
327 | par( mfrow = c( 1, 2 ),lwd=4 )
328 | x = c(rep(1, 12), rep(2, 5)) # this indicates where on the x axis to plot
329 | par(mar=c(3.1, 4.6, 2.1, 0.6)) # figure margins
330 | boxplot(log2(expected_counts[1:12]+1), log2(expected_counts[13:17]+1),range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,lwd=4,ylim=c(20,25),xlab="", ylab="log2(Total mapped reads)",main="TCGA Pipeline",col='grey75',medcol="grey75",lwd=4,border = "grey35")
331 | points(jitter(x, factor=2), c(log2(expected_counts[1:12]+1), log2(expected_counts[13:17]+1)), pch=4, cex=2, col=1, xaxt="n",cex.lab=1.5)
332 | axis(1, at=1:2, tick=T, labels=c("Control", "HER2"), cex.axis=1.5)
333 | boxplot(log2(feature_counts[1:12]+1), log2(feature_counts[13:17]+1),range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,lwd=4,ylim=c(20,25),xlab="", ylab="log2(Total mapped reads)",col='grey75',medcol="grey75",lwd=4,main="Rsubread Pipeline",border = "grey35")
334 | points(jitter(x, factor=1.5), c(log2(feature_counts[1:12]+1), log2(feature_counts[13:17]+1)), pch=4,cex=2,cex.lab=1.5,col="black")
335 | axis(1, at=1:2, tick=T, labels=c("Control", "HER2"), cex.axis=1.5)
336 | #######Boxplotting ERBB2 gene counts in HMEC samples#####
337 | par(mfrow = c(1, 1),lwd=4)
338 | names=c('TCGA\nGFP','TCGA\nHER2','Rsubread\nGFP', 'Rsubread\nHER2')
339 | rsem_her2<-data.frame(t(rsem_her2_expected_counts["ERBB2",]))
340 | rsub_her2<-data.frame(t(feature["ERBB2",]))
341 | x = c(rep(1, 12), rep(2, 5),rep(3, 12), rep(4, 5))
# ERBB2 log2(count + 1) per group: TCGA columns 1-12 and 13-17, then Rsubread
# columns 1-12 and 13-17. Title spelling corrected from "TGCA" to "TCGA".
boxplot(log2(rsem_her2$ERBB2[1:12]+1),log2(rsem_her2$ERBB2[13:17]+1),log2(rsub_her2$ERBB2[1:12]+1),log2(rsub_her2$ERBB2[13:17]+1),ylab="",range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,col='grey75',medcol="grey75",lwd=4,main=paste('Comparing TCGA and Rsubread Pipelines','\n', 'in Differentiating HER2 Overexpression from Controls',sep=''),border = "grey35")
343 | points(jitter(rep(1,12),factor=2),log2(rsem_her2$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5,col="black")
344 | points(jitter(rep(2,5),factor=2),log2(rsem_her2$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5,col='black')
345 | points(jitter(rep(3,12),factor=2),log2(rsub_her2$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5,col='black')
346 | points(jitter(rep(4,5),factor=2),log2(rsub_her2$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5,col='black')
347 | axis(1, at=1:4, tick=T, labels=c("TCGA\nControl", "TCGA\nHER2","Rsubread\nControl", "Rsubread\nHER2"), cex.axis=0.8)
348 | ##using data processed by RSEM detected difference in her2 gene count in HER2 overexpressed versus GFP overexpressed samples
349 | ##t = -12.1833, df = 4.157, p-value = 0.0002081 but was worse than Rsubread
350 | t.test(log2(rsem_her2$ERBB2[1:12]+1),log2(rsem_her2$ERBB2[13:17]+1))
351 | ##using non-normalized data processed by Rsubread, the difference in her2 gene count between HER2-overexpressed and GFP-overexpressed samples was detected much more clearly
352 | ##t = -46.6747, df = 8.35, p-value = 2.152e-11
353 | t.test(log2(rsub_her2$ERBB2[1:12]+1),log2(rsub_her2$ERBB2[13:17]+1))
354 | ###########here we are computing the standardized mean difference using the expected gene counts from the TCGA pipeline and the gene counts from the Rsubread algorithm ############
355 | ####Hedge's standardized mean/effect size using TCGA pipeline
356 | standardized_mean(mean(log2(rsem_her2$ERBB2[13:17]+1)),sd(log2(rsem_her2$ERBB2[13:17]+1)),5,mean(log2(rsem_her2$ERBB2[1:12]+1)),sd(log2(rsem_her2$ERBB2[1:12]+1)),12)
357 | ####Hedge's standardized mean/effect size using Rsubread pipeline
358 | standardized_mean(m.1=mean((log2(rsub_her2$ERBB2[13:17]+1))),sd.1=sd((log2(rsub_her2$ERBB2[13:17]+1))),n.1=5,m.2=mean((log2(rsub_her2$ERBB2[1:12]+1))),sd.2=sd((log2(rsub_her2$ERBB2[1:12]+1))),n.2 = 12)
359 | #######################comparing gene counts results ############
360 | par( mfrow = c( 1,3 ) ,lwd=4)
361 | TCGA_her2_filtered<-TCGA_her2[!duplicated(TCGA_her2$Gene),]
362 | rownames(TCGA_her2_filtered)<-TCGA_her2_filtered$Gene
363 | TCGA_her2<-subset(TCGA_her2_filtered,select=-Gene)
364 | TCGA_her2_log2<-log2(subset(TCGA_her2_filtered,select=-Gene)+1)
365 | ###Coefficient of variation in GFP samples across all common genes
366 | ####Coefficient of variation in TCGA pipeline processed data
367 | com_genes_TCGA<-TCGA_her2[rownames(TCGA_her2)%in%rownames(rsub_fpkm),]
368 | hist(na.omit(apply(com_genes_TCGA,1,sd)/apply(com_genes_TCGA,1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20)
369 | hist(na.omit(apply(com_genes_TCGA[,1:12],1,sd)/apply(com_genes_TCGA[,1:12],1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20)
370 | print(paste("Coefficient of variation in TCGA Level 3 data across 19585 genes in the control samples:",median(na.omit(apply(com_genes_TCGA[,1:12],1,sd)/apply(com_genes_TCGA[,1:12],1,mean))),sep=" "))
371 | hist(na.omit(apply(com_genes_TCGA[,13:17],1,sd)/apply(com_genes_TCGA[,13:17],1,mean)),main = "TCGA Level 3",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes", breaks = 20)
372 | print(paste("Coefficient of variation in TCGA Level 3 data across 19585 genes in the HER2-overexpressed samples:",median(na.omit(apply(com_genes_TCGA[,13:17],1,sd)/apply(com_genes_TCGA[,13:17],1,mean))),sep=" "))
373 | tcga_her2_normalized<-data.frame(t(TCGA_her2["ERBB2",]))
374 | ####Coefficient of variation in Rsubread pipeline processed data
375 | com_genes_fpkm<-rsub_fpkm[rownames(rsub_fpkm)%in%rownames(com_genes_TCGA),]
376 | hist(na.omit(apply(com_genes_fpkm[,13:17],1,sd)/apply(com_genes_fpkm[,13:17],1,mean)),main = "Rsubread FPKM",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes",breaks=20)
377 | print(paste("Coefficient of variation in Rsubread FPKM normalized data across 19585 genes in the control samples:",median((na.omit(apply(com_genes_fpkm[,1:12],1,sd)/apply(com_genes_fpkm[,1:12],1,mean)))),sep=''))
378 | print(paste("Coefficient of variation in Rsubread FPKM normalized data across 19585 genes in the HER2-overexpressed samples:",median((na.omit(apply(com_genes_fpkm[,13:17],1,sd)/apply(com_genes_fpkm[,13:17],1,mean)))),sep=''))
379 | rsub_fpkmlog_her2<-data.frame(t(rsub_fpkmlog["ERBB2",]))
380 | rsub_fpkm_her2<-data.frame(t(rsub_fpkm["ERBB2",]))
#### Coefficient of variation in Rsubread TPM data
# Subset the TPM matrix itself. Indexing rsub_fpkm with a mask built from the
# rsub_tpm row names would report FPKM-derived values under the "TPM" labels.
com_genes_tpm<-rsub_tpm[rownames(rsub_tpm)%in%rownames(com_genes_TCGA),]
# Histogram of per-gene sd/mean across columns 13-17; genes whose ratio is NA are dropped.
hist(na.omit(apply(com_genes_tpm[,13:17],1,sd)/apply(com_genes_tpm[,13:17],1,mean)),main = "Rsubread TPM",xlab = "Coefficient of variation",ylim=c(0,12500),lwd=4,ylab="Number of genes")
# Median per-gene coefficient of variation, reported separately for columns 1-12 and 13-17.
print(paste("Coefficient of variation in Rsubread TPM normalized data across 19585 genes in the control samples:",median((na.omit(apply(com_genes_tpm[,1:12],1,sd)/apply(com_genes_tpm[,1:12],1,mean)))),sep=''))
print(paste("Coefficient of variation in Rsubread TPM normalized data across 19585 genes in the HER2-overexpressed samples:",median((na.omit(apply(com_genes_tpm[,13:17],1,sd)/apply(com_genes_tpm[,13:17],1,mean)))),sep=''))
385 | rsub_tpm_her2<-data.frame(t(rsub_tpm["ERBB2",]))
386 | rsub_tpmlog_her2<-data.frame(t(rsub_tpmlog["ERBB2",]))
387 | ####### Post-normalization ECDFs: one curve per column; columns 1-12 drawn in blue, columns 13-17 in brown
388 | ecdf_all_ex <- apply(log2(TCGA_her2 + 1), 2, ecdf)
389 | par(mfrow = c(1, 3))
390 | plot(ecdf_all_ex[[1]], main = "TCGA Level 3", col = "blue", xlab = NA, ylab = NA, xlim = c(0, 20), ylim = c(0, 1), cex.axis = 1.5, cex.lab = 1.5)
391 | for (i in 2:12) lines(ecdf_all_ex[[i]], xlab = NA, ylab = NA, col = "blue")
392 | for (i in 13:17) lines(ecdf_all_ex[[i]], xlab = NA, ylab = NA, col = "brown")
393 | ### Second panel: ECDFs of the columns of rsub_fpkmlog (used as-is, no further transform here)
394 | ecdf_all <- apply(rsub_fpkmlog, 2, ecdf)
395 | plot(ecdf_all[[1]], main = "Rsubread FPKM", col = "blue", xlab = "log2(normalized expression)", ylab = "Cumulative proportion", xlim = c(0, 20), ylim = c(0, 1), cex.axis = 1.5, cex.lab = 1.5)
396 | for (i in 2:12) lines(ecdf_all[[i]], xlab = NA, ylab = NA, col = "blue")
397 | for (i in 13:17) lines(ecdf_all[[i]], xlab = NA, ylab = NA, col = "brown")
398 | ecdf_all_t <- apply(rsub_tpmlog, 2, ecdf)  # third panel: same plot for rsub_tpmlog
399 | plot(ecdf_all_t[[1]], main = "Rsubread TPM", col = "blue", xlab = "log2(normalized expression)", ylab = "Cumulative proportion", xlim = c(0, 20), ylim = c(0, 1), cex.axis = 1.5, cex.lab = 1.5)
400 | for (i in 2:12) lines(ecdf_all_t[[i]], xlab = NA, ylab = NA, col = "blue")
401 | for (i in 13:17) lines(ecdf_all_t[[i]], xlab = NA, ylab = NA, col = "brown")
402 | ### Boxplots of normalized ERBB2 expression: elements 1-12 vs 13-17 for each of the three data sets, with the individual values overlaid
403 | par( mfrow = c( 1, 1 ) )
404 | par(mar=c(5, 4.5, 3.5, 0.5))
405 | boxplot(log2(tcga_her2_normalized$ERBB2[1:12]+1),log2(tcga_her2_normalized$ERBB2[13:17]+1),rsub_fpkmlog_her2$ERBB2[1:12],rsub_fpkmlog_her2$ERBB2[13:17],rsub_tpmlog_her2$ERBB2[1:12],rsub_tpmlog_her2$ERBB2[13:17],ylab="log2(HER2 gene expression values)",main="Comparing HER2 normalized expression between\n control and her2 samples",range=0,cex.axis=1.5, cex.lab=1.5,outpch=NA,col='grey75',medcol="grey75",lwd=4,border = "grey35")  # only the TCGA values are log2(x+1)-transformed here; the Rsubread *log objects are plotted as-is
406 | names=c("TCGA\nGFP","TCGA\nHER2","Rsubread FPKM\nGFP", "Rsubread FPKM\nHER2","Rsubread TPM\nGFP", "Rsubread TPM\nHER2")  # FIX: labels previously misspelled the tool name as "Rsubred"
407 | text(seq(1,6,by=1),par("usr")[3] - 2, labels = names, srt = 45, pos = 1, xpd = TRUE)  # rotated group labels placed below the plot region (xpd=TRUE allows drawing in the margin)
408 | points(jitter(rep(1,12),factor=2),log2(tcga_her2_normalized$ERBB2[1:12]+1),pch=4,cex=2,cex.lab=1.5)  # overlay individual values on box 1; x positions are jittered
409 | points(jitter(rep(2,5),factor=2),log2(tcga_her2_normalized$ERBB2[13:17]+1),pch=4,cex=2,cex.lab=1.5)
410 | points(jitter(rep(3,12),factor=2),rsub_fpkmlog_her2$ERBB2[1:12],pch=4,cex=2,cex.lab=1.5)
411 | points(jitter(rep(4,5),factor=2),rsub_fpkmlog_her2$ERBB2[13:17],pch=4,cex=2,cex.lab=1.5)
412 | points(jitter(rep(5,12),factor=2),rsub_tpmlog_her2$ERBB2[1:12],pch=4,cex=2,cex.lab=1.5)
413 | points(jitter(rep(6,5),factor=2),rsub_tpmlog_her2$ERBB2[13:17],pch=4,cex=2,cex.lab=1.5)
414 | ### t-tests comparing ERBB2 expression between elements 1-12 and 13-17 in each data set
415 | t.test(log2(tcga_her2_normalized$ERBB2[1:12]+1),log2(tcga_her2_normalized$ERBB2[13:17]+1))
416 | t.test(rsub_fpkmlog_her2$ERBB2[1:12],rsub_fpkmlog_her2$ERBB2[13:17])
417 | t.test(rsub_tpmlog_her2$ERBB2[1:12],rsub_tpmlog_her2$ERBB2[13:17])
418 | ###Standardized mean difference: TCGA pipeline normalized ERBB2 expression values (group 1 = elements 13-17, group 2 = elements 1-12)
419 | standardized_mean(m.1=mean((log2(tcga_her2_normalized$ERBB2[13:17]+1))),sd.1=sd((log2(tcga_her2_normalized$ERBB2[13:17]+1))),n.1=5,m.2=mean((log2(tcga_her2_normalized$ERBB2[1:12]+1))),sd.2=sd((log2(tcga_her2_normalized$ERBB2[1:12]+1))),n.2=12)
420 | ###Standardized mean difference: Rsubread pipeline FPKM normalized ERBB2 expression values
421 | standardized_mean(mean(rsub_fpkmlog_her2$ERBB2[13:17]),sd(rsub_fpkmlog_her2$ERBB2[13:17]),5,mean(rsub_fpkmlog_her2$ERBB2[1:12]),sd(rsub_fpkmlog_her2$ERBB2[1:12]),12)  # positional arguments; presumably in the m.1, sd.1, n.1, m.2, sd.2, n.2 order used by the named call above (confirm against standardized_mean's definition)
422 | ###Standardized mean difference: Rsubread pipeline TPM normalized ERBB2 expression values
423 | standardized_mean(mean(rsub_tpmlog_her2$ERBB2[13:17]),sd(rsub_tpmlog_her2$ERBB2[13:17]),5,mean(rsub_tpmlog_her2$ERBB2[1:12]),sd(rsub_tpmlog_her2$ERBB2[1:12]),12)  # FIX: the second group's SD now comes from the TPM data (was taken from rsub_fpkmlog_her2)
424 | colnames(pancan12_zero) <- "PANCAN12"
425 | colnames(pancan20_tpm_zero) <- "TPM"
426 | all_zeros <- merge_drop(pancan12_zero, pancan20_tpm_zero)
427 | # 3380 samples are common to both inputs
428 | par(lwd = 4, mfrow = c(1, 2))
429 | h1 <- hist(all_zeros$PANCAN12, breaks = 25, main = "", xlab = "", ylab = "", xlim = c(0, 8000), ylim = c(0, 800), lwd = 4)
430 | abline(v = median(all_zeros$PANCAN12), lty = 2, col = "red")  # dashed red line marks the median
431 | h2 <- hist(all_zeros$TPM, breaks = 25, main = "", xlab = "", ylab = "", xlim = c(0, 8000), ylim = c(0, 800), lwd = 4)
432 | abline(v = median(all_zeros$TPM), lty = 2, col = "red")
433 | t.test(all_zeros$PANCAN12, all_zeros$TPM)
434 | ############# Predicted HER2 pathway activity analysis #############
435 | all_preds <- merge_drop(rsub_preds, tcga_preds, by = 0)
436 | brca_clinical <- subset(clinicals, clinicals[, "tumor_tissue_site"] == "Breast", select = c("bcr_patient_barcode", "her2_status_by_ihc"))
437 | common_all <- merge_drop(all_preds, brca_clinical, by = 0)
438 | all_preds_pos_neg <- subset(common_all, common_all$her2_status_by_ihc %in% c("Negative", "Positive"))  # keep only rows with a definite IHC call
439 | all_ranked <- apply(all_preds_pos_neg[, 1:3], 2, rank)  # rank each of the first three columns independently
440 | all <- cbind(all_ranked, all_preds_pos_neg[, 4:5])  # re-attach columns 4-5 next to the ranks
441 | ihc_neg <- subset(all, all$her2_status_by_ihc == "Negative")
442 | ihc_pos <- subset(all, all$her2_status_by_ihc == "Positive")
443 | ############## Boxplot of ranked estimated HER2 pathway activity
444 | ## in TCGA BRCA samples, split by IHC status (positive box first, then negative, for each of the three prediction columns) ####
445 | par(mfrow = c(1, 1))
446 | par(mar=c(5, 4.6, 2.5, 0.6)) # figure margins
447 | boxplot(ihc_pos$Rsem_log_q_200_f,ihc_neg$Rsem_log_q_200_f,ihc_pos$FPKM_log_q_200_f,ihc_neg$FPKM_log_q_200_f,ihc_pos$TPM_log_q_200_f,ihc_neg$TPM_log_q_200_f,cex.axis=1.5, cex.lab=1.5,outpch=NA,range=0,cex.axis=1, cex.lab=0.7,outpch=NA,col='grey75',medcol="grey5",lwd=4,border = "grey5", main="Comparison of rank-based estimate \nof HER2 activation",ylab="Ranked HER2 prediction")  # NOTE(review): cex.axis, cex.lab and outpch are each supplied twice in this call; confirm which values are intended and drop the other set
448 | names=c("TCGA\nLevel3\nHER2(+)","TCGA\nLevel3\nHER2(-)","Rsubread\nFPKM\nHER2(+)", "Rsubread\nFPKM\nHER2(-)","Rsubread\nTPM\nHER2(+)", "Rsubread\nTPM\nHER2(-)")  # FIX: labels previously misspelled the tool name as "Rsubred"
449 | text(seq(1,6,by=1),par("usr")[3] - 4.5, labels = names, srt = 45, pos = 1, xpd = TRUE)  # rotated group labels placed below the plot region (xpd=TRUE allows drawing in the margin)
450 | ihc_neg_t <- subset(common_all, common_all$her2_status_by_ihc == "Negative")  # unranked predictions for rows with IHC status "Negative"
451 | ihc_pos_t <- subset(common_all, common_all$her2_status_by_ihc == "Positive")  # unranked predictions for rows with IHC status "Positive"
452 | ## Coefficient of variation (sd/mean) of the Rsem_log_q_200_f predictions
453 | print(paste("Coefficient of variation in TCGA pipeline processed HER2 predictions in HER2(-) BRCA samples", with(ihc_neg_t, sd(Rsem_log_q_200_f) / mean(Rsem_log_q_200_f))))
454 | print(paste("Coefficient of variation in TCGA pipeline processed HER2 predictions in HER2(+) BRCA samples", with(ihc_pos_t, sd(Rsem_log_q_200_f) / mean(Rsem_log_q_200_f))))
455 | ## Coefficient of variation (sd/mean) of the FPKM_log_q_200_f predictions
456 | print(paste("Coefficient of variation in Rsubread FPKM processed HER2 predictions in HER2(-) BRCA samples", with(ihc_neg_t, sd(FPKM_log_q_200_f) / mean(FPKM_log_q_200_f))))
457 | print(paste("Coefficient of variation in Rsubread FPKM processed HER2 predictions in HER2(+) BRCA samples", with(ihc_pos_t, sd(FPKM_log_q_200_f) / mean(FPKM_log_q_200_f))))
458 | ## Coefficient of variation (sd/mean) of the TPM_log_q_200_f predictions
459 | print(paste("Coefficient of variation in Rsubread TPM processed HER2 predictions in HER2(-) BRCA samples", with(ihc_neg_t, sd(TPM_log_q_200_f) / mean(TPM_log_q_200_f))))
460 | print(paste("Coefficient of variation in Rsubread TPM processed HER2 predictions in HER2(+) BRCA samples", with(ihc_pos_t, sd(TPM_log_q_200_f) / mean(TPM_log_q_200_f))))
461 | ## Standardized mean differences between the HER2(+) and HER2(-) groups (group 1 = positive, group 2 = negative); messages fixed: "predicrion" -> "prediction", and the third label now names the Rsubread TPM data
462 | print(paste("Standardized mean difference in prediction between HER2 (+) and HER2 (-) samples for TCGA Level 3 data :",standardized_mean(m.1=mean(ihc_pos_t$Rsem_log_q_200_f),sd.1=sd(ihc_pos_t$Rsem_log_q_200_f),n.1=length(ihc_pos_t$Rsem_log_q_200_f),m.2=mean(ihc_neg_t$Rsem_log_q_200_f),sd.2=sd(ihc_neg_t$Rsem_log_q_200_f),n.2=length(ihc_neg_t$Rsem_log_q_200_f)),sep=' '))
463 | print(paste("Standardized mean difference in prediction between HER2 (+) and HER2 (-) samples for Rsubread FPKM data :",standardized_mean(m.1=mean(ihc_pos_t$FPKM_log_q_200_f),sd.1=sd(ihc_pos_t$FPKM_log_q_200_f),n.1=length(ihc_pos_t$FPKM_log_q_200_f),m.2=mean(ihc_neg_t$FPKM_log_q_200_f),sd.2=sd(ihc_neg_t$FPKM_log_q_200_f),n.2=length(ihc_neg_t$FPKM_log_q_200_f)),sep=' '))
464 | print(paste("Standardized mean difference in prediction between HER2 (+) and HER2 (-) samples for Rsubread TPM data :",standardized_mean(m.1=mean(ihc_pos_t$TPM_log_q_200_f),sd.1=sd(ihc_pos_t$TPM_log_q_200_f),n.1=length(ihc_pos_t$TPM_log_q_200_f),m.2=mean(ihc_neg_t$TPM_log_q_200_f),sd.2=sd(ihc_neg_t$TPM_log_q_200_f),n.2=length(ihc_neg_t$TPM_log_q_200_f)),sep=' '))
465 | ## t-tests comparing HER2(+) and HER2(-) predictions; the p-values in the trailing comments were recorded by the original author
466 | t.test(ihc_pos_t$Rsem_log_q_200_f,ihc_neg_t$Rsem_log_q_200_f)# For TCGA Level 3: p-value = 2.009e-05
467 | t.test(ihc_pos_t$FPKM_log_q_200_f,ihc_neg_t$FPKM_log_q_200_f)#For Rsubread FPKM: p-value = 1.493e-10
468 | t.test(ihc_pos_t$TPM_log_q_200_f,ihc_neg_t$TPM_log_q_200_f)#For Rsubread TPM:p-value = 3.197e-12
469 | par(lwd = 4, mfrow = c(1, 1))
470 | actual12 <- data12$ActualClass
471 | predictions12 <- data12$LUAD_Probability
472 | auc <- plotROC(actual12, predictions12, TRUE)  # plotROC() is defined outside this section; its result is kept in auc
473 | title("TCGA Level 3 LUAD vs LUSC")
474 | actual20 <- data20$ActualClass
475 | predictions20 <- data20$LUAD_Probability
476 | auc <- plotROC(actual20, predictions20, TRUE)  # overwrites the auc value from the first call
477 | title("Rsubread TPM LUAD vs LUSC")
478 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of GFP_RSEM_log_10_9_filtered.txt and HER2_RSEM_log_10_9_filtered.txt. I generated predictions on TCGA_RSEM_log_BRCA_10_9_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I applied quantile normalization.
12 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 |
14 |
15 |
II. Results
16 |
17 |
18 |
19 | 200 Genes, 2 Metagenes
20 | |
21 |
22 |
23 |
24 |
25 |
26 |
27 | |
28 |
29 |
30 |
31 | |
32 |
33 |
34 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 5 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | |
37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
38 | probabilities.txt
39 | .
40 | |
41 |
42 |
43 |
44 |
45 | This analysis was run on Monday, 13 October 2014, 11:34 PM on adira.genetics.utah.edu. It took 12m 2s to complete.
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/model.txt:
--------------------------------------------------------------------------------
1 | Name Coefficient
2 | Intercept 4.524853
3 | ERBB2 0.164782
4 | HSPA7 -0.125612
5 | GDF6 -0.111343
6 | HSPA6 -0.097087
7 | CCL2 -0.093873
8 | CXCL10 -0.092074
9 | LOC338651 0.079326
10 | TNFSF14 -0.073710
11 | CD248 -0.059249
12 | IFIT1 -0.057644
13 | DNAJA4 -0.053322
14 | GNAO1 -0.050292
15 | CRHR1 0.048706
16 | EEF1A2 0.045896
17 | HSPA1B -0.045632
18 | CCL20 -0.044527
19 | TNFAIP2 -0.044330
20 | LOC91948 0.042751
21 | ATP6V0A4 0.038768
22 | CFB -0.037830
23 | CALB2 0.036782
24 | PADI1 0.035659
25 | PDGFB 0.034971
26 | LOC285629 -0.034876
27 | CRYAB -0.032468
28 | GABRA2 0.030593
29 | SOD2 -0.028653
30 | ULBP1 -0.028346
31 | KRT18 0.028246
32 | GPR1 -0.027639
33 | CXCL5 -0.027617
34 | EPHA3 -0.026868
35 | IL8 -0.025943
36 | EPHA4 -0.025735
37 | TLR3 -0.025646
38 | HSPB8 -0.025054
39 | RPSAP52 0.024980
40 | RGS2 -0.024874
41 | SLC2A12 -0.024861
42 | KRT19 0.024626
43 | TRANK1 -0.024277
44 | MGP 0.023918
45 | SAA1 -0.023534
46 | SHC4 0.022446
47 | KITLG -0.022152
48 | KRT8 0.022084
49 | CGNL1 -0.021984
50 | MYCL1 -0.021942
51 | ANGPTL4 0.021650
52 | PARP9 -0.021303
53 | DNAJB4 -0.021262
54 | SPON1 0.021236
55 | PIK3C2B -0.021143
56 | PARP14 -0.021042
57 | SERPINB1 0.020839
58 | CXCL2 -0.020713
59 | SERPINB13 -0.020613
60 | SNX9 0.020262
61 | TRIM22 -0.020121
62 | DNAJB1 -0.019926
63 | KANK4 -0.019885
64 | GBP6 -0.019667
65 | MLPH 0.019478
66 | APOL6 -0.019334
67 | OAS3 -0.019302
68 | HSP90AA1 -0.019165
69 | KRT81 0.019156
70 | GM2A -0.019126
71 | ENGASE -0.017973
72 | KRT75 0.017856
73 | CBLC 0.017765
74 | CCNA1 0.017623
75 | FERMT2 0.017321
76 | CEACAM1 0.017130
77 | SLC13A5 0.017066
78 | MTSS1L -0.017003
79 | TCF4 -0.016884
80 | PLAUR 0.016528
81 | GPR110 0.016330
82 | TP53AIP1 -0.016244
83 | APAF1 0.016161
84 | HSPH1 -0.016115
85 | RAB6B 0.016005
86 | LOXL4 0.015594
87 | OSBP2 0.015384
88 | HSPA8 -0.015298
89 | UNC5B -0.015048
90 | RASA3 0.014898
91 | KCNN4 0.014783
92 | ANPEP 0.014734
93 | AMACR -0.014480
94 | ZC3HAV1 -0.014280
95 | COBLL1 -0.014277
96 | ECT2 0.014259
97 | SMURF2 0.014218
98 | CBR1 -0.014049
99 | TUFT1 0.013455
100 | C1R -0.013313
101 | SESN2 -0.013303
102 | TWF2 0.013165
103 | INPP4B 0.013134
104 | SMO -0.013129
105 | ITGB3 0.013106
106 | CAST 0.013084
107 | FBXW7 -0.013061
108 | VASP 0.012979
109 | SASH1 -0.012828
110 | MT2A 0.012725
111 | NAV3 0.012684
112 | NET1 0.012572
113 | CGN 0.012481
114 | SYTL2 -0.012440
115 | CYBASC3 -0.012341
116 | ST3GAL4 0.012295
117 | TNS3 -0.012073
118 | BCAR3 0.011678
119 | SEC24D 0.011623
120 | DTX4 -0.011553
121 | PYGB 0.011389
122 | MYO1E 0.011297
123 | PTPRE 0.011089
124 | GFPT1 0.011087
125 | ACTB 0.011033
126 | STIM2 -0.011012
127 | XPC -0.011008
128 | MFI2 0.010950
129 | NFATC3 -0.010879
130 | C19orf66 -0.010511
131 | PDZD2 -0.010452
132 | ARHGEF2 0.010354
133 | TRIOBP 0.010316
134 | SLC34A2 -0.010288
135 | FRMD4A -0.010219
136 | MAP3K2 -0.010081
137 | NPAS2 0.010074
138 | IGFL3 -0.009956
139 | ARHGAP12 0.009927
140 | SH2D3A 0.009911
141 | NAV2 -0.009866
142 | SMOC1 0.009764
143 | HERPUD1 0.009567
144 | WDR1 0.009562
145 | RASA1 0.009529
146 | MBD4 -0.009337
147 | PLEK2 0.009276
148 | BCAP29 0.009270
149 | ATG16L1 0.009237
150 | LDB1 -0.009222
151 | NCDN -0.009177
152 | NEK9 -0.009083
153 | CSGALNACT2 0.009018
154 | ATP1B1 -0.008895
155 | APBB2 -0.008881
156 | CAPN2 0.008880
157 | CALM2 0.008674
158 | TRAFD1 -0.008589
159 | PGM1 0.008555
160 | FGFR2 -0.008354
161 | DOPEY1 -0.008331
162 | NISCH -0.008191
163 | PI4KB -0.008141
164 | TOR3A -0.007819
165 | LRIG3 0.007766
166 | POLR2A -0.007749
167 | NEU1 -0.007665
168 | KPNA4 0.007656
169 | PIK3CD 0.007606
170 | ANKRD13A -0.007496
171 | TBRG1 -0.007462
172 | EPS15 0.007458
173 | TRIM5 -0.007361
174 | PCSK7 -0.007332
175 | ANKFY1 -0.007320
176 | C20orf194 0.007244
177 | C19orf42 -0.007162
178 | ITGA5 0.007095
179 | ARHGEF12 -0.006996
180 | STK40 -0.006932
181 | MLLT6 -0.006786
182 | C1orf85 -0.006767
183 | PTPN12 0.006480
184 | MAP2K4 -0.006351
185 | ZNF532 -0.006134
186 | AFAP1L2 0.006103
187 | ARID1B -0.005924
188 | SEC14L1 0.005811
189 | PLEKHA6 -0.005776
190 | ELOVL1 0.005764
191 | CLASP1 -0.005727
192 | SMEK1 -0.005478
193 | NUMA1 -0.005168
194 | ZMYND8 0.005151
195 | PDXK -0.005071
196 | MYO10 0.004929
197 | UBP1 -0.004780
198 | RCC2 0.004742
199 | SGK1 0.004731
200 | RFWD3 -0.004666
201 | C20orf3 -0.004354
202 | WDR91 -0.004333
203 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/parameters.txt:
--------------------------------------------------------------------------------
1 | NAME VALUE
2 | Binreg Version 2
3 | Genes 200
4 | Metagenes 2
5 | Strip AFFX control 0
6 | Log Train0 0
7 | Log Train1 0
8 | Log Test 0
9 | Quantile Normalize 1
10 | Shift-Scale Normalize 0
11 | DWD Normalize 0
12 | DWD Normalize (Bild) 0
13 | Burn In 1000
14 | Samples 5000
15 | Skips 1
16 | Credible Interval 95
17 | Cross Validate 1
18 | Make Plots 1
19 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/RSEM_q_log_200_f/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of Feature_GFP_TPMlog_10_6.txt and Feature_HER2_TPMlog_10_6.txt. I generated predictions on TCGA_PANCAN20_Rsubread_BRCA_TPMlog_10_9_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I applied quantile normalization.
12 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 |
14 |
15 |
II. Results
16 |
17 |
18 |
19 | 200 Genes, 2 Metagenes
20 | |
21 |
22 |
23 |
24 |
25 |
26 |
27 | |
28 |
29 |
30 |
31 | |
32 |
33 |
34 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 5 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | |
37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
38 | probabilities.txt
39 | .
40 | |
41 |
42 |
43 |
44 |
45 | This analysis was run on Monday, 13 October 2014, 11:47 PM on adira.genetics.utah.edu. It took 11m 41s to complete.
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/model.txt:
--------------------------------------------------------------------------------
1 | Name Coefficient
2 | Intercept -0.504928
3 | ERBB2 0.305527
4 | HSPA6 -0.158780
5 | HSPA7 -0.151412
6 | CCL2 -0.106984
7 | DNAJA4 -0.093340
8 | TNFAIP2 -0.075825
9 | HSPA1A -0.073306
10 | EEF1A2 0.071440
11 | PDGFB 0.067870
12 | EPGN -0.067303
13 | HSPA1B -0.066745
14 | ATP6V0A4 0.062446
15 | CFB -0.060075
16 | CALB2 0.058290
17 | CRYAB -0.054796
18 | SAA2 -0.050794
19 | PNMA2 0.050400
20 | KRT80 0.050203
21 | TNFRSF11B 0.048283
22 | UCA1 0.046302
23 | CXCL5 -0.045923
24 | ANGPTL7 -0.044990
25 | KPRP 0.044522
26 | SOD2 -0.044234
27 | SYTL5 0.043949
28 | KRT19 0.043441
29 | AKAP12 0.043351
30 | SRMS 0.042485
31 | PADI1 0.042177
32 | GPR1 -0.041418
33 | RGS2 -0.041195
34 | MYADM 0.040819
35 | SHC4 0.040550
36 | BST2 -0.039644
37 | EPHA3 -0.039500
38 | KLK6 0.038871
39 | KRT18 0.038599
40 | SAA1 -0.038474
41 | SPON1 0.038178
42 | HSP90AA1 -0.038082
43 | TSPAN18 0.037454
44 | EPHA4 -0.037243
45 | ANGPTL4 0.036491
46 | PAQR7 -0.036256
47 | ULBP1 -0.035505
48 | HSPH1 -0.035296
49 | PGM2L1 0.035069
50 | CRHR1 0.034918
51 | SERPINB13 -0.034840
52 | PIK3C2B -0.034825
53 | PTK6 0.034722
54 | CXCR1 0.034384
55 | FAM198B -0.034254
56 | GRAMD2 -0.034033
57 | DDAH1 0.033964
58 | GPRC5A 0.033659
59 | DAPK1 -0.033620
60 | SLC1A1 0.033565
61 | VWA1 0.033251
62 | DNAJA1 -0.032433
63 | SNX9 0.032379
64 | KITLG -0.032252
65 | HSPB8 -0.032155
66 | GBP6 -0.031284
67 | C10orf10 0.030517
68 | CCNA1 0.030310
69 | GM2A -0.030108
70 | C8orf84 0.029972
71 | ALDH1A3 0.029680
72 | TRIM22 -0.029548
73 | SREK1IP1 0.029351
74 | KRT8 0.029074
75 | NOTCH1 -0.028721
76 | DNAJB4 -0.028676
77 | FERMT2 0.027438
78 | EMP1 0.027141
79 | MAFF 0.026901
80 | TCF4 -0.026670
81 | DNAJB1 -0.026460
82 | PARP14 -0.026319
83 | PLAUR 0.026168
84 | LOC644961 0.026082
85 | KHDRBS3 0.025650
86 | PLAU 0.025228
87 | KANK4 -0.025090
88 | ESR1 -0.024670
89 | APOL6 -0.024617
90 | KCNN4 0.024463
91 | IGFL3 -0.024452
92 | MTSS1L -0.024210
93 | RAPH1 0.024168
94 | IFIT5 -0.024094
95 | DUSP10 0.024043
96 | PMP22 0.023801
97 | VASP 0.023373
98 | ARRDC4 -0.023118
99 | SMO -0.023104
100 | FAM176A 0.022803
101 | CBR1 -0.022764
102 | WWTR1 0.022599
103 | PGF 0.022576
104 | STX2 0.022286
105 | ZPLD1 0.022175
106 | KMO -0.022123
107 | FAM214B 0.021843
108 | TUFT1 0.021717
109 | TNS3 -0.021558
110 | MAP6 0.021499
111 | ST3GAL4 0.021422
112 | HMGB3 0.021401
113 | HS6ST1 -0.021304
114 | DLC1 -0.021275
115 | POU2F1 0.021216
116 | APAF1 0.021057
117 | STOX2 -0.020845
118 | RASA3 0.020767
119 | HERC3 0.020487
120 | DFNB31 -0.020337
121 | FBXO22 -0.020150
122 | BRMS1 -0.020097
123 | IER3 0.020017
124 | NET1 0.019989
125 | CYBASC3 -0.019984
126 | PYGB 0.019830
127 | XPC -0.019811
128 | BCAR3 0.019647
129 | ZXDB 0.019586
130 | CELF2 0.019402
131 | IGF2BP3 0.019325
132 | TIMP1 -0.019048
133 | ARHGAP12 0.019010
134 | NME7 0.018951
135 | ARV1 -0.018928
136 | CASP1 -0.018873
137 | MR1 -0.018826
138 | KCNJ5 -0.018762
139 | LRRC8C 0.018716
140 | TWF2 0.018592
141 | PPP3CC 0.018547
142 | ANKRD33B -0.018542
143 | CAST 0.018294
144 | SH3KBP1 0.017947
145 | PODXL2 0.017847
146 | INPP4B 0.017676
147 | TNS4 0.017660
148 | DAB2 0.017551
149 | MFI2 0.017540
150 | RBMS2 0.017501
151 | FGFR2 -0.017469
152 | GFPT1 0.017427
153 | TP53AIP1 -0.017304
154 | NAV3 0.017121
155 | ARHGEF2 0.017063
156 | SESN1 -0.016845
157 | DNAJB9 0.016278
158 | NFE2L1 -0.016229
159 | TRIOBP 0.016197
160 | KIAA1671 -0.016057
161 | ZNFX1 -0.015835
162 | CROT -0.015664
163 | SLC20A2 0.015334
164 | B2M -0.015314
165 | UBB -0.015001
166 | FBXW2 -0.014918
167 | LDB1 -0.014863
168 | SEC24D 0.014746
169 | MICALCL 0.014702
170 | MYO1E 0.014521
171 | RASSF1 0.014486
172 | TOR3A -0.014460
173 | PIK3R1 -0.014459
174 | TRAFD1 -0.014282
175 | ANKRD13A -0.014195
176 | SLC41A1 -0.014065
177 | MEF2D 0.013983
178 | PI4KB -0.013683
179 | LRRFIP1 0.013638
180 | PRRC1 0.013535
181 | FRMD4A -0.012667
182 | PNMAL1 -0.012235
183 | LPP 0.011861
184 | CAPN2 0.011646
185 | ADAR -0.011625
186 | PRDM4 -0.011432
187 | APBB2 -0.011350
188 | SEC14L1 0.011315
189 | UBP1 -0.010824
190 | ASAP2 0.010731
191 | PRPSAP2 -0.010671
192 | PPP2R5B 0.010646
193 | NFATC3 -0.010535
194 | AFAP1 0.010482
195 | DCAF7 0.010296
196 | MYL12A 0.009901
197 | ARHGEF12 -0.009895
198 | STAT3 -0.009518
199 | ANKRD27 0.008986
200 | IFFO2 0.008553
201 | GTF2I -0.008151
202 | CYB561 0.007650
203 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/parameters.txt:
--------------------------------------------------------------------------------
1 | NAME VALUE
2 | Binreg Version 2
3 | Genes 200
4 | Metagenes 2
5 | Strip AFFX control 0
6 | Log Train0 0
7 | Log Train1 0
8 | Log Test 0
9 | Quantile Normalize 1
10 | Shift-Scale Normalize 0
11 | DWD Normalize 0
12 | DWD Normalize (Bild) 0
13 | Burn In 1000
14 | Samples 5000
15 | Skips 1
16 | Credible Interval 95
17 | Cross Validate 1
18 | Make Plots 1
19 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/TPM_q_log_200_f/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of Feature_GFP_FPKMlog_10_6.txt and Feature_HER2_FPKMlog_10_6.txt. I generated predictions on TCGA_PANCAN20_Rsubread_BRCA_RPKMlog_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I logged the expression values in Feature_GFP_FPKMlog_10_6.txt and Feature_HER2_FPKMlog_10_6.txt.
12 |
- I applied quantile normalization.
13 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
14 |
15 |
16 |
II. Results
17 |
18 |
19 |
20 | 200 Genes, 2 Metagenes
21 | |
22 |
23 |
24 |
25 |
26 |
27 |
28 | |
29 |
30 |
31 |
32 | |
33 |
34 |
35 |
36 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 5 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
37 | |
38 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
39 | probabilities.txt
40 | .
41 | |
42 |
43 |
44 |
45 |
46 | This analysis was run on Monday, 13 October 2014, 11:21 PM on adira.genetics.utah.edu. It took 12m 57s to complete.
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/model.txt:
--------------------------------------------------------------------------------
1 | Name Coefficient
2 | Intercept 0.168851
3 | ERBB2 0.257577
4 | HSPA7 -0.187866
5 | HSPA6 -0.136333
6 | GDF6 0.098740
7 | DNAJA4 -0.080598
8 | KPRP 0.074612
9 | EEF1A2 0.069003
10 | TNFAIP2 -0.067720
11 | PDGFB 0.066514
12 | TSPAN18 0.066512
13 | HSPA1A -0.062749
14 | ATP6V0A4 0.058443
15 | CFB -0.058034
16 | HSPA1B -0.057605
17 | EPGN -0.057545
18 | CALB2 0.054193
19 | PNMA2 0.048449
20 | SAA2 -0.047311
21 | CRYAB -0.046179
22 | KRT80 0.045195
23 | SRMS 0.043627
24 | GPR1 -0.043320
25 | UCA1 0.041757
26 | TNFRSF11B 0.041583
27 | FAM83A 0.040141
28 | EPHA3 -0.039923
29 | CXCL5 -0.039762
30 | RGS2 -0.039724
31 | DDAH1 0.039198
32 | ULBP1 -0.038466
33 | AKAP12 0.038418
34 | SOD2 -0.037183
35 | KRT19 0.036641
36 | TLR3 -0.035985
37 | SHC4 0.035642
38 | PPP1R3C -0.035295
39 | PTK6 0.034658
40 | SPON1 0.034473
41 | MYADM 0.034361
42 | BST2 -0.034136
43 | GRAMD2 -0.034067
44 | SAA1 -0.033523
45 | HSP90AA1 -0.032999
46 | KRT18 0.032801
47 | EPHA4 -0.032767
48 | PIK3C2B -0.032631
49 | KLK6 0.032407
50 | CXCR1 0.031954
51 | PGM2L1 0.031133
52 | ANGPTL4 0.031075
53 | PAQR7 -0.031038
54 | DAPK1 -0.030705
55 | FAM198B -0.030230
56 | SERPINB13 -0.030208
57 | GBP6 -0.030003
58 | VWA1 0.029805
59 | SLC1A1 0.029764
60 | HSPH1 -0.029464
61 | KITLG -0.028275
62 | GPRC5A 0.027836
63 | HSPB8 -0.027616
64 | SNX9 0.027574
65 | DNAJA1 -0.026591
66 | C10orf10 0.026544
67 | SREK1IP1 0.026213
68 | GM2A -0.026028
69 | C8orf84 0.025904
70 | CCNA1 0.025808
71 | TRIM22 -0.025731
72 | APOL6 -0.025483
73 | KRT8 0.025158
74 | DNAJB4 -0.025018
75 | TCF4 -0.024505
76 | NOTCH1 -0.024433
77 | ALDH1A3 0.024322
78 | MAFF 0.023981
79 | PARP14 -0.023917
80 | FERMT2 0.023615
81 | IL7R -0.023182
82 | LOC644961 0.023169
83 | KHDRBS3 0.022993
84 | EMP1 0.022449
85 | KMO -0.022438
86 | PLAUR 0.022023
87 | DNAJB1 -0.022019
88 | IFIT5 -0.021954
89 | RAPH1 0.021690
90 | KANK4 -0.021458
91 | DUSP10 0.020861
92 | SMO -0.020834
93 | DFNB31 -0.020759
94 | MTSS1L -0.020665
95 | PLAU 0.020509
96 | KCNN4 0.020505
97 | PMP22 0.020330
98 | STX2 0.020322
99 | VASP 0.020230
100 | IGFL3 -0.020208
101 | POU2F1 0.020096
102 | WWTR1 0.019760
103 | FAM176A 0.019732
104 | PGF 0.019637
105 | ARRDC4 -0.019625
106 | TNS3 -0.019394
107 | CBR1 -0.019365
108 | RASA3 0.019126
109 | APAF1 0.018740
110 | HERC3 0.018697
111 | HMGB3 0.018691
112 | ZXDB 0.018650
113 | ST3GAL4 0.018588
114 | HS6ST1 -0.018541
115 | IGF2BP3 0.018523
116 | TUFT1 0.018493
117 | FAM214B 0.018467
118 | NET1 0.017866
119 | XPC -0.017726
120 | FBXO22 -0.017678
121 | MR1 -0.017472
122 | CYBASC3 -0.017218
123 | KCNJ5 -0.017167
124 | IER3 0.017056
125 | NME7 0.016958
126 | PYGB 0.016808
127 | NAV3 0.016742
128 | BRMS1 -0.016648
129 | ARV1 -0.016434
130 | BCAR3 0.016403
131 | ARHGAP12 0.016383
132 | PPP3CC 0.016377
133 | PODXL2 0.016365
134 | PDZD2 -0.016253
135 | TWF2 0.016132
136 | RBMS2 0.016093
137 | CASP1 -0.015992
138 | TIMP1 -0.015829
139 | LRRC8C 0.015828
140 | SH3KBP1 0.015714
141 | CAST 0.015525
142 | TP53AIP1 -0.015300
143 | DAB2 0.015248
144 | FGFR2 -0.015210
145 | INPP4B 0.015146
146 | HMGN3 -0.015120
147 | SESN1 -0.014994
148 | TRIOBP 0.014970
149 | GFPT1 0.014771
150 | ARHGEF2 0.014671
151 | TNS4 0.014658
152 | MFI2 0.014631
153 | CROT -0.014554
154 | KIAA1671 -0.013946
155 | ZNFX1 -0.013815
156 | DNAJB9 0.013602
157 | NFE2L1 -0.013277
158 | PIK3R1 -0.013264
159 | FBXW2 -0.013023
160 | RASSF1 0.012832
161 | MICALCL 0.012790
162 | SLC20A2 0.012767
163 | LDB1 -0.012706
164 | IGFBP4 -0.012603
165 | SEC24D 0.012592
166 | B2M -0.012511
167 | CCDC50 0.012451
168 | SLC41A1 -0.012315
169 | TOR3A -0.012280
170 | HERPUD1 0.012254
171 | TRAFD1 -0.012195
172 | MYO1E 0.012108
173 | MEF2D 0.012092
174 | FRMD4A -0.011928
175 | LRRFIP1 0.011781
176 | ANKRD13A -0.011763
177 | PI4KB -0.011583
178 | PRRC1 0.011518
179 | UBB -0.011513
180 | FAM129B 0.011441
181 | PNMAL1 -0.010498
182 | LPP 0.010416
183 | APBB2 -0.010189
184 | PRDM4 -0.010085
185 | ADAR -0.010018
186 | SEC14L1 0.009938
187 | CAPN2 0.009793
188 | ASAP2 0.009678
189 | PPP2R5B 0.009550
190 | NFATC3 -0.009429
191 | PRPSAP2 -0.009416
192 | DCAF7 0.009216
193 | MEX3C 0.009174
194 | AFAP1 0.009148
195 | UBP1 -0.008794
196 | ARHGEF12 -0.008606
197 | SDC1 0.008466
198 | ADCY9 -0.008152
199 | STAT3 -0.008103
200 | ANKRD27 0.007958
201 | IFFO2 0.007081
202 | GTF2I -0.006848
203 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/parameters.txt:
--------------------------------------------------------------------------------
1 | NAME VALUE
2 | Binreg Version 2
3 | Genes 200
4 | Metagenes 2
5 | Strip AFFX control 0
6 | Log Train0 1
7 | Log Train1 1
8 | Log Test 0
9 | Quantile Normalize 1
10 | Shift-Scale Normalize 0
11 | DWD Normalize 0
12 | DWD Normalize (Bild) 0
13 | Burn In 1000
14 | Samples 5000
15 | Skips 1
16 | Credible Interval 95
17 | Cross Validate 1
18 | Make Plots 1
19 |
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/10_14_predictions_raw/fpkm_q_log_200_f/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of Feature_GFP_FPKMlog_10_6.txt and Rsub_HER2_FPKMlog.txt. I generated predictions on TCGA_PANCAN20_Rsubread_BRCA_RPKMlog_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I applied quantile normalization.
12 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 |
14 |
15 |
II. Results
16 |
17 |
18 |
19 | 200 Genes, 2 Metagenes
20 | |
21 |
22 |
23 |
24 |
25 |
26 |
27 | |
28 |
29 |
30 |
31 | |
32 |
33 |
34 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | |
37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
38 | probabilities.txt
39 | .
40 | |
41 |
42 |
43 |
44 |
45 | This analysis was run on Saturday, 02 May 2015, 02:43 PM on adira.genetics.utah.edu. It took 23m 34s to complete.
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/fpkmlog_no/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of GFP_RSEM_log_10_9_filtered.txt and HER2_RSEM_3_log.txt. I generated predictions on TCGA_RSEM_log_BRCA_10_9_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I applied quantile normalization.
12 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 |
14 |
15 |
II. Results
16 |
17 |
18 |
19 | 200 Genes, 2 Metagenes
20 | |
21 |
22 |
23 |
24 |
25 |
26 |
27 | |
28 |
29 |
30 |
31 | |
32 |
33 |
34 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | |
37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
38 | probabilities.txt
39 | .
40 | |
41 |
42 |
43 |
44 |
45 | This analysis was run on Friday, 01 May 2015, 11:17 AM on adira.genetics.utah.edu. It took 13m 38s to complete.
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem_no/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of GFP_RSEM_log_10_9_filtered.txt and HER2_RSEM_3_log.txt. I generated predictions on TCGA_RSEM_log_BRCA_10_9_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I applied quantile normalization.
12 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 |
14 |
15 |
II. Results
16 |
17 |
18 |
19 | 200 Genes, 2 Metagenes
20 | |
21 |
22 |
23 |
24 |
25 |
26 |
27 | |
28 |
29 |
30 |
31 | |
32 |
33 |
34 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | |
37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
38 | probabilities.txt
39 | .
40 | |
41 |
42 |
43 |
44 |
45 | This analysis was run on Saturday, 02 May 2015, 04:04 PM on adira.genetics.utah.edu. It took 17m 54s to complete.
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem_no/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem_no/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/rsem_no/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/rsem_no/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/tpmlog_no/REPORT.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | CreateSignatures Report
4 |
5 |
6 | CreateSignatures Report
7 | I. Analysis
8 |
9 | - I ran a signature analysis using a training set of Feature_GFP_TPMlog_10_6.txt and Rsub_HER2_TPMlog.txt. I generated predictions on TCGA_PANCAN20_Rsubread_BRCA_TPMlog_10_9_filtered.txt.
10 |
- I used the BinReg 2 algorithm with 200 genes and 2 metagenes.
11 |
- I applied quantile normalization.
12 |
- For the statistical (Markov chain Monte Carlo) simulation, I discarded 1,000 samples for the burn-in and then collected 5,000 samples for the model.
13 |
14 |
15 |
II. Results
16 |
17 |
18 |
19 | 200 Genes, 2 Metagenes
20 | |
21 |
22 |
23 |
24 |
25 |
26 |
27 | |
28 |
29 |
30 |
31 | |
32 |
33 |
34 |
35 | Figure 1: Signature Heatmap. In this heatmap, each row represents a gene in the signature. The first 12 columns are the samples from the train 0 data set, and the remaining 3 columns are the samples from the train 1 data set. Warm colors indicate high expression of the gene, and cool colors indicate low expression.
36 | |
37 | Figure 2: Predictions. This scatter plot shows the predictions from the signature for each sample. On the Y-axis, high probabilities indicate that the gene expression profile of the sample better resembles the train 1 class, while low probabilities indicate a closer resemblance to train 0. The blue and red circles are the predictions (from leave-one-out cross-validation) on the train 0 and train 1 samples, respectively. The black squares are the predictions on the test samples. The error bars show the 95% credible interval. The X-axis, the Metagene Score, is the magnitude of the sample on the first principal component. This is used only to separate the samples on the plot, and we do not further interpret these values. The raw values from this plot are available as a tab-delimited text file:
38 | probabilities.txt
39 | .
40 | |
41 |
42 |
43 |
44 |
45 | This analysis was run on Saturday, 02 May 2015, 03:35 PM on adira.genetics.utah.edu. It took 21m 33s to complete.
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/tpmlog_no/predictions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/tpmlog_no/predictions.png
--------------------------------------------------------------------------------
/Analysis_datasets/5_01_predictions_raw/tpmlog_no/signature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/srp33/TCGA_RNASeq_Clinical/05c95679476addb9b145e0c4044d422fea6bb407/Analysis_datasets/5_01_predictions_raw/tpmlog_no/signature.png
--------------------------------------------------------------------------------
/Analysis_datasets/Classification_12_LUAD_LUSC_Predictions.txt:
--------------------------------------------------------------------------------
1 | SampleID ActualClass PredictedClass LUAD_Probability LUSC_Probability
2 | TCGA-05-4244-01A-01R-1107-07 LUAD LUAD 0.948 0.052
3 | TCGA-05-4249-01A-01R-1107-07 LUAD LUAD 0.946 0.054
4 | TCGA-05-4250-01A-01R-1107-07 LUAD LUAD 0.924 0.076
5 | TCGA-05-4382-01A-01R-1206-07 LUAD LUAD 0.94 0.06
6 | TCGA-05-4384-01A-01R-1755-07 LUAD LUAD 0.994 0.006
7 | TCGA-05-4389-01A-01R-1206-07 LUAD LUAD 0.86 0.14
8 | TCGA-05-4390-01A-02R-1755-07 LUAD LUAD 0.918 0.082
9 | TCGA-05-4395-01A-01R-1206-07 LUAD LUAD 0.776 0.224
10 | TCGA-05-4396-01A-21R-1858-07 LUAD LUAD 0.94 0.06
11 | TCGA-05-4397-01A-01R-1206-07 LUAD LUAD 0.632 0.368
12 | TCGA-05-4398-01A-01R-1206-07 LUAD LUAD 0.914 0.086
13 | TCGA-05-4402-01A-01R-1206-07 LUAD LUAD 0.968 0.032
14 | TCGA-05-4403-01A-01R-1206-07 LUAD LUAD 0.934 0.066
15 | TCGA-05-4405-01A-21R-1858-07 LUAD LUAD 0.968 0.032
16 | TCGA-05-4410-01A-21R-1858-07 LUAD LUAD 0.958 0.042
17 | TCGA-05-4415-01A-22R-1858-07 LUAD LUAD 0.714 0.286
18 | TCGA-05-4417-01A-22R-1858-07 LUAD LUAD 0.976 0.024
19 | TCGA-05-4418-01A-01R-1206-07 LUAD LUAD 0.85 0.15
20 | TCGA-05-4420-01A-01R-1206-07 LUAD LUAD 0.858 0.142
21 | TCGA-05-4422-01A-01R-1206-07 LUAD LUAD 0.938 0.062
22 | TCGA-05-4424-01A-22R-1858-07 LUAD LUAD 0.9 0.1
23 | TCGA-05-4425-01A-01R-1755-07 LUAD LUAD 0.952 0.048
24 | TCGA-05-4426-01A-01R-1206-07 LUAD LUAD 0.94 0.06
25 | TCGA-05-4427-01A-21R-1858-07 LUAD LUAD 0.914 0.086
26 | TCGA-05-4430-01A-02R-1206-07 LUAD LUAD 0.938 0.062
27 | TCGA-05-4432-01A-01R-1206-07 LUAD LUAD 0.932 0.068
28 | TCGA-05-4433-01A-22R-1858-07 LUAD LUAD 0.976 0.024
29 | TCGA-05-4434-01A-01R-1206-07 LUAD LUAD 0.776 0.224
30 | TCGA-05-5420-01A-01R-1628-07 LUAD LUAD 0.78 0.22
31 | TCGA-05-5423-01A-01R-1628-07 LUAD LUAD 0.952 0.048
32 | TCGA-05-5425-01A-02R-1628-07 LUAD LUAD 0.928 0.072
33 | TCGA-05-5428-01A-01R-1628-07 LUAD LUAD 0.834 0.166
34 | TCGA-05-5429-01A-01R-1628-07 LUAD LUAD 0.854 0.146
35 | TCGA-05-5715-01A-01R-1628-07 LUAD LUAD 0.96 0.04
36 | TCGA-35-3615-01A-01R-0946-07 LUAD LUAD 0.984 0.016
37 | TCGA-35-4122-01A-01R-1107-07 LUAD LUAD 0.748 0.252
38 | TCGA-35-4123-01A-01R-1107-07 LUAD LUAD 0.818 0.182
39 | TCGA-35-5375-01A-01R-1628-07 LUAD LUSC 0.44 0.56
40 | TCGA-38-4625-01A-01R-1206-07 LUAD LUAD 0.766 0.234
41 | TCGA-38-4626-01A-01R-1206-07 LUAD LUAD 0.964 0.036
42 | TCGA-38-4627-01A-01R-1206-07 LUAD LUAD 0.89 0.11
43 | TCGA-38-4628-01A-01R-1206-07 LUAD LUAD 0.896 0.104
44 | TCGA-38-4629-01A-02R-1206-07 LUAD LUAD 0.878 0.122
45 | TCGA-38-4630-01A-01R-1206-07 LUAD LUAD 0.52 0.48
46 | TCGA-38-4631-01A-01R-1755-07 LUAD LUAD 0.744 0.256
47 | TCGA-38-4632-01A-01R-1755-07 LUAD LUAD 0.888 0.112
48 | TCGA-38-6178-01A-11R-1755-07 LUAD LUAD 0.948 0.052
49 | TCGA-38-7271-01A-11R-2039-07 LUAD LUAD 0.944 0.056
50 | TCGA-44-2655-01A-01R-0946-07 LUAD LUAD 0.992 0.008
51 | TCGA-44-2656-01A-02R-0946-07 LUAD LUAD 0.988 0.012
52 | TCGA-44-2657-01A-01R-1107-07 LUAD LUAD 0.97 0.03
53 | TCGA-44-2659-01A-01R-0946-07 LUAD LUAD 0.976 0.024
54 | TCGA-44-2661-01A-01R-1107-07 LUAD LUAD 0.972 0.028
55 | TCGA-44-2662-01A-01R-0946-07 LUAD LUAD 0.926 0.074
56 | TCGA-44-2665-01A-01R-0946-07 LUAD LUAD 0.894 0.106
57 | TCGA-44-2666-01A-01R-0946-07 LUAD LUAD 0.978 0.022
58 | TCGA-44-2668-01A-01R-0946-07 LUAD LUAD 0.83 0.17
59 | TCGA-44-3396-01A-01R-1206-07 LUAD LUAD 0.946 0.054
60 | TCGA-44-3398-01A-01R-1107-07 LUAD LUAD 0.822 0.178
61 | TCGA-44-3918-01A-01R-1107-07 LUAD LUAD 0.934 0.066
62 | TCGA-44-3919-01A-02R-1107-07 LUAD LUAD 0.978 0.022
63 | TCGA-44-4112-01A-01R-1107-07 LUAD LUAD 0.944 0.056
64 | TCGA-44-5643-01A-01R-1628-07 LUAD LUSC 0.04 0.96
65 | TCGA-44-5644-01A-21R-2039-07 LUAD LUAD 0.838 0.162
66 | TCGA-44-5645-01A-01R-1628-07 LUAD LUAD 0.96 0.04
67 | TCGA-44-6145-01A-11R-1755-07 LUAD LUAD 0.982 0.018
68 | TCGA-44-6146-01A-11R-1755-07 LUAD LUAD 0.93 0.07
69 | TCGA-44-6147-01A-11R-1755-07 LUAD LUAD 0.946 0.054
70 | TCGA-44-6148-01A-11R-1755-07 LUAD LUAD 0.864 0.136
71 | TCGA-44-6774-01A-21R-1858-07 LUAD LUAD 0.898 0.102
72 | TCGA-44-6775-01A-11R-1858-07 LUAD LUAD 0.972 0.028
73 | TCGA-44-6776-01A-11R-1858-07 LUAD LUAD 0.976 0.024
74 | TCGA-44-6777-01A-11R-1858-07 LUAD LUAD 0.946 0.054
75 | TCGA-44-6778-01A-11R-1858-07 LUAD LUAD 0.878 0.122
76 | TCGA-44-6779-01A-11R-1858-07 LUAD LUAD 0.87 0.13
77 | TCGA-44-7659-01A-11R-2066-07 LUAD LUAD 0.98 0.02
78 | TCGA-44-7660-01A-11R-2066-07 LUAD LUSC 0.46 0.54
79 | TCGA-44-7661-01A-11R-2066-07 LUAD LUAD 0.906 0.094
80 | TCGA-44-7662-01A-11R-2066-07 LUAD LUAD 0.938 0.062
81 | TCGA-44-7667-01A-31R-2066-07 LUAD LUAD 0.568 0.432
82 | TCGA-44-7669-01A-21R-2066-07 LUAD LUAD 0.816 0.184
83 | TCGA-44-7670-01A-11R-2066-07 LUAD LUAD 0.744 0.256
84 | TCGA-44-7671-01A-11R-2066-07 LUAD LUAD 0.976 0.024
85 | TCGA-44-7672-01A-11R-2066-07 LUAD LUAD 0.988 0.012
86 | TCGA-44-8117-01A-11R-2241-07 LUAD LUAD 0.918 0.082
87 | TCGA-44-8119-01A-11R-2241-07 LUAD LUAD 0.878 0.122
88 | TCGA-44-8120-01A-11R-2241-07 LUAD LUAD 0.976 0.024
89 | TCGA-49-4486-01A-01R-1206-07 LUAD LUAD 0.938 0.062
90 | TCGA-49-4487-01A-21R-1858-07 LUAD LUAD 0.904 0.096
91 | TCGA-49-4488-01A-01R-1755-07 LUAD LUAD 0.918 0.082
92 | TCGA-49-4490-01A-21R-1858-07 LUAD LUAD 0.892 0.108
93 | TCGA-49-4494-01A-01R-1206-07 LUAD LUAD 0.89 0.11
94 | TCGA-49-4501-01A-01R-1206-07 LUAD LUAD 0.976 0.024
95 | TCGA-49-4505-01A-01R-1206-07 LUAD LUAD 0.978 0.022
96 | TCGA-49-4506-01A-01R-1206-07 LUAD LUAD 0.698 0.302
97 | TCGA-49-4507-01A-01R-1206-07 LUAD LUAD 0.798 0.202
98 | TCGA-49-4510-01A-01R-1206-07 LUAD LUAD 0.966 0.034
99 | TCGA-49-4512-01A-21R-1858-07 LUAD LUAD 0.958 0.042
100 | TCGA-49-4514-01A-21R-1858-07 LUAD LUAD 0.824 0.176
101 | TCGA-49-6742-01A-11R-1858-07 LUAD LUAD 0.954 0.046
102 | TCGA-49-6743-01A-11R-1858-07 LUAD LUAD 0.868 0.132
103 | TCGA-49-6744-01A-11R-1858-07 LUAD LUAD 0.998 0.002
104 | TCGA-49-6745-01A-11R-1858-07 LUAD LUAD 0.956 0.044
105 | TCGA-49-6761-01A-31R-1949-07 LUAD LUAD 0.84 0.16
106 | TCGA-49-6767-01A-11R-1858-07 LUAD LUAD 0.808 0.192
107 | TCGA-50-5044-01A-21R-1858-07 LUAD LUAD 0.768 0.232
108 | TCGA-50-5049-01A-01R-1628-07 LUAD LUAD 0.922 0.078
109 | TCGA-50-5051-01A-21R-1858-07 LUAD LUAD 0.912 0.088
110 | TCGA-50-5055-01A-01R-1628-07 LUAD LUAD 0.912 0.088
111 | TCGA-50-5066-01A-01R-1628-07 LUAD LUAD 0.762 0.238
112 | TCGA-50-5066-02A-11R-2090-07 LUAD LUAD 0.938 0.062
113 | TCGA-50-5068-01A-01R-1628-07 LUAD LUAD 0.856 0.144
114 | TCGA-50-5072-01A-21R-1858-07 LUAD LUAD 0.848 0.152
115 | TCGA-50-5931-01A-11R-1755-07 LUAD LUSC 0.226 0.774
116 | TCGA-50-5932-01A-11R-1755-07 LUAD LUAD 0.954 0.046
117 | TCGA-50-5933-01A-11R-1755-07 LUAD LUAD 0.89 0.11
118 | TCGA-50-5935-01A-11R-1755-07 LUAD LUAD 0.984 0.016
119 | TCGA-50-5936-01A-11R-1628-07 LUAD LUAD 0.95 0.05
120 | TCGA-50-5939-01A-11R-1628-07 LUAD LUAD 0.918 0.082
121 | TCGA-50-5941-01A-11R-1755-07 LUAD LUAD 0.962 0.038
122 | TCGA-50-5942-01A-21R-1755-07 LUAD LUAD 0.974 0.026
123 | TCGA-50-5944-01A-11R-1755-07 LUAD LUAD 0.99 0.01
124 | TCGA-50-5946-01A-11R-1755-07 LUAD LUAD 0.854 0.146
125 | TCGA-50-5946-02A-11R-2090-07 LUAD LUAD 0.904 0.096
126 | TCGA-50-6590-01A-12R-1858-07 LUAD LUAD 0.682 0.318
127 | TCGA-50-6591-01A-11R-1755-07 LUAD LUAD 0.632 0.368
128 | TCGA-50-6592-01A-11R-1755-07 LUAD LUAD 0.792 0.208
129 | TCGA-50-6593-01A-11R-1755-07 LUAD LUAD 0.97 0.03
130 | TCGA-50-6594-01A-11R-1755-07 LUAD LUAD 0.832 0.168
131 | TCGA-50-6595-01A-12R-1858-07 LUAD LUAD 0.74 0.26
132 | TCGA-50-6597-01A-11R-1858-07 LUAD LUAD 0.87 0.13
133 | TCGA-50-6673-01A-11R-1949-07 LUAD LUAD 0.934 0.066
134 | TCGA-50-7109-01A-11R-2039-07 LUAD LUAD 0.95 0.05
135 | TCGA-53-7624-01A-11R-2066-07 LUAD LUAD 0.576 0.424
136 | TCGA-53-7626-01A-12R-2066-07 LUAD LUAD 0.986 0.014
137 | TCGA-53-7813-01A-11R-2170-07 LUAD LUAD 0.952 0.048
138 | TCGA-55-1592-01A-01R-0946-07 LUAD LUAD 0.96 0.04
139 | TCGA-55-1594-01A-01R-0946-07 LUAD LUAD 0.79 0.21
140 | TCGA-55-1595-01A-01R-0946-07 LUAD LUAD 0.968 0.032
141 | TCGA-55-1596-01A-01R-0946-07 LUAD LUAD 0.844 0.156
142 | TCGA-55-5899-01A-11R-1628-07 LUAD LUAD 0.73 0.27
143 | TCGA-55-6543-01A-11R-1755-07 LUAD LUAD 0.98 0.02
144 | TCGA-55-6642-01A-11R-1858-07 LUAD LUAD 0.962 0.038
145 | TCGA-55-6712-01A-11R-1858-07 LUAD LUAD 0.944 0.056
146 | TCGA-55-6968-01A-11R-1949-07 LUAD LUAD 0.544 0.456
147 | TCGA-55-6969-01A-11R-1949-07 LUAD LUAD 0.84 0.16
148 | TCGA-55-6970-01A-11R-1949-07 LUAD LUAD 0.97 0.03
149 | TCGA-55-6971-01A-11R-1949-07 LUAD LUAD 0.964 0.036
150 | TCGA-55-6972-01A-11R-1949-07 LUAD LUAD 0.924 0.076
151 | TCGA-55-6975-01A-11R-1949-07 LUAD LUAD 0.902 0.098
152 | TCGA-55-6978-01A-11R-1949-07 LUAD LUAD 0.91 0.09
153 | TCGA-55-6979-01A-11R-1949-07 LUAD LUAD 0.946 0.054
154 | TCGA-55-6980-01A-11R-1949-07 LUAD LUAD 0.968 0.032
155 | TCGA-55-6981-01A-11R-1949-07 LUAD LUAD 0.95 0.05
156 | TCGA-55-6982-01A-11R-1949-07 LUAD LUAD 0.97 0.03
157 | TCGA-55-6983-01A-11R-1949-07 LUAD LUAD 0.988 0.012
158 | TCGA-55-6984-01A-11R-1949-07 LUAD LUAD 0.888 0.112
159 | TCGA-55-6985-01A-11R-1949-07 LUAD LUAD 0.964 0.036
160 | TCGA-55-6986-01A-11R-1949-07 LUAD LUAD 0.978 0.022
161 | TCGA-55-6987-01A-11R-1949-07 LUAD LUAD 0.95 0.05
162 | TCGA-55-7227-01A-11R-2039-07 LUAD LUAD 0.992 0.008
163 | TCGA-55-7281-01A-11R-2039-07 LUAD LUAD 0.954 0.046
164 | TCGA-55-7283-01A-11R-2039-07 LUAD LUAD 0.99 0.01
165 | TCGA-55-7284-01B-11R-2241-07 LUAD LUAD 0.974 0.026
166 | TCGA-55-7570-01A-11R-2039-07 LUAD LUAD 0.58 0.42
167 | TCGA-55-7573-01A-11R-2039-07 LUAD LUAD 0.982 0.018
168 | TCGA-55-7574-01A-11R-2039-07 LUAD LUAD 0.98 0.02
169 | TCGA-55-7576-01A-11R-2066-07 LUAD LUAD 0.97 0.03
170 | TCGA-55-7724-01A-11R-2170-07 LUAD LUSC 0.228 0.772
171 | TCGA-55-7725-01A-11R-2170-07 LUAD LUAD 0.936 0.064
172 | TCGA-55-7726-01A-11R-2170-07 LUAD LUSC 0.368 0.632
173 | TCGA-55-7727-01A-11R-2170-07 LUAD LUAD 0.944 0.056
174 | TCGA-55-7728-01A-11R-2187-07 LUAD LUAD 0.886 0.114
175 | TCGA-55-7815-01A-11R-2170-07 LUAD LUAD 0.848 0.152
176 | TCGA-55-7903-01A-11R-2170-07 LUAD LUAD 0.966 0.034
177 | TCGA-55-7907-01A-11R-2170-07 LUAD LUAD 0.956 0.044
178 | TCGA-55-7910-01A-11R-2170-07 LUAD LUAD 0.916 0.084
179 | TCGA-55-7911-01A-11R-2170-07 LUAD LUAD 0.966 0.034
180 | TCGA-55-7913-01B-11R-2241-07 LUAD LUAD 0.904 0.096
181 | TCGA-55-7914-01A-11R-2170-07 LUAD LUAD 0.956 0.044
182 | TCGA-55-7994-01A-11R-2187-07 LUAD LUAD 0.906 0.094
183 | TCGA-55-7995-01A-11R-2187-07 LUAD LUAD 0.756 0.244
184 | TCGA-55-8085-01A-11R-2241-07 LUAD LUAD 0.97 0.03
185 | TCGA-55-8087-01A-11R-2241-07 LUAD LUAD 0.976 0.024
186 | TCGA-55-8089-01A-11R-2241-07 LUAD LUAD 0.902 0.098
187 | TCGA-55-8090-01A-11R-2241-07 LUAD LUAD 0.96 0.04
188 | TCGA-55-8091-01A-11R-2241-07 LUAD LUAD 0.948 0.052
189 | TCGA-55-8092-01A-11R-2241-07 LUAD LUAD 0.924 0.076
190 | TCGA-55-8094-01A-11R-2241-07 LUAD LUAD 0.842 0.158
191 | TCGA-55-8096-01A-11R-2241-07 LUAD LUAD 0.948 0.052
192 | TCGA-55-8097-01A-11R-2241-07 LUAD LUAD 0.986 0.014
193 | TCGA-55-8203-01A-11R-2241-07 LUAD LUAD 0.992 0.008
194 | TCGA-55-8204-01A-11R-2241-07 LUAD LUSC 0.184 0.816
195 | TCGA-55-8205-01A-11R-2241-07 LUAD LUAD 0.922 0.078
196 | TCGA-55-8206-01A-11R-2241-07 LUAD LUAD 0.97 0.03
197 | TCGA-55-8207-01A-11R-2241-07 LUAD LUAD 0.976 0.024
198 | TCGA-55-8208-01A-11R-2241-07 LUAD LUAD 0.954 0.046
199 | TCGA-55-8299-01A-11R-2287-07 LUAD LUAD 0.952 0.048
200 | TCGA-55-8301-01A-11R-2287-07 LUAD LUAD 0.956 0.044
201 | TCGA-64-1676-01A-01R-0946-07 LUAD LUAD 0.836 0.164
202 | TCGA-64-1677-01A-01R-0946-07 LUAD LUAD 0.89 0.11
203 | TCGA-64-1678-01A-01R-0946-07 LUAD LUAD 0.7 0.3
204 | TCGA-64-1679-01A-21R-2066-07 LUAD LUAD 0.904 0.096
205 | TCGA-64-1680-01A-02R-0946-07 LUAD LUAD 0.966 0.034
206 | TCGA-64-1681-01A-11R-2066-07 LUAD LUAD 0.968 0.032
207 | TCGA-64-5774-01A-01R-1628-07 LUAD LUAD 0.938 0.062
208 | TCGA-64-5775-01A-01R-1628-07 LUAD LUAD 0.688 0.312
209 | TCGA-64-5778-01A-01R-1628-07 LUAD LUAD 0.936 0.064
210 | TCGA-64-5779-01A-01R-1628-07 LUAD LUAD 0.908 0.092
211 | TCGA-64-5781-01A-01R-1628-07 LUAD LUAD 0.808 0.192
212 | TCGA-64-5815-01A-01R-1628-07 LUAD LUAD 0.872 0.128
213 | TCGA-67-3770-01A-01R-0946-07 LUAD LUAD 0.94 0.06
214 | TCGA-67-3771-01A-01R-0946-07 LUAD LUAD 0.81 0.19
215 | TCGA-67-3772-01A-01R-0946-07 LUAD LUAD 0.954 0.046
216 | TCGA-67-3773-01A-01R-0946-07 LUAD LUAD 0.94 0.06
217 | TCGA-67-3774-01A-01R-0946-07 LUAD LUAD 0.978 0.022
218 | TCGA-67-4679-01B-01R-1755-07 LUAD LUAD 0.988 0.012
219 | TCGA-67-6215-01A-11R-1755-07 LUAD LUAD 0.974 0.026
220 | TCGA-67-6216-01A-11R-1755-07 LUAD LUAD 0.98 0.02
221 | TCGA-67-6217-01A-11R-1755-07 LUAD LUAD 0.982 0.018
222 | TCGA-69-7760-01A-11R-2170-07 LUAD LUAD 0.904 0.096
223 | TCGA-69-7761-01A-11R-2170-07 LUAD LUAD 0.938 0.062
224 | TCGA-69-7763-01A-11R-2170-07 LUAD LUAD 0.96 0.04
225 | TCGA-69-7764-01A-11R-2170-07 LUAD LUAD 0.974 0.026
226 | TCGA-69-7765-01A-11R-2170-07 LUAD LUAD 0.98 0.02
227 | TCGA-69-7973-01A-11R-2187-07 LUAD LUAD 0.948 0.052
228 | TCGA-69-7974-01A-11R-2187-07 LUAD LUAD 0.982 0.018
229 | TCGA-69-7978-01A-11R-2187-07 LUAD LUAD 0.958 0.042
230 | TCGA-69-7979-01A-11R-2187-07 LUAD LUAD 0.772 0.228
231 | TCGA-69-7980-01A-11R-2187-07 LUAD LUAD 0.956 0.044
232 | TCGA-69-8253-01A-11R-2287-07 LUAD LUAD 0.958 0.042
233 | TCGA-69-8254-01A-11R-2287-07 LUAD LUAD 0.982 0.018
234 | TCGA-69-8255-01A-11R-2287-07 LUAD LUAD 0.7 0.3
235 | TCGA-71-6725-01A-11R-1858-07 LUAD LUAD 0.934 0.066
236 | TCGA-73-4658-01A-01R-1755-07 LUAD LUAD 0.968 0.032
237 | TCGA-73-4659-01A-01R-1206-07 LUAD LUAD 0.976 0.024
238 | TCGA-73-4662-01A-01R-1206-07 LUAD LUAD 0.99 0.01
239 | TCGA-73-4666-01A-01R-1206-07 LUAD LUAD 0.852 0.148
240 | TCGA-73-4668-01A-01R-1206-07 LUAD LUAD 0.94 0.06
241 | TCGA-73-4670-01A-01R-1206-07 LUAD LUAD 0.81 0.19
242 | TCGA-73-4675-01A-01R-1206-07 LUAD LUAD 0.898 0.102
243 | TCGA-73-4676-01A-01R-1755-07 LUAD LUAD 0.886 0.114
244 | TCGA-73-4677-01A-01R-1206-07 LUAD LUAD 0.982 0.018
245 | TCGA-73-7498-01A-12R-2187-07 LUAD LUAD 0.996 0.004
246 | TCGA-73-7499-01A-11R-2187-07 LUAD LUAD 0.872 0.128
247 | TCGA-75-5122-01A-01R-1755-07 LUAD LUAD 0.88 0.12
248 | TCGA-75-5125-01A-01R-1755-07 LUAD LUAD 0.896 0.104
249 | TCGA-75-5126-01A-01R-1755-07 LUAD LUAD 0.93 0.07
250 | TCGA-75-5146-01A-01R-1628-07 LUAD LUAD 0.944 0.056
251 | TCGA-75-5147-01A-01R-1628-07 LUAD LUAD 0.87 0.13
252 | TCGA-75-6203-01A-11R-1755-07 LUAD LUAD 0.952 0.048
253 | TCGA-75-6205-01A-11R-1755-07 LUAD LUAD 0.892 0.108
254 | TCGA-75-6206-01A-11R-1755-07 LUAD LUAD 0.96 0.04
255 | TCGA-75-6207-01A-11R-1755-07 LUAD LUAD 0.884 0.116
256 | TCGA-75-6211-01A-11R-1755-07 LUAD LUAD 0.838 0.162
257 | TCGA-75-6212-01A-11R-1755-07 LUAD LUAD 0.968 0.032
258 | TCGA-75-6214-01A-41R-1949-07 LUAD LUSC 0.24 0.76
259 | TCGA-75-7025-01A-12R-1949-07 LUAD LUAD 0.952 0.048
260 | TCGA-75-7027-01A-11R-1949-07 LUAD LUAD 0.9 0.1
261 | TCGA-75-7030-01A-11R-1949-07 LUAD LUAD 0.906 0.094
262 | TCGA-75-7031-01A-11R-1949-07 LUAD LUAD 0.912 0.088
263 | TCGA-78-7143-01A-11R-2039-07 LUAD LUAD 0.99 0.01
264 | TCGA-78-7145-01A-11R-2039-07 LUAD LUAD 0.972 0.028
265 | TCGA-78-7146-01A-11R-2039-07 LUAD LUAD 0.8 0.2
266 | TCGA-78-7147-01A-11R-2039-07 LUAD LUAD 0.94 0.06
267 | TCGA-78-7148-01A-11R-2039-07 LUAD LUAD 0.994 0.006
268 | TCGA-78-7149-01A-11R-2039-07 LUAD LUAD 0.974 0.026
269 | TCGA-78-7150-01A-21R-2039-07 LUAD LUAD 0.824 0.176
270 | TCGA-78-7152-01A-11R-2039-07 LUAD LUAD 0.95 0.05
271 | TCGA-78-7153-01A-11R-2039-07 LUAD LUAD 0.956 0.044
272 | TCGA-78-7154-01A-11R-2039-07 LUAD LUAD 0.788 0.212
273 | TCGA-78-7155-01A-11R-2039-07 LUAD LUAD 0.594 0.406
274 | TCGA-78-7156-01A-11R-2039-07 LUAD LUAD 0.96 0.04
275 | TCGA-78-7158-01A-11R-2039-07 LUAD LUAD 0.966 0.034
276 | TCGA-78-7159-01A-11R-2039-07 LUAD LUAD 0.978 0.022
277 | TCGA-78-7160-01A-11R-2039-07 LUAD LUAD 0.978 0.022
278 | TCGA-78-7161-01A-11R-2039-07 LUAD LUAD 0.978 0.022
279 | TCGA-78-7162-01A-21R-2066-07 LUAD LUAD 0.974 0.026
280 | TCGA-78-7163-01A-12R-2066-07 LUAD LUAD 0.908 0.092
281 | TCGA-78-7166-01A-12R-2066-07 LUAD LUAD 0.946 0.054
282 | TCGA-78-7167-01A-11R-2066-07 LUAD LUAD 0.974 0.026
283 | TCGA-78-7220-01A-11R-2039-07 LUAD LUAD 0.826 0.174
284 | TCGA-78-7535-01A-11R-2066-07 LUAD LUAD 0.87 0.13
285 | TCGA-78-7536-01A-11R-2066-07 LUAD LUAD 0.83 0.17
286 | TCGA-78-7537-01A-11R-2066-07 LUAD LUAD 0.982 0.018
287 | TCGA-78-7539-01A-11R-2066-07 LUAD LUAD 0.974 0.026
288 | TCGA-78-7540-01A-11R-2066-07 LUAD LUAD 0.954 0.046
289 | TCGA-78-7542-01A-21R-2066-07 LUAD LUAD 0.626 0.374
290 | TCGA-78-7633-01A-11R-2066-07 LUAD LUAD 0.976 0.024
291 | TCGA-80-5607-01A-31R-1949-07 LUAD LUAD 0.968 0.032
292 | TCGA-80-5608-01A-31R-1949-07 LUAD LUAD 0.974 0.026
293 | TCGA-80-5611-01A-01R-1628-07 LUAD LUAD 0.872 0.128
294 | TCGA-83-5908-01A-21R-2287-07 LUAD LUAD 0.856 0.144
295 | TCGA-86-6562-01A-11R-1755-07 LUAD LUAD 0.988 0.012
296 | TCGA-86-6851-01A-11R-1949-07 LUAD LUAD 0.924 0.076
297 | TCGA-86-7701-01A-11R-2170-07 LUAD LUAD 0.932 0.068
298 | TCGA-86-7711-01A-11R-2066-07 LUAD LUAD 0.726 0.274
299 | TCGA-86-7713-01A-11R-2066-07 LUAD LUAD 0.91 0.09
300 | TCGA-86-7714-01A-12R-2170-07 LUAD LUAD 0.97 0.03
301 | TCGA-86-7953-01A-11R-2187-07 LUAD LUAD 0.9 0.1
302 | TCGA-86-7954-01A-11R-2187-07 LUAD LUAD 0.954 0.046
303 | TCGA-86-7955-01A-11R-2187-07 LUAD LUAD 0.878 0.122
304 | TCGA-86-8054-01A-11R-2241-07 LUAD LUAD 0.846 0.154
305 | TCGA-86-8055-01A-11R-2241-07 LUAD LUAD 0.956 0.044
306 | TCGA-86-8056-01A-11R-2241-07 LUAD LUAD 0.97 0.03
307 | TCGA-86-8073-01A-11R-2241-07 LUAD LUAD 0.912 0.088
308 | TCGA-86-8074-01A-11R-2241-07 LUAD LUAD 0.984 0.016
309 | TCGA-86-8075-01A-11R-2241-07 LUAD LUAD 0.956 0.044
310 | TCGA-86-8076-01A-31R-2241-07 LUAD LUAD 0.988 0.012
311 | TCGA-86-8279-01A-11R-2287-07 LUAD LUAD 0.944 0.056
312 | TCGA-86-8280-01A-11R-2287-07 LUAD LUAD 0.986 0.014
313 | TCGA-86-8281-01A-11R-2287-07 LUAD LUAD 0.99 0.01
314 | TCGA-91-6828-01A-11R-1858-07 LUAD LUAD 0.97 0.03
315 | TCGA-91-6829-01A-21R-1858-07 LUAD LUAD 0.928 0.072
316 | TCGA-91-6830-01A-11R-1949-07 LUAD LUAD 0.916 0.084
317 | TCGA-91-6831-01A-11R-1858-07 LUAD LUAD 0.894 0.106
318 | TCGA-91-6835-01A-11R-1858-07 LUAD LUAD 0.936 0.064
319 | TCGA-91-6836-01A-21R-1858-07 LUAD LUAD 0.832 0.168
320 | TCGA-91-6840-01A-11R-1949-07 LUAD LUAD 0.798 0.202
321 | TCGA-91-6847-01A-11R-1949-07 LUAD LUAD 0.736 0.264
322 | TCGA-91-6848-01A-11R-1949-07 LUAD LUAD 0.606 0.394
323 | TCGA-91-6849-01A-11R-1949-07 LUAD LUAD 0.974 0.026
324 | TCGA-91-7771-01A-11R-2170-07 LUAD LUAD 0.99 0.01
325 | TCGA-93-7347-01A-11R-2187-07 LUAD LUAD 0.99 0.01
326 | TCGA-93-7348-01A-21R-2039-07 LUAD LUAD 0.978 0.022
327 | TCGA-93-8067-01A-11R-2287-07 LUAD LUAD 0.91 0.09
328 | TCGA-95-7039-01A-11R-1949-07 LUAD LUAD 0.928 0.072
329 | TCGA-95-7043-01A-11R-1949-07 LUAD LUAD 0.786 0.214
330 | TCGA-95-7562-01A-11R-2241-07 LUAD LUAD 0.87 0.13
331 | TCGA-95-7567-01A-11R-2066-07 LUAD LUAD 0.966 0.034
332 | TCGA-95-7944-01A-11R-2187-07 LUAD LUAD 0.776 0.224
333 | TCGA-95-7947-01A-11R-2187-07 LUAD LUAD 0.834 0.166
334 | TCGA-95-7948-01A-11R-2187-07 LUAD LUAD 0.828 0.172
335 | TCGA-95-8039-01A-11R-2241-07 LUAD LUAD 0.998 0.002
336 | TCGA-97-7546-01A-11R-2039-07 LUAD LUAD 0.966 0.034
337 | TCGA-97-7547-01A-11R-2039-07 LUAD LUAD 0.958 0.042
338 | TCGA-97-7552-01A-11R-2039-07 LUAD LUAD 0.97 0.03
339 | TCGA-97-7553-01A-21R-2039-07 LUAD LUAD 0.978 0.022
340 | TCGA-97-7554-01A-11R-2039-07 LUAD LUAD 0.972 0.028
341 | TCGA-97-7937-01A-11R-2170-07 LUAD LUAD 0.98 0.02
342 | TCGA-97-7938-01A-11R-2170-07 LUAD LUAD 0.978 0.022
343 | TCGA-97-7941-01A-11R-2187-07 LUAD LUAD 0.968 0.032
344 | TCGA-97-8171-01A-11R-2287-07 LUAD LUAD 0.888 0.112
345 | TCGA-97-8172-01A-11R-2287-07 LUAD LUAD 0.984 0.016
346 | TCGA-97-8174-01A-11R-2287-07 LUAD LUAD 0.932 0.068
347 | TCGA-97-8175-01A-11R-2287-07 LUAD LUAD 0.956 0.044
348 | TCGA-97-8177-01A-11R-2287-07 LUAD LUAD 0.96 0.04
349 | TCGA-97-8179-01A-11R-2287-07 LUAD LUAD 0.97 0.03
350 | TCGA-99-7458-01A-11R-2039-07 LUAD LUAD 0.984 0.016
351 | TCGA-99-8025-01A-11R-2241-07 LUAD LUAD 0.958 0.042
352 | TCGA-99-8028-01A-11R-2241-07 LUAD LUAD 0.972 0.028
353 | TCGA-99-8032-01A-11R-2241-07 LUAD LUAD 0.958 0.042
354 | TCGA-99-8033-01A-11R-2241-07 LUAD LUAD 0.944 0.056
355 | TCGA-J2-8192-01A-11R-2241-07 LUAD LUAD 0.962 0.038
356 | TCGA-J2-8194-01A-11R-2241-07 LUAD LUAD 0.984 0.016
357 | TCGA-18-3406-01A-01R-0980-07 LUSC LUSC 0.102 0.898
358 | TCGA-18-3407-01A-01R-0980-07 LUSC LUSC 0.024 0.976
359 | TCGA-18-3408-01A-01R-0980-07 LUSC LUSC 0.22 0.78
360 | TCGA-18-3409-01A-01R-0980-07 LUSC LUSC 0.26 0.74
361 | TCGA-18-3410-01A-01R-0980-07 LUSC LUSC 0.18 0.82
362 | TCGA-18-3411-01A-01R-0980-07 LUSC LUSC 0.028 0.972
363 | TCGA-18-3412-01A-01R-0980-07 LUSC LUSC 0.024 0.976
364 | TCGA-18-3414-01A-01R-0980-07 LUSC LUSC 0.024 0.976
365 | TCGA-18-3415-01A-01R-0980-07 LUSC LUSC 0.02 0.98
366 | TCGA-18-3416-01A-01R-0980-07 LUSC LUSC 0.128 0.872
367 | TCGA-18-3417-01A-01R-1443-07 LUSC LUSC 0.156 0.844
368 | TCGA-18-3419-01A-01R-0980-07 LUSC LUSC 0.14 0.86
369 | TCGA-18-3421-01A-01R-0980-07 LUSC LUSC 0.086 0.914
370 | TCGA-18-4083-01A-01R-1100-07 LUSC LUSC 0.052 0.948
371 | TCGA-18-4086-01A-01R-1100-07 LUSC LUSC 0.004 0.996
372 | TCGA-18-4721-01A-01R-1443-07 LUSC LUSC 0.072 0.928
373 | TCGA-18-5592-01A-01R-1635-07 LUSC LUSC 0.048 0.952
374 | TCGA-18-5595-01A-01R-1635-07 LUSC LUSC 0.076 0.924
375 | TCGA-21-1070-01A-01R-0692-07 LUSC LUSC 0.118 0.882
376 | TCGA-21-1071-01A-01R-0692-07 LUSC LUSC 0.11 0.89
377 | TCGA-21-1072-01A-01R-0692-07 LUSC LUSC 0.042 0.958
378 | TCGA-21-1075-01A-01R-0692-07 LUSC LUSC 0.05 0.95
379 | TCGA-21-1076-01A-02R-0692-07 LUSC LUSC 0.138 0.862
380 | TCGA-21-1077-01A-01R-0692-07 LUSC LUSC 0.01 0.99
381 | TCGA-21-1078-01A-01R-0692-07 LUSC LUSC 0.444 0.556
382 | TCGA-21-1079-01A-01R-0692-07 LUSC LUSC 0.132 0.868
383 | TCGA-21-1080-01A-01R-0692-07 LUSC LUSC 0.056 0.944
384 | TCGA-21-1081-01A-01R-0692-07 LUSC LUSC 0.098 0.902
385 | TCGA-21-1082-01A-01R-0692-07 LUSC LUSC 0.062 0.938
386 | TCGA-21-1083-01A-01R-0692-07 LUSC LUSC 0.246 0.754
387 | TCGA-21-5782-01A-01R-1635-07 LUSC LUSC 0.128 0.872
388 | TCGA-21-5784-01A-01R-1635-07 LUSC LUSC 0.086 0.914
389 | TCGA-21-5786-01A-01R-1635-07 LUSC LUSC 0.04 0.96
390 | TCGA-21-5787-01A-01R-1635-07 LUSC LUAD 0.662 0.338
391 | TCGA-22-0940-01A-01R-0692-07 LUSC LUSC 0.16 0.84
392 | TCGA-22-0944-01A-01R-0692-07 LUSC LUSC 0.028 0.972
393 | TCGA-22-1002-01A-01R-0692-07 LUSC LUSC 0.204 0.796
394 | TCGA-22-1011-01A-01R-0692-07 LUSC LUSC 0.072 0.928
395 | TCGA-22-1012-01A-01R-0692-07 LUSC LUSC 0.036 0.964
396 | TCGA-22-1016-01A-01R-0692-07 LUSC LUSC 0.412 0.588
397 | TCGA-22-1017-01A-01R-0692-07 LUSC LUAD 0.874 0.126
398 | TCGA-22-4591-01A-01R-1201-07 LUSC LUSC 0.208 0.792
399 | TCGA-22-4593-01A-21R-1820-07 LUSC LUSC 0.068 0.932
400 | TCGA-22-4594-01A-01R-1201-07 LUSC LUAD 0.892 0.108
401 | TCGA-22-4595-01A-01R-1201-07 LUSC LUSC 0.058 0.942
402 | TCGA-22-4596-01A-01R-1201-07 LUSC LUAD 0.936 0.064
403 | TCGA-22-4599-01A-01R-1443-07 LUSC LUSC 0.068 0.932
404 | TCGA-22-4601-01A-01R-1443-07 LUSC LUSC 0.054 0.946
405 | TCGA-22-4604-01A-01R-1201-07 LUSC LUSC 0.018 0.982
406 | TCGA-22-4607-01A-01R-1201-07 LUSC LUSC 0.088 0.912
407 | TCGA-22-4613-01A-01R-1443-07 LUSC LUSC 0.022 0.978
408 | TCGA-22-5471-01A-01R-1635-07 LUSC LUSC 0.04 0.96
409 | TCGA-22-5472-01A-01R-1635-07 LUSC LUSC 0.09 0.91
410 | TCGA-22-5473-01A-01R-1635-07 LUSC LUSC 0.008 0.992
411 | TCGA-22-5474-01A-01R-1635-07 LUSC LUSC 0.048 0.952
412 | TCGA-22-5477-01A-01R-1635-07 LUSC LUSC 0.094 0.906
413 | TCGA-22-5478-01A-01R-1635-07 LUSC LUSC 0.278 0.722
414 | TCGA-22-5479-01A-31R-1949-07 LUSC LUSC 0.01 0.99
415 | TCGA-22-5480-01A-01R-1635-07 LUSC LUSC 0.196 0.804
416 | TCGA-22-5481-01A-31R-1949-07 LUSC LUSC 0.422 0.578
417 | TCGA-22-5482-01A-01R-1635-07 LUSC LUSC 0.014 0.986
418 | TCGA-22-5483-01A-01R-1820-07 LUSC LUSC 0.408 0.592
419 | TCGA-22-5485-01A-01R-1635-07 LUSC LUSC 0.04 0.96
420 | TCGA-22-5489-01A-01R-1635-07 LUSC LUSC 0.106 0.894
421 | TCGA-22-5491-01A-01R-1635-07 LUSC LUSC 0.024 0.976
422 | TCGA-22-5492-01A-01R-1635-07 LUSC LUSC 0.106 0.894
423 | TCGA-33-4532-01A-01R-1201-07 LUSC LUSC 0.028 0.972
424 | TCGA-33-4533-01A-01R-1201-07 LUSC LUSC 0.194 0.806
425 | TCGA-33-4538-01A-01R-1201-07 LUSC LUSC 0.014 0.986
426 | TCGA-33-4547-01A-01R-1201-07 LUSC LUSC 0.044 0.956
427 | TCGA-33-4566-01A-01R-1443-07 LUSC LUSC 0.486 0.514
428 | TCGA-33-4582-01A-01R-1443-07 LUSC LUSC 0.084 0.916
429 | TCGA-33-4583-01A-01R-1443-07 LUSC LUSC 0.04 0.96
430 | TCGA-33-4586-01A-01R-1443-07 LUSC LUSC 0.03 0.97
431 | TCGA-33-6737-01A-11R-1820-07 LUSC LUAD 0.754 0.246
432 | TCGA-33-6738-01A-11R-1949-07 LUSC LUSC 0.376 0.624
433 | TCGA-34-2596-01A-01R-0851-07 LUSC LUSC 0.05 0.95
434 | TCGA-34-2600-01A-01R-0851-07 LUSC LUSC 0.076 0.924
435 | TCGA-34-2608-01A-02R-0851-07 LUSC LUSC 0.046 0.954
436 | TCGA-34-5231-01A-21R-1820-07 LUSC LUSC 0.094 0.906
437 | TCGA-34-5232-01A-21R-1820-07 LUSC LUSC 0.024 0.976
438 | TCGA-34-5234-01A-01R-1635-07 LUSC LUSC 0.282 0.718
439 | TCGA-34-5236-01A-21R-1820-07 LUSC LUSC 0.016 0.984
440 | TCGA-34-5239-01A-21R-1820-07 LUSC LUSC 0.138 0.862
441 | TCGA-34-5240-01A-01R-1443-07 LUSC LUSC 0.064 0.936
442 | TCGA-34-5241-01A-01R-1443-07 LUSC LUSC 0.024 0.976
443 | TCGA-34-5927-01A-11R-1820-07 LUSC LUSC 0.184 0.816
444 | TCGA-34-5928-01A-11R-1820-07 LUSC LUSC 0.15 0.85
445 | TCGA-34-5929-01A-11R-1820-07 LUSC LUSC 0.086 0.914
446 | TCGA-34-7107-01A-11R-1949-07 LUSC LUSC 0.034 0.966
447 | TCGA-37-3783-01A-01R-1201-07 LUSC LUSC 0.1 0.9
448 | TCGA-37-3789-01A-01R-0980-07 LUSC LUSC 0.06 0.94
449 | TCGA-37-3792-01A-01R-0980-07 LUSC LUAD 0.504 0.496
450 | TCGA-37-4129-01A-01R-1100-07 LUSC LUAD 0.594 0.406
451 | TCGA-37-4130-01A-01R-1100-07 LUSC LUAD 0.598 0.402
452 | TCGA-37-4132-01A-01R-1100-07 LUSC LUSC 0.428 0.572
453 | TCGA-37-4133-01A-01R-1100-07 LUSC LUSC 0.172 0.828
454 | TCGA-37-4135-01A-01R-1100-07 LUSC LUSC 0.344 0.656
455 | TCGA-37-4141-01A-02R-1100-07 LUSC LUSC 0.434 0.566
456 | TCGA-37-5819-01A-01R-1635-07 LUSC LUAD 0.54 0.46
457 | TCGA-39-5011-01A-01R-1443-07 LUSC LUAD 0.838 0.162
458 | TCGA-39-5016-01A-01R-1443-07 LUSC LUSC 0.044 0.956
459 | TCGA-39-5019-01A-01R-1820-07 LUSC LUSC 0.004 0.996
460 | TCGA-39-5021-01A-01R-1443-07 LUSC LUSC 0.08 0.92
461 | TCGA-39-5024-01A-21R-1820-07 LUSC LUSC 0.094 0.906
462 | TCGA-39-5027-01A-21R-1820-07 LUSC LUSC 0.02 0.98
463 | TCGA-39-5028-01A-01R-1443-07 LUSC LUSC 0.166 0.834
464 | TCGA-39-5029-01A-01R-1443-07 LUSC LUSC 0.062 0.938
465 | TCGA-39-5030-01A-01R-1443-07 LUSC LUSC 0.082 0.918
466 | TCGA-39-5031-01A-01R-1443-07 LUSC LUSC 0.022 0.978
467 | TCGA-39-5034-01A-01R-1443-07 LUSC LUAD 0.802 0.198
468 | TCGA-39-5035-01A-01R-1443-07 LUSC LUSC 0.144 0.856
469 | TCGA-39-5036-01A-01R-1443-07 LUSC LUSC 0.048 0.952
470 | TCGA-39-5037-01A-01R-1443-07 LUSC LUSC 0.066 0.934
471 | TCGA-39-5039-01A-01R-1443-07 LUSC LUSC 0.288 0.712
472 | TCGA-43-2578-01A-01R-0851-07 LUSC LUSC 0.422 0.578
473 | TCGA-43-2581-01A-01R-0851-07 LUSC LUAD 0.838 0.162
474 | TCGA-43-3394-01A-01R-0980-07 LUSC LUSC 0.03 0.97
475 | TCGA-43-3920-01A-01R-0980-07 LUSC LUSC 0.094 0.906
476 | TCGA-43-5668-01A-01R-1635-07 LUSC LUAD 0.728 0.272
477 | TCGA-43-6143-01A-11R-1820-07 LUSC LUSC 0.192 0.808
478 | TCGA-43-6647-01A-11R-1820-07 LUSC LUSC 0.116 0.884
479 | TCGA-43-6770-01A-11R-1820-07 LUSC LUSC 0.016 0.984
480 | TCGA-43-6771-01A-11R-1820-07 LUSC LUSC 0.218 0.782
481 | TCGA-46-3765-01A-01R-0980-07 LUSC LUSC 0.01 0.99
482 | TCGA-46-3766-01A-01R-0980-07 LUSC LUSC 0.262 0.738
483 | TCGA-46-3767-01A-01R-0980-07 LUSC LUSC 0.052 0.948
484 | TCGA-46-3768-01A-01R-0980-07 LUSC LUSC 0.092 0.908
485 | TCGA-46-3769-01A-01R-0980-07 LUSC LUAD 0.572 0.428
486 | TCGA-46-6025-01A-11R-1820-07 LUSC LUSC 0.088 0.912
487 | TCGA-46-6026-01A-11R-1820-07 LUSC LUSC 0.144 0.856
488 | TCGA-51-4079-01A-01R-1100-07 LUSC LUSC 0.02 0.98
489 | TCGA-51-4080-01A-01R-1100-07 LUSC LUSC 0.07 0.93
490 | TCGA-51-4081-01A-01R-1100-07 LUSC LUSC 0.032 0.968
491 | TCGA-56-1622-01A-01R-0692-07 LUSC LUSC 0.168 0.832
492 | TCGA-56-5897-01A-11R-1635-07 LUSC LUSC 0.05 0.95
493 | TCGA-56-5898-01A-11R-1635-07 LUSC LUSC 0.046 0.954
494 | TCGA-56-6545-01A-11R-1820-07 LUSC LUSC 0.074 0.926
495 | TCGA-56-6546-01A-11R-1820-07 LUSC LUSC 0.466 0.534
496 | TCGA-60-2695-01A-01R-0851-07 LUSC LUSC 0.448 0.552
497 | TCGA-60-2696-01A-01R-0851-07 LUSC LUSC 0.156 0.844
498 | TCGA-60-2698-01A-01R-0851-07 LUSC LUSC 0.08 0.92
499 | TCGA-60-2706-01A-01R-0851-07 LUSC LUAD 0.624 0.376
500 | TCGA-60-2707-01A-01R-0851-07 LUSC LUSC 0.076 0.924
501 | TCGA-60-2708-01A-01R-0851-07 LUSC LUSC 0.044 0.956
502 | TCGA-60-2709-01A-21R-1820-07 LUSC LUSC 0.106 0.894
503 | TCGA-60-2710-01A-01R-0851-07 LUSC LUSC 0.05 0.95
504 | TCGA-60-2711-01A-01R-0851-07 LUSC LUSC 0.058 0.942
505 | TCGA-60-2712-01A-01R-0851-07 LUSC LUSC 0.096 0.904
506 | TCGA-60-2713-01A-01R-0851-07 LUSC LUSC 0.028 0.972
507 | TCGA-60-2714-01A-01R-0851-07 LUSC LUAD 0.838 0.162
508 | TCGA-60-2715-01A-01R-0851-07 LUSC LUSC 0.174 0.826
509 | TCGA-60-2716-01A-01R-0851-07 LUSC LUSC 0.206 0.794
510 | TCGA-60-2719-01A-01R-0851-07 LUSC LUSC 0.058 0.942
511 | TCGA-60-2720-01A-01R-0851-07 LUSC LUSC 0.16 0.84
512 | TCGA-60-2721-01A-01R-0851-07 LUSC LUSC 0.05 0.95
513 | TCGA-60-2722-01A-01R-0851-07 LUSC LUSC 0.032 0.968
514 | TCGA-60-2723-01A-01R-0851-07 LUSC LUSC 0.06 0.94
515 | TCGA-60-2724-01A-01R-0851-07 LUSC LUSC 0.056 0.944
516 | TCGA-60-2725-01A-01R-1201-07 LUSC LUSC 0.06 0.94
517 | TCGA-60-2726-01A-01R-0851-07 LUSC LUSC 0.166 0.834
518 | TCGA-63-5128-01A-01R-1443-07 LUSC LUSC 0.048 0.952
519 | TCGA-63-5131-01A-01R-1443-07 LUSC LUSC 0.078 0.922
520 | TCGA-63-6202-01A-11R-1820-07 LUSC LUAD 0.826 0.174
521 | TCGA-63-7020-01A-11R-1949-07 LUSC LUSC 0.13 0.87
522 | TCGA-63-7021-01A-11R-1949-07 LUSC LUSC 0.042 0.958
523 | TCGA-63-7022-01A-11R-1949-07 LUSC LUSC 0.09 0.91
524 | TCGA-63-7023-01A-11R-1949-07 LUSC LUSC 0.144 0.856
525 | TCGA-66-2727-01A-01R-0980-07 LUSC LUSC 0.034 0.966
526 | TCGA-66-2734-01A-01R-0980-07 LUSC LUSC 0.042 0.958
527 | TCGA-66-2737-01A-01R-0980-07 LUSC LUSC 0.008 0.992
528 | TCGA-66-2742-01A-01R-0980-07 LUSC LUSC 0.056 0.944
529 | TCGA-66-2744-01A-01R-0980-07 LUSC LUSC 0.426 0.574
530 | TCGA-66-2753-01A-01R-0980-07 LUSC LUSC 0.126 0.874
531 | TCGA-66-2754-01A-01R-0980-07 LUSC LUSC 0.5 0.5
532 | TCGA-66-2755-01A-01R-0851-07 LUSC LUSC 0.136 0.864
533 | TCGA-66-2756-01A-01R-0851-07 LUSC LUAD 0.578 0.422
534 | TCGA-66-2757-01A-01R-0851-07 LUSC LUSC 0.356 0.644
535 | TCGA-66-2758-01A-02R-0851-07 LUSC LUSC 0.07 0.93
536 | TCGA-66-2759-01A-01R-0851-07 LUSC LUSC 0.024 0.976
537 | TCGA-66-2763-01A-01R-0851-07 LUSC LUSC 0.04 0.96
538 | TCGA-66-2765-01A-01R-0851-07 LUSC LUSC 0.04 0.96
539 | TCGA-66-2766-01A-01R-0851-07 LUSC LUSC 0.036 0.964
540 | TCGA-66-2767-01A-01R-0851-07 LUSC LUSC 0.046 0.954
541 | TCGA-66-2768-01A-01R-0851-07 LUSC LUSC 0.04 0.96
542 | TCGA-66-2769-01A-02R-0851-07 LUSC LUSC 0.156 0.844
543 | TCGA-66-2770-01A-01R-0851-07 LUSC LUSC 0.038 0.962
544 | TCGA-66-2771-01A-01R-0980-07 LUSC LUSC 0.068 0.932
545 | TCGA-66-2773-01A-01R-1201-07 LUSC LUSC 0.09 0.91
546 | TCGA-66-2777-01A-01R-1201-07 LUSC LUSC 0.044 0.956
547 | TCGA-66-2778-01A-02R-0851-07 LUSC LUSC 0.102 0.898
548 | TCGA-66-2780-01A-01R-0851-07 LUSC LUSC 0.016 0.984
549 | TCGA-66-2781-01A-01R-0851-07 LUSC LUSC 0.024 0.976
550 | TCGA-66-2782-01A-01R-0851-07 LUSC LUSC 0.118 0.882
551 | TCGA-66-2783-01A-01R-1201-07 LUSC LUSC 0.038 0.962
552 | TCGA-66-2785-01A-01R-0851-07 LUSC LUSC 0.386 0.614
553 | TCGA-66-2786-01A-01R-0851-07 LUSC LUSC 0.166 0.834
554 | TCGA-66-2787-01A-01R-0980-07 LUSC LUSC 0.06 0.94
555 | TCGA-66-2788-01A-01R-0980-07 LUSC LUSC 0.018 0.982
556 | TCGA-66-2789-01A-01R-0980-07 LUSC LUSC 0.096 0.904
557 | TCGA-66-2790-01A-01R-0980-07 LUSC LUSC 0.11 0.89
558 | TCGA-66-2791-01A-01R-0980-07 LUSC LUSC 0.008 0.992
559 | TCGA-66-2792-01A-01R-0980-07 LUSC LUSC 0.018 0.982
560 | TCGA-66-2793-01A-01R-1201-07 LUSC LUSC 0.164 0.836
561 | TCGA-66-2794-01A-01R-1201-07 LUSC LUSC 0.036 0.964
562 | TCGA-66-2795-01A-02R-0980-07 LUSC LUSC 0.03 0.97
563 | TCGA-66-2800-01A-01R-1201-07 LUSC LUSC 0.04 0.96
564 | TCGA-70-6722-01A-11R-1820-07 LUSC LUSC 0.336 0.664
565 | TCGA-70-6723-01A-11R-1820-07 LUSC LUSC 0.11 0.89
566 | TCGA-77-6842-01A-11R-1949-07 LUSC LUSC 0.312 0.688
567 | TCGA-77-6843-01A-11R-1949-07 LUSC LUSC 0.104 0.896
568 | TCGA-77-6844-01A-11R-1949-07 LUSC LUSC 0.026 0.974
569 | TCGA-77-6845-01A-11R-1949-07 LUSC LUSC 0.04 0.96
570 | TCGA-79-5596-01A-31R-1949-07 LUSC LUSC 0.012 0.988
571 | TCGA-85-6175-01A-11R-1820-07 LUSC LUSC 0.264 0.736
572 | TCGA-85-6560-01A-11R-1820-07 LUSC LUSC 0.498 0.502
573 | TCGA-85-6561-01A-11R-1820-07 LUSC LUSC 0.022 0.978
574 | TCGA-85-6798-01A-11R-1949-07 LUSC LUSC 0.05 0.95
575 | TCGA-90-6837-01A-11R-1949-07 LUSC LUSC 0.088 0.912
576 | TCGA-94-7033-01A-11R-1949-07 LUSC LUSC 0.106 0.894
577 |
--------------------------------------------------------------------------------
/Analysis_datasets/Classification_20_LUAD_LUSC_Predictions.txt:
--------------------------------------------------------------------------------
1 | SampleID ActualClass PredictedClass LUAD_Probability LUSC_Probability
2 | TCGA-05-4244-01A-01R-1107-07 LUAD LUSC 0.44 0.56
3 | TCGA-05-4249-01A-01R-1107-07 LUAD LUAD 0.768 0.232
4 | TCGA-05-4250-01A-01R-1107-07 LUAD LUSC 0.368 0.632
5 | TCGA-05-4382-01A-01R-1206-07 LUAD LUAD 0.996 0.004
6 | TCGA-05-4384-01A-01R-1755-07 LUAD LUAD 1 0
7 | TCGA-05-4389-01A-01R-1206-07 LUAD LUAD 1 0
8 | TCGA-05-4390-01A-02R-1755-07 LUAD LUAD 0.996 0.004
9 | TCGA-05-4395-01A-01R-1206-07 LUAD LUAD 0.992 0.008
10 | TCGA-05-4396-01A-21R-1858-07 LUAD LUAD 0.986 0.014
11 | TCGA-05-4397-01A-01R-1206-07 LUAD LUAD 0.992 0.008
12 | TCGA-05-4398-01A-01R-1206-07 LUAD LUAD 0.992 0.008
13 | TCGA-05-4402-01A-01R-1206-07 LUAD LUAD 1 0
14 | TCGA-05-4403-01A-01R-1206-07 LUAD LUAD 0.994 0.006
15 | TCGA-05-4405-01A-21R-1858-07 LUAD LUAD 0.998 0.002
16 | TCGA-05-4410-01A-21R-1858-07 LUAD LUAD 1 0
17 | TCGA-05-4415-01A-22R-1858-07 LUAD LUAD 0.992 0.008
18 | TCGA-05-4417-01A-22R-1858-07 LUAD LUAD 1 0
19 | TCGA-05-4418-01A-01R-1206-07 LUAD LUAD 0.998 0.002
20 | TCGA-05-4420-01A-01R-1206-07 LUAD LUAD 0.998 0.002
21 | TCGA-05-4422-01A-01R-1206-07 LUAD LUAD 0.996 0.004
22 | TCGA-05-4424-01A-22R-1858-07 LUAD LUAD 1 0
23 | TCGA-05-4425-01A-01R-1755-07 LUAD LUAD 0.996 0.004
24 | TCGA-05-4426-01A-01R-1206-07 LUAD LUAD 0.998 0.002
25 | TCGA-05-4427-01A-21R-1858-07 LUAD LUAD 0.98 0.02
26 | TCGA-05-4430-01A-02R-1206-07 LUAD LUAD 0.996 0.004
27 | TCGA-05-4432-01A-01R-1206-07 LUAD LUAD 0.994 0.006
28 | TCGA-05-4433-01A-22R-1858-07 LUAD LUAD 0.996 0.004
29 | TCGA-05-4434-01A-01R-1206-07 LUAD LUAD 1 0
30 | TCGA-05-5420-01A-01R-1628-07 LUAD LUAD 0.986 0.014
31 | TCGA-05-5423-01A-01R-1628-07 LUAD LUAD 0.994 0.006
32 | TCGA-05-5425-01A-02R-1628-07 LUAD LUAD 1 0
33 | TCGA-05-5428-01A-01R-1628-07 LUAD LUAD 0.994 0.006
34 | TCGA-05-5429-01A-01R-1628-07 LUAD LUAD 0.984 0.016
35 | TCGA-05-5715-01A-01R-1628-07 LUAD LUAD 1 0
36 | TCGA-35-3615-01A-01R-0946-07 LUAD LUAD 1 0
37 | TCGA-35-4122-01A-01R-1107-07 LUAD LUSC 0.184 0.816
38 | TCGA-35-4123-01A-01R-1107-07 LUAD LUSC 0.25 0.75
39 | TCGA-35-5375-01A-01R-1628-07 LUAD LUAD 0.872 0.128
40 | TCGA-38-4625-01A-01R-1206-07 LUAD LUAD 0.998 0.002
41 | TCGA-38-4626-01A-01R-1206-07 LUAD LUAD 1 0
42 | TCGA-38-4627-01A-01R-1206-07 LUAD LUAD 0.996 0.004
43 | TCGA-38-4628-01A-01R-1206-07 LUAD LUAD 0.996 0.004
44 | TCGA-38-4629-01A-02R-1206-07 LUAD LUAD 0.99 0.01
45 | TCGA-38-4630-01A-01R-1206-07 LUAD LUAD 0.902 0.098
46 | TCGA-38-4631-01A-01R-1755-07 LUAD LUAD 0.962 0.038
47 | TCGA-38-4632-01A-01R-1755-07 LUAD LUAD 1 0
48 | TCGA-38-6178-01A-11R-1755-07 LUAD LUAD 0.994 0.006
49 | TCGA-38-7271-01A-11R-2039-07 LUAD LUAD 0.998 0.002
50 | TCGA-44-2655-01A-01R-0946-07 LUAD LUAD 1 0
51 | TCGA-44-2656-01A-02R-0946-07 LUAD LUAD 1 0
52 | TCGA-44-2657-01A-01R-1107-07 LUAD LUAD 1 0
53 | TCGA-44-2659-01A-01R-0946-07 LUAD LUAD 0.998 0.002
54 | TCGA-44-2661-01A-01R-1107-07 LUAD LUAD 0.968 0.032
55 | TCGA-44-2662-01A-01R-0946-07 LUAD LUAD 1 0
56 | TCGA-44-2665-01A-01R-0946-07 LUAD LUAD 0.982 0.018
57 | TCGA-44-2666-01A-01R-0946-07 LUAD LUAD 1 0
58 | TCGA-44-2668-01A-01R-0946-07 LUAD LUAD 1 0
59 | TCGA-44-3396-01A-01R-1206-07 LUAD LUAD 1 0
60 | TCGA-44-3398-01A-01R-1107-07 LUAD LUAD 0.898 0.102
61 | TCGA-44-3918-01A-01R-1107-07 LUAD LUSC 0.49 0.51
62 | TCGA-44-3919-01A-02R-1107-07 LUAD LUAD 0.864 0.136
63 | TCGA-44-4112-01A-01R-1107-07 LUAD LUSC 0.434 0.566
64 | TCGA-44-5643-01A-01R-1628-07 LUAD LUAD 0.892 0.108
65 | TCGA-44-5644-01A-21R-2039-07 LUAD LUAD 0.998 0.002
66 | TCGA-44-5645-01A-01R-1628-07 LUAD LUAD 0.982 0.018
67 | TCGA-44-6145-01A-11R-1755-07 LUAD LUAD 1 0
68 | TCGA-44-6146-01A-11R-1755-07 LUAD LUAD 1 0
69 | TCGA-44-6147-01A-11R-1755-07 LUAD LUAD 1 0
70 | TCGA-44-6148-01A-11R-1755-07 LUAD LUAD 0.982 0.018
71 | TCGA-44-6774-01A-21R-1858-07 LUAD LUAD 0.998 0.002
72 | TCGA-44-6775-01A-11R-1858-07 LUAD LUAD 1 0
73 | TCGA-44-6776-01A-11R-1858-07 LUAD LUAD 1 0
74 | TCGA-44-6777-01A-11R-1858-07 LUAD LUAD 0.996 0.004
75 | TCGA-44-6778-01A-11R-1858-07 LUAD LUAD 0.97 0.03
76 | TCGA-44-6779-01A-11R-1858-07 LUAD LUAD 0.994 0.006
77 | TCGA-44-7659-01A-11R-2066-07 LUAD LUAD 0.998 0.002
78 | TCGA-44-7660-01A-11R-2066-07 LUAD LUAD 0.988 0.012
79 | TCGA-44-7661-01A-11R-2066-07 LUAD LUAD 0.998 0.002
80 | TCGA-44-7662-01A-11R-2066-07 LUAD LUAD 1 0
81 | TCGA-44-7667-01A-31R-2066-07 LUAD LUAD 0.99 0.01
82 | TCGA-44-7669-01A-21R-2066-07 LUAD LUAD 0.982 0.018
83 | TCGA-44-7670-01A-11R-2066-07 LUAD LUAD 0.992 0.008
84 | TCGA-44-7671-01A-11R-2066-07 LUAD LUAD 1 0
85 | TCGA-44-7672-01A-11R-2066-07 LUAD LUAD 1 0
86 | TCGA-44-8117-01A-11R-2241-07 LUAD LUAD 0.994 0.006
87 | TCGA-44-8119-01A-11R-2241-07 LUAD LUAD 1 0
88 | TCGA-44-8120-01A-11R-2241-07 LUAD LUAD 1 0
89 | TCGA-49-4486-01A-01R-1206-07 LUAD LUAD 0.992 0.008
90 | TCGA-49-4487-01A-21R-1858-07 LUAD LUAD 0.998 0.002
91 | TCGA-49-4488-01A-01R-1755-07 LUAD LUAD 1 0
92 | TCGA-49-4490-01A-21R-1858-07 LUAD LUAD 1 0
93 | TCGA-49-4494-01A-01R-1206-07 LUAD LUAD 0.996 0.004
94 | TCGA-49-4501-01A-01R-1206-07 LUAD LUAD 1 0
95 | TCGA-49-4505-01A-01R-1206-07 LUAD LUAD 1 0
96 | TCGA-49-4506-01A-01R-1206-07 LUAD LUAD 0.938 0.062
97 | TCGA-49-4507-01A-01R-1206-07 LUAD LUAD 1 0
98 | TCGA-49-4510-01A-01R-1206-07 LUAD LUAD 1 0
99 | TCGA-49-4512-01A-21R-1858-07 LUAD LUAD 1 0
100 | TCGA-49-4514-01A-21R-1858-07 LUAD LUAD 0.992 0.008
101 | TCGA-49-6742-01A-11R-1858-07 LUAD LUAD 1 0
102 | TCGA-49-6743-01A-11R-1858-07 LUAD LUAD 0.998 0.002
103 | TCGA-49-6744-01A-11R-1858-07 LUAD LUAD 1 0
104 | TCGA-49-6745-01A-11R-1858-07 LUAD LUAD 1 0
105 | TCGA-49-6761-01A-31R-1949-07 LUAD LUAD 1 0
106 | TCGA-49-6767-01A-11R-1858-07 LUAD LUAD 0.988 0.012
107 | TCGA-50-5044-01A-21R-1858-07 LUAD LUAD 0.986 0.014
108 | TCGA-50-5049-01A-01R-1628-07 LUAD LUAD 0.996 0.004
109 | TCGA-50-5051-01A-21R-1858-07 LUAD LUAD 0.984 0.016
110 | TCGA-50-5055-01A-01R-1628-07 LUAD LUAD 0.998 0.002
111 | TCGA-50-5066-01A-01R-1628-07 LUAD LUAD 0.974 0.026
112 | TCGA-50-5066-02A-11R-2090-07 LUAD LUAD 0.996 0.004
113 | TCGA-50-5068-01A-01R-1628-07 LUAD LUAD 0.98 0.02
114 | TCGA-50-5072-01A-21R-1858-07 LUAD LUAD 0.94 0.06
115 | TCGA-50-5931-01A-11R-1755-07 LUAD LUAD 0.926 0.074
116 | TCGA-50-5932-01A-11R-1755-07 LUAD LUAD 0.998 0.002
117 | TCGA-50-5933-01A-11R-1755-07 LUAD LUAD 0.988 0.012
118 | TCGA-50-5935-01A-11R-1755-07 LUAD LUAD 1 0
119 | TCGA-50-5936-01A-11R-1628-07 LUAD LUAD 0.994 0.006
120 | TCGA-50-5939-01A-11R-1628-07 LUAD LUAD 0.998 0.002
121 | TCGA-50-5941-01A-11R-1755-07 LUAD LUAD 1 0
122 | TCGA-50-5942-01A-21R-1755-07 LUAD LUAD 0.996 0.004
123 | TCGA-50-5944-01A-11R-1755-07 LUAD LUAD 1 0
124 | TCGA-50-5946-01A-11R-1755-07 LUAD LUAD 0.996 0.004
125 | TCGA-50-5946-02A-11R-2090-07 LUAD LUAD 1 0
126 | TCGA-50-6590-01A-12R-1858-07 LUAD LUAD 0.922 0.078
127 | TCGA-50-6591-01A-11R-1755-07 LUAD LUAD 0.9 0.1
128 | TCGA-50-6592-01A-11R-1755-07 LUAD LUAD 0.996 0.004
129 | TCGA-50-6593-01A-11R-1755-07 LUAD LUAD 1 0
130 | TCGA-50-6594-01A-11R-1755-07 LUAD LUAD 0.992 0.008
131 | TCGA-50-6595-01A-12R-1858-07 LUAD LUAD 1 0
132 | TCGA-50-6597-01A-11R-1858-07 LUAD LUAD 0.994 0.006
133 | TCGA-50-6673-01A-11R-1949-07 LUAD LUAD 1 0
134 | TCGA-50-7109-01A-11R-2039-07 LUAD LUAD 0.998 0.002
135 | TCGA-53-7624-01A-11R-2066-07 LUAD LUAD 0.976 0.024
136 | TCGA-53-7626-01A-12R-2066-07 LUAD LUAD 1 0
137 | TCGA-53-7813-01A-11R-2170-07 LUAD LUAD 0.996 0.004
138 | TCGA-55-1592-01A-01R-0946-07 LUAD LUAD 0.996 0.004
139 | TCGA-55-1594-01A-01R-0946-07 LUAD LUAD 0.986 0.014
140 | TCGA-55-1595-01A-01R-0946-07 LUAD LUAD 1 0
141 | TCGA-55-1596-01A-01R-0946-07 LUAD LUAD 0.996 0.004
142 | TCGA-55-5899-01A-11R-1628-07 LUAD LUAD 0.988 0.012
143 | TCGA-55-6543-01A-11R-1755-07 LUAD LUAD 0.996 0.004
144 | TCGA-55-6642-01A-11R-1858-07 LUAD LUAD 1 0
145 | TCGA-55-6712-01A-11R-1858-07 LUAD LUAD 0.998 0.002
146 | TCGA-55-6968-01A-11R-1949-07 LUAD LUAD 0.982 0.018
147 | TCGA-55-6969-01A-11R-1949-07 LUAD LUAD 0.992 0.008
148 | TCGA-55-6970-01A-11R-1949-07 LUAD LUAD 1 0
149 | TCGA-55-6971-01A-11R-1949-07 LUAD LUAD 1 0
150 | TCGA-55-6972-01A-11R-1949-07 LUAD LUAD 0.996 0.004
151 | TCGA-55-6975-01A-11R-1949-07 LUAD LUAD 0.936 0.064
152 | TCGA-55-6978-01A-11R-1949-07 LUAD LUAD 1 0
153 | TCGA-55-6979-01A-11R-1949-07 LUAD LUAD 0.954 0.046
154 | TCGA-55-6980-01A-11R-1949-07 LUAD LUAD 1 0
155 | TCGA-55-6981-01A-11R-1949-07 LUAD LUAD 0.996 0.004
156 | TCGA-55-6982-01A-11R-1949-07 LUAD LUAD 0.96 0.04
157 | TCGA-55-6983-01A-11R-1949-07 LUAD LUAD 0.998 0.002
158 | TCGA-55-6984-01A-11R-1949-07 LUAD LUAD 0.996 0.004
159 | TCGA-55-6985-01A-11R-1949-07 LUAD LUAD 1 0
160 | TCGA-55-6986-01A-11R-1949-07 LUAD LUAD 0.998 0.002
161 | TCGA-55-6987-01A-11R-1949-07 LUAD LUAD 1 0
162 | TCGA-55-7227-01A-11R-2039-07 LUAD LUAD 1 0
163 | TCGA-55-7281-01A-11R-2039-07 LUAD LUAD 1 0
164 | TCGA-55-7283-01A-11R-2039-07 LUAD LUAD 1 0
165 | TCGA-55-7284-01B-11R-2241-07 LUAD LUAD 0.996 0.004
166 | TCGA-55-7570-01A-11R-2039-07 LUAD LUAD 0.962 0.038
167 | TCGA-55-7573-01A-11R-2039-07 LUAD LUAD 1 0
168 | TCGA-55-7574-01A-11R-2039-07 LUAD LUAD 1 0
169 | TCGA-55-7576-01A-11R-2066-07 LUAD LUAD 1 0
170 | TCGA-55-7724-01A-11R-2170-07 LUAD LUAD 0.724 0.276
171 | TCGA-55-7725-01A-11R-2170-07 LUAD LUAD 0.996 0.004
172 | TCGA-55-7726-01A-11R-2170-07 LUAD LUAD 0.936 0.064
173 | TCGA-55-7727-01A-11R-2170-07 LUAD LUAD 0.926 0.074
174 | TCGA-55-7728-01A-11R-2187-07 LUAD LUAD 1 0
175 | TCGA-55-7815-01A-11R-2170-07 LUAD LUAD 0.87 0.13
176 | TCGA-55-7903-01A-11R-2170-07 LUAD LUAD 1 0
177 | TCGA-55-7907-01A-11R-2170-07 LUAD LUAD 1 0
178 | TCGA-55-7910-01A-11R-2170-07 LUAD LUAD 0.994 0.006
179 | TCGA-55-7911-01A-11R-2170-07 LUAD LUAD 1 0
180 | TCGA-55-7913-01B-11R-2241-07 LUAD LUAD 0.988 0.012
181 | TCGA-55-7914-01A-11R-2170-07 LUAD LUAD 0.998 0.002
182 | TCGA-55-7994-01A-11R-2187-07 LUAD LUAD 0.99 0.01
183 | TCGA-55-7995-01A-11R-2187-07 LUAD LUAD 1 0
184 | TCGA-55-8085-01A-11R-2241-07 LUAD LUAD 1 0
185 | TCGA-55-8087-01A-11R-2241-07 LUAD LUAD 0.998 0.002
186 | TCGA-55-8089-01A-11R-2241-07 LUAD LUAD 1 0
187 | TCGA-55-8090-01A-11R-2241-07 LUAD LUAD 0.996 0.004
188 | TCGA-55-8091-01A-11R-2241-07 LUAD LUAD 0.996 0.004
189 | TCGA-55-8092-01A-11R-2241-07 LUAD LUAD 0.994 0.006
190 | TCGA-55-8094-01A-11R-2241-07 LUAD LUAD 1 0
191 | TCGA-55-8096-01A-11R-2241-07 LUAD LUAD 0.996 0.004
192 | TCGA-55-8097-01A-11R-2241-07 LUAD LUAD 1 0
193 | TCGA-55-8203-01A-11R-2241-07 LUAD LUAD 1 0
194 | TCGA-55-8204-01A-11R-2241-07 LUAD LUAD 0.666 0.334
195 | TCGA-55-8205-01A-11R-2241-07 LUAD LUAD 0.998 0.002
196 | TCGA-55-8206-01A-11R-2241-07 LUAD LUAD 1 0
197 | TCGA-55-8207-01A-11R-2241-07 LUAD LUAD 1 0
198 | TCGA-55-8208-01A-11R-2241-07 LUAD LUAD 1 0
199 | TCGA-55-8299-01A-11R-2287-07 LUAD LUAD 1 0
200 | TCGA-55-8301-01A-11R-2287-07 LUAD LUAD 1 0
201 | TCGA-64-1676-01A-01R-0946-07 LUAD LUAD 0.98 0.02
202 | TCGA-64-1677-01A-01R-0946-07 LUAD LUAD 0.998 0.002
203 | TCGA-64-1678-01A-01R-0946-07 LUAD LUAD 0.928 0.072
204 | TCGA-64-1679-01A-21R-2066-07 LUAD LUAD 1 0
205 | TCGA-64-1680-01A-02R-0946-07 LUAD LUAD 0.998 0.002
206 | TCGA-64-1681-01A-11R-2066-07 LUAD LUAD 1 0
207 | TCGA-64-5774-01A-01R-1628-07 LUAD LUAD 0.998 0.002
208 | TCGA-64-5775-01A-01R-1628-07 LUAD LUAD 0.942 0.058
209 | TCGA-64-5778-01A-01R-1628-07 LUAD LUAD 1 0
210 | TCGA-64-5779-01A-01R-1628-07 LUAD LUAD 1 0
211 | TCGA-64-5781-01A-01R-1628-07 LUAD LUAD 0.998 0.002
212 | TCGA-64-5815-01A-01R-1628-07 LUAD LUAD 1 0
213 | TCGA-67-3770-01A-01R-0946-07 LUAD LUAD 0.998 0.002
214 | TCGA-67-3771-01A-01R-0946-07 LUAD LUAD 1 0
215 | TCGA-67-3772-01A-01R-0946-07 LUAD LUAD 0.996 0.004
216 | TCGA-67-3773-01A-01R-0946-07 LUAD LUAD 0.998 0.002
217 | TCGA-67-3774-01A-01R-0946-07 LUAD LUAD 1 0
218 | TCGA-67-4679-01B-01R-1755-07 LUAD LUAD 1 0
219 | TCGA-67-6215-01A-11R-1755-07 LUAD LUAD 1 0
220 | TCGA-67-6216-01A-11R-1755-07 LUAD LUAD 0.998 0.002
221 | TCGA-67-6217-01A-11R-1755-07 LUAD LUAD 1 0
222 | TCGA-69-7760-01A-11R-2170-07 LUAD LUAD 0.994 0.006
223 | TCGA-69-7761-01A-11R-2170-07 LUAD LUAD 0.988 0.012
224 | TCGA-69-7763-01A-11R-2170-07 LUAD LUAD 1 0
225 | TCGA-69-7764-01A-11R-2170-07 LUAD LUAD 1 0
226 | TCGA-69-7765-01A-11R-2170-07 LUAD LUAD 1 0
227 | TCGA-69-7973-01A-11R-2187-07 LUAD LUAD 0.996 0.004
228 | TCGA-69-7974-01A-11R-2187-07 LUAD LUAD 1 0
229 | TCGA-69-7978-01A-11R-2187-07 LUAD LUAD 1 0
230 | TCGA-69-7979-01A-11R-2187-07 LUAD LUAD 0.992 0.008
231 | TCGA-69-7980-01A-11R-2187-07 LUAD LUAD 1 0
232 | TCGA-69-8253-01A-11R-2287-07 LUAD LUAD 1 0
233 | TCGA-69-8254-01A-11R-2287-07 LUAD LUAD 0.994 0.006
234 | TCGA-69-8255-01A-11R-2287-07 LUAD LUAD 0.988 0.012
235 | TCGA-71-6725-01A-11R-1858-07 LUAD LUAD 1 0
236 | TCGA-73-4658-01A-01R-1755-07 LUAD LUAD 1 0
237 | TCGA-73-4659-01A-01R-1206-07 LUAD LUAD 1 0
238 | TCGA-73-4662-01A-01R-1206-07 LUAD LUAD 0.998 0.002
239 | TCGA-73-4666-01A-01R-1206-07 LUAD LUAD 0.994 0.006
240 | TCGA-73-4668-01A-01R-1206-07 LUAD LUAD 1 0
241 | TCGA-73-4670-01A-01R-1206-07 LUAD LUAD 0.998 0.002
242 | TCGA-73-4675-01A-01R-1206-07 LUAD LUAD 1 0
243 | TCGA-73-4676-01A-01R-1755-07 LUAD LUAD 0.998 0.002
244 | TCGA-73-4677-01A-01R-1206-07 LUAD LUAD 1 0
245 | TCGA-73-7498-01A-12R-2187-07 LUAD LUAD 1 0
246 | TCGA-73-7499-01A-11R-2187-07 LUAD LUAD 1 0
247 | TCGA-75-5122-01A-01R-1755-07 LUAD LUAD 0.992 0.008
248 | TCGA-75-5125-01A-01R-1755-07 LUAD LUAD 1 0
249 | TCGA-75-5126-01A-01R-1755-07 LUAD LUAD 0.998 0.002
250 | TCGA-75-5146-01A-01R-1628-07 LUAD LUAD 0.994 0.006
251 | TCGA-75-5147-01A-01R-1628-07 LUAD LUAD 0.998 0.002
252 | TCGA-75-6203-01A-11R-1755-07 LUAD LUAD 1 0
253 | TCGA-75-6205-01A-11R-1755-07 LUAD LUAD 1 0
254 | TCGA-75-6206-01A-11R-1755-07 LUAD LUAD 1 0
255 | TCGA-75-6207-01A-11R-1755-07 LUAD LUAD 0.99 0.01
256 | TCGA-75-6211-01A-11R-1755-07 LUAD LUAD 0.99 0.01
257 | TCGA-75-6212-01A-11R-1755-07 LUAD LUAD 1 0
258 | TCGA-75-6214-01A-41R-1949-07 LUAD LUAD 0.956 0.044
259 | TCGA-75-7025-01A-12R-1949-07 LUAD LUAD 1 0
260 | TCGA-75-7027-01A-11R-1949-07 LUAD LUAD 0.996 0.004
261 | TCGA-75-7030-01A-11R-1949-07 LUAD LUAD 1 0
262 | TCGA-75-7031-01A-11R-1949-07 LUAD LUAD 1 0
263 | TCGA-78-7143-01A-11R-2039-07 LUAD LUAD 1 0
264 | TCGA-78-7145-01A-11R-2039-07 LUAD LUAD 0.998 0.002
265 | TCGA-78-7146-01A-11R-2039-07 LUAD LUAD 0.974 0.026
266 | TCGA-78-7147-01A-11R-2039-07 LUAD LUAD 1 0
267 | TCGA-78-7148-01A-11R-2039-07 LUAD LUAD 1 0
268 | TCGA-78-7149-01A-11R-2039-07 LUAD LUAD 1 0
269 | TCGA-78-7150-01A-21R-2039-07 LUAD LUAD 0.982 0.018
270 | TCGA-78-7152-01A-11R-2039-07 LUAD LUAD 1 0
271 | TCGA-78-7153-01A-11R-2039-07 LUAD LUAD 1 0
272 | TCGA-78-7154-01A-11R-2039-07 LUAD LUAD 0.974 0.026
273 | TCGA-78-7155-01A-11R-2039-07 LUAD LUAD 0.928 0.072
274 | TCGA-78-7156-01A-11R-2039-07 LUAD LUAD 0.998 0.002
275 | TCGA-78-7158-01A-11R-2039-07 LUAD LUAD 0.988 0.012
276 | TCGA-78-7159-01A-11R-2039-07 LUAD LUAD 1 0
277 | TCGA-78-7160-01A-11R-2039-07 LUAD LUAD 1 0
278 | TCGA-78-7161-01A-11R-2039-07 LUAD LUAD 1 0
279 | TCGA-78-7162-01A-21R-2066-07 LUAD LUAD 0.998 0.002
280 | TCGA-78-7163-01A-12R-2066-07 LUAD LUAD 0.982 0.018
281 | TCGA-78-7166-01A-12R-2066-07 LUAD LUAD 1 0
282 | TCGA-78-7167-01A-11R-2066-07 LUAD LUAD 0.998 0.002
283 | TCGA-78-7220-01A-11R-2039-07 LUAD LUAD 0.996 0.004
284 | TCGA-78-7535-01A-11R-2066-07 LUAD LUAD 1 0
285 | TCGA-78-7536-01A-11R-2066-07 LUAD LUAD 0.986 0.014
286 | TCGA-78-7537-01A-11R-2066-07 LUAD LUAD 1 0
287 | TCGA-78-7539-01A-11R-2066-07 LUAD LUAD 0.994 0.006
288 | TCGA-78-7540-01A-11R-2066-07 LUAD LUAD 0.998 0.002
289 | TCGA-78-7542-01A-21R-2066-07 LUAD LUAD 0.984 0.016
290 | TCGA-78-7633-01A-11R-2066-07 LUAD LUAD 0.998 0.002
291 | TCGA-80-5607-01A-31R-1949-07 LUAD LUAD 1 0
292 | TCGA-80-5608-01A-31R-1949-07 LUAD LUAD 1 0
293 | TCGA-80-5611-01A-01R-1628-07 LUAD LUAD 0.994 0.006
294 | TCGA-83-5908-01A-21R-2287-07 LUAD LUAD 0.998 0.002
295 | TCGA-86-6562-01A-11R-1755-07 LUAD LUAD 1 0
296 | TCGA-86-6851-01A-11R-1949-07 LUAD LUAD 0.998 0.002
297 | TCGA-86-7701-01A-11R-2170-07 LUAD LUAD 0.992 0.008
298 | TCGA-86-7711-01A-11R-2066-07 LUAD LUAD 0.982 0.018
299 | TCGA-86-7713-01A-11R-2066-07 LUAD LUAD 0.998 0.002
300 | TCGA-86-7714-01A-12R-2170-07 LUAD LUAD 1 0
301 | TCGA-86-7953-01A-11R-2187-07 LUAD LUAD 0.998 0.002
302 | TCGA-86-7954-01A-11R-2187-07 LUAD LUAD 0.998 0.002
303 | TCGA-86-7955-01A-11R-2187-07 LUAD LUAD 0.994 0.006
304 | TCGA-86-8054-01A-11R-2241-07 LUAD LUAD 0.992 0.008
305 | TCGA-86-8055-01A-11R-2241-07 LUAD LUAD 0.998 0.002
306 | TCGA-86-8056-01A-11R-2241-07 LUAD LUAD 0.998 0.002
307 | TCGA-86-8073-01A-11R-2241-07 LUAD LUAD 0.996 0.004
308 | TCGA-86-8074-01A-11R-2241-07 LUAD LUAD 1 0
309 | TCGA-86-8075-01A-11R-2241-07 LUAD LUAD 1 0
310 | TCGA-86-8076-01A-31R-2241-07 LUAD LUAD 1 0
311 | TCGA-86-8279-01A-11R-2287-07 LUAD LUAD 1 0
312 | TCGA-86-8280-01A-11R-2287-07 LUAD LUAD 1 0
313 | TCGA-86-8281-01A-11R-2287-07 LUAD LUAD 1 0
314 | TCGA-91-6828-01A-11R-1858-07 LUAD LUAD 0.998 0.002
315 | TCGA-91-6829-01A-21R-1858-07 LUAD LUAD 0.986 0.014
316 | TCGA-91-6830-01A-11R-1949-07 LUAD LUAD 1 0
317 | TCGA-91-6831-01A-11R-1858-07 LUAD LUAD 0.99 0.01
318 | TCGA-91-6835-01A-11R-1858-07 LUAD LUAD 1 0
319 | TCGA-91-6836-01A-21R-1858-07 LUAD LUAD 0.978 0.022
320 | TCGA-91-6840-01A-11R-1949-07 LUAD LUAD 0.99 0.01
321 | TCGA-91-6847-01A-11R-1949-07 LUAD LUAD 0.95 0.05
322 | TCGA-91-6848-01A-11R-1949-07 LUAD LUAD 0.924 0.076
323 | TCGA-91-6849-01A-11R-1949-07 LUAD LUAD 1 0
324 | TCGA-91-7771-01A-11R-2170-07 LUAD LUAD 0.998 0.002
325 | TCGA-93-7347-01A-11R-2187-07 LUAD LUAD 1 0
326 | TCGA-93-7348-01A-21R-2039-07 LUAD LUAD 1 0
327 | TCGA-93-8067-01A-11R-2287-07 LUAD LUAD 1 0
328 | TCGA-95-7039-01A-11R-1949-07 LUAD LUAD 1 0
329 | TCGA-95-7043-01A-11R-1949-07 LUAD LUAD 0.986 0.014
330 | TCGA-95-7562-01A-11R-2241-07 LUAD LUAD 0.99 0.01
331 | TCGA-95-7567-01A-11R-2066-07 LUAD LUAD 1 0
332 | TCGA-95-7944-01A-11R-2187-07 LUAD LUAD 0.998 0.002
333 | TCGA-95-7947-01A-11R-2187-07 LUAD LUAD 0.988 0.012
334 | TCGA-95-7948-01A-11R-2187-07 LUAD LUAD 0.994 0.006
335 | TCGA-95-8039-01A-11R-2241-07 LUAD LUAD 1 0
336 | TCGA-97-7546-01A-11R-2039-07 LUAD LUAD 0.998 0.002
337 | TCGA-97-7547-01A-11R-2039-07 LUAD LUAD 0.968 0.032
338 | TCGA-97-7552-01A-11R-2039-07 LUAD LUAD 0.996 0.004
339 | TCGA-97-7553-01A-21R-2039-07 LUAD LUAD 1 0
340 | TCGA-97-7554-01A-11R-2039-07 LUAD LUAD 0.998 0.002
341 | TCGA-97-7937-01A-11R-2170-07 LUAD LUAD 1 0
342 | TCGA-97-7938-01A-11R-2170-07 LUAD LUAD 1 0
343 | TCGA-97-7941-01A-11R-2187-07 LUAD LUAD 1 0
344 | TCGA-97-8171-01A-11R-2287-07 LUAD LUAD 0.994 0.006
345 | TCGA-97-8172-01A-11R-2287-07 LUAD LUAD 0.998 0.002
346 | TCGA-97-8174-01A-11R-2287-07 LUAD LUAD 0.998 0.002
347 | TCGA-97-8175-01A-11R-2287-07 LUAD LUAD 0.968 0.032
348 | TCGA-97-8177-01A-11R-2287-07 LUAD LUAD 1 0
349 | TCGA-97-8179-01A-11R-2287-07 LUAD LUAD 1 0
350 | TCGA-99-7458-01A-11R-2039-07 LUAD LUAD 0.998 0.002
351 | TCGA-99-8025-01A-11R-2241-07 LUAD LUAD 1 0
352 | TCGA-99-8028-01A-11R-2241-07 LUAD LUAD 1 0
353 | TCGA-99-8032-01A-11R-2241-07 LUAD LUAD 1 0
354 | TCGA-99-8033-01A-11R-2241-07 LUAD LUAD 0.874 0.126
355 | TCGA-J2-8192-01A-11R-2241-07 LUAD LUAD 0.762 0.238
356 | TCGA-J2-8194-01A-11R-2241-07 LUAD LUAD 1 0
357 | TCGA-18-3406-01A-01R-0980-07 LUSC LUAD 0.776 0.224
358 | TCGA-18-3407-01A-01R-0980-07 LUSC LUSC 0 1
359 | TCGA-18-3408-01A-01R-0980-07 LUSC LUAD 0.9 0.1
360 | TCGA-18-3409-01A-01R-0980-07 LUSC LUAD 0.832 0.168
361 | TCGA-18-3410-01A-01R-0980-07 LUSC LUSC 0.002 0.998
362 | TCGA-18-3411-01A-01R-0980-07 LUSC LUSC 0 1
363 | TCGA-18-3412-01A-01R-0980-07 LUSC LUSC 0.002 0.998
364 | TCGA-18-3414-01A-01R-0980-07 LUSC LUSC 0.062 0.938
365 | TCGA-18-3415-01A-01R-0980-07 LUSC LUSC 0.002 0.998
366 | TCGA-18-3416-01A-01R-0980-07 LUSC LUSC 0.006 0.994
367 | TCGA-18-3417-01A-01R-1443-07 LUSC LUSC 0.016 0.984
368 | TCGA-18-3419-01A-01R-0980-07 LUSC LUSC 0.002 0.998
369 | TCGA-18-3421-01A-01R-0980-07 LUSC LUSC 0.006 0.994
370 | TCGA-18-4083-01A-01R-1100-07 LUSC LUSC 0.006 0.994
371 | TCGA-18-4086-01A-01R-1100-07 LUSC LUSC 0 1
372 | TCGA-18-4721-01A-01R-1443-07 LUSC LUSC 0 1
373 | TCGA-18-5592-01A-01R-1635-07 LUSC LUSC 0.002 0.998
374 | TCGA-18-5595-01A-01R-1635-07 LUSC LUSC 0 1
375 | TCGA-21-1070-01A-01R-0692-07 LUSC LUSC 0.032 0.968
376 | TCGA-21-1071-01A-01R-0692-07 LUSC LUSC 0.006 0.994
377 | TCGA-21-1072-01A-01R-0692-07 LUSC LUSC 0.046 0.954
378 | TCGA-21-1075-01A-01R-0692-07 LUSC LUSC 0.12 0.88
379 | TCGA-21-1076-01A-02R-0692-07 LUSC LUSC 0.008 0.992
380 | TCGA-21-1077-01A-01R-0692-07 LUSC LUSC 0.002 0.998
381 | TCGA-21-1078-01A-01R-0692-07 LUSC LUSC 0.042 0.958
382 | TCGA-21-1079-01A-01R-0692-07 LUSC LUSC 0.014 0.986
383 | TCGA-21-1080-01A-01R-0692-07 LUSC LUSC 0.004 0.996
384 | TCGA-21-1081-01A-01R-0692-07 LUSC LUSC 0.004 0.996
385 | TCGA-21-1082-01A-01R-0692-07 LUSC LUSC 0.004 0.996
386 | TCGA-21-1083-01A-01R-0692-07 LUSC LUSC 0.002 0.998
387 | TCGA-21-5782-01A-01R-1635-07 LUSC LUSC 0.032 0.968
388 | TCGA-21-5784-01A-01R-1635-07 LUSC LUSC 0.01 0.99
389 | TCGA-21-5786-01A-01R-1635-07 LUSC LUSC 0.008 0.992
390 | TCGA-21-5787-01A-01R-1635-07 LUSC LUSC 0.008 0.992
391 | TCGA-22-0940-01A-01R-0692-07 LUSC LUSC 0.004 0.996
392 | TCGA-22-0944-01A-01R-0692-07 LUSC LUSC 0.002 0.998
393 | TCGA-22-1002-01A-01R-0692-07 LUSC LUSC 0 1
394 | TCGA-22-1011-01A-01R-0692-07 LUSC LUSC 0.004 0.996
395 | TCGA-22-1012-01A-01R-0692-07 LUSC LUSC 0.002 0.998
396 | TCGA-22-1016-01A-01R-0692-07 LUSC LUSC 0.006 0.994
397 | TCGA-22-1017-01A-01R-0692-07 LUSC LUSC 0.092 0.908
398 | TCGA-22-4591-01A-01R-1201-07 LUSC LUSC 0.018 0.982
399 | TCGA-22-4593-01A-21R-1820-07 LUSC LUSC 0.006 0.994
400 | TCGA-22-4594-01A-01R-1201-07 LUSC LUSC 0.152 0.848
401 | TCGA-22-4595-01A-01R-1201-07 LUSC LUSC 0.002 0.998
402 | TCGA-22-4596-01A-01R-1201-07 LUSC LUSC 0.172 0.828
403 | TCGA-22-4599-01A-01R-1443-07 LUSC LUSC 0.032 0.968
404 | TCGA-22-4601-01A-01R-1443-07 LUSC LUSC 0.002 0.998
405 | TCGA-22-4604-01A-01R-1201-07 LUSC LUSC 0 1
406 | TCGA-22-4607-01A-01R-1201-07 LUSC LUSC 0.002 0.998
407 | TCGA-22-4613-01A-01R-1443-07 LUSC LUSC 0.008 0.992
408 | TCGA-22-5471-01A-01R-1635-07 LUSC LUSC 0.002 0.998
409 | TCGA-22-5472-01A-01R-1635-07 LUSC LUSC 0 1
410 | TCGA-22-5473-01A-01R-1635-07 LUSC LUSC 0 1
411 | TCGA-22-5474-01A-01R-1635-07 LUSC LUSC 0 1
412 | TCGA-22-5477-01A-01R-1635-07 LUSC LUSC 0 1
413 | TCGA-22-5478-01A-01R-1635-07 LUSC LUSC 0.006 0.994
414 | TCGA-22-5479-01A-31R-1949-07 LUSC LUSC 0 1
415 | TCGA-22-5480-01A-01R-1635-07 LUSC LUSC 0.004 0.996
416 | TCGA-22-5481-01A-31R-1949-07 LUSC LUSC 0.068 0.932
417 | TCGA-22-5482-01A-01R-1635-07 LUSC LUSC 0 1
418 | TCGA-22-5483-01A-01R-1820-07 LUSC LUSC 0.002 0.998
419 | TCGA-22-5485-01A-01R-1635-07 LUSC LUSC 0.002 0.998
420 | TCGA-22-5489-01A-01R-1635-07 LUSC LUSC 0.004 0.996
421 | TCGA-22-5491-01A-01R-1635-07 LUSC LUSC 0.002 0.998
422 | TCGA-22-5492-01A-01R-1635-07 LUSC LUSC 0.008 0.992
423 | TCGA-33-4532-01A-01R-1201-07 LUSC LUSC 0.002 0.998
424 | TCGA-33-4533-01A-01R-1201-07 LUSC LUSC 0.022 0.978
425 | TCGA-33-4538-01A-01R-1201-07 LUSC LUSC 0.016 0.984
426 | TCGA-33-4547-01A-01R-1201-07 LUSC LUSC 0 1
427 | TCGA-33-4566-01A-01R-1443-07 LUSC LUSC 0.054 0.946
428 | TCGA-33-4582-01A-01R-1443-07 LUSC LUSC 0 1
429 | TCGA-33-4583-01A-01R-1443-07 LUSC LUSC 0.012 0.988
430 | TCGA-33-4586-01A-01R-1443-07 LUSC LUSC 0.012 0.988
431 | TCGA-33-6737-01A-11R-1820-07 LUSC LUSC 0.006 0.994
432 | TCGA-33-6738-01A-11R-1949-07 LUSC LUSC 0.004 0.996
433 | TCGA-34-2596-01A-01R-0851-07 LUSC LUSC 0 1
434 | TCGA-34-2600-01A-01R-0851-07 LUSC LUSC 0.002 0.998
435 | TCGA-34-2608-01A-02R-0851-07 LUSC LUSC 0 1
436 | TCGA-34-5231-01A-21R-1820-07 LUSC LUSC 0.008 0.992
437 | TCGA-34-5232-01A-21R-1820-07 LUSC LUSC 0 1
438 | TCGA-34-5234-01A-01R-1635-07 LUSC LUSC 0.108 0.892
439 | TCGA-34-5236-01A-21R-1820-07 LUSC LUSC 0 1
440 | TCGA-34-5239-01A-21R-1820-07 LUSC LUSC 0 1
441 | TCGA-34-5240-01A-01R-1443-07 LUSC LUSC 0.022 0.978
442 | TCGA-34-5241-01A-01R-1443-07 LUSC LUSC 0.004 0.996
443 | TCGA-34-5927-01A-11R-1820-07 LUSC LUSC 0.006 0.994
444 | TCGA-34-5928-01A-11R-1820-07 LUSC LUSC 0.002 0.998
445 | TCGA-34-5929-01A-11R-1820-07 LUSC LUSC 0.002 0.998
446 | TCGA-34-7107-01A-11R-1949-07 LUSC LUSC 0 1
447 | TCGA-37-3783-01A-01R-1201-07 LUSC LUSC 0.008 0.992
448 | TCGA-37-3789-01A-01R-0980-07 LUSC LUSC 0.002 0.998
449 | TCGA-37-3792-01A-01R-0980-07 LUSC LUSC 0.12 0.88
450 | TCGA-37-4129-01A-01R-1100-07 LUSC LUSC 0.082 0.918
451 | TCGA-37-4130-01A-01R-1100-07 LUSC LUSC 0.116 0.884
452 | TCGA-37-4132-01A-01R-1100-07 LUSC LUSC 0.012 0.988
453 | TCGA-37-4133-01A-01R-1100-07 LUSC LUSC 0.034 0.966
454 | TCGA-37-4135-01A-01R-1100-07 LUSC LUSC 0.024 0.976
455 | TCGA-37-4141-01A-02R-1100-07 LUSC LUSC 0.008 0.992
456 | TCGA-37-5819-01A-01R-1635-07 LUSC LUSC 0.032 0.968
457 | TCGA-39-5011-01A-01R-1443-07 LUSC LUSC 0.032 0.968
458 | TCGA-39-5016-01A-01R-1443-07 LUSC LUSC 0 1
459 | TCGA-39-5019-01A-01R-1820-07 LUSC LUSC 0.122 0.878
460 | TCGA-39-5021-01A-01R-1443-07 LUSC LUSC 0 1
461 | TCGA-39-5024-01A-21R-1820-07 LUSC LUSC 0.05 0.95
462 | TCGA-39-5027-01A-21R-1820-07 LUSC LUSC 0 1
463 | TCGA-39-5028-01A-01R-1443-07 LUSC LUSC 0.002 0.998
464 | TCGA-39-5029-01A-01R-1443-07 LUSC LUSC 0 1
465 | TCGA-39-5030-01A-01R-1443-07 LUSC LUSC 0.002 0.998
466 | TCGA-39-5031-01A-01R-1443-07 LUSC LUSC 0 1
467 | TCGA-39-5034-01A-01R-1443-07 LUSC LUSC 0.026 0.974
468 | TCGA-39-5035-01A-01R-1443-07 LUSC LUSC 0.01 0.99
469 | TCGA-39-5036-01A-01R-1443-07 LUSC LUSC 0 1
470 | TCGA-39-5037-01A-01R-1443-07 LUSC LUSC 0.002 0.998
471 | TCGA-39-5039-01A-01R-1443-07 LUSC LUSC 0.006 0.994
472 | TCGA-43-2578-01A-01R-0851-07 LUSC LUSC 0.002 0.998
473 | TCGA-43-2581-01A-01R-0851-07 LUSC LUSC 0.104 0.896
474 | TCGA-43-3394-01A-01R-0980-07 LUSC LUSC 0 1
475 | TCGA-43-3920-01A-01R-0980-07 LUSC LUSC 0.026 0.974
476 | TCGA-43-5668-01A-01R-1635-07 LUSC LUSC 0.016 0.984
477 | TCGA-43-6143-01A-11R-1820-07 LUSC LUSC 0.01 0.99
478 | TCGA-43-6647-01A-11R-1820-07 LUSC LUSC 0.006 0.994
479 | TCGA-43-6770-01A-11R-1820-07 LUSC LUSC 0 1
480 | TCGA-43-6771-01A-11R-1820-07 LUSC LUSC 0.066 0.934
481 | TCGA-46-3765-01A-01R-0980-07 LUSC LUSC 0 1
482 | TCGA-46-3766-01A-01R-0980-07 LUSC LUSC 0.026 0.974
483 | TCGA-46-3767-01A-01R-0980-07 LUSC LUSC 0.006 0.994
484 | TCGA-46-3768-01A-01R-0980-07 LUSC LUSC 0 1
485 | TCGA-46-3769-01A-01R-0980-07 LUSC LUSC 0.034 0.966
486 | TCGA-46-6025-01A-11R-1820-07 LUSC LUSC 0.004 0.996
487 | TCGA-46-6026-01A-11R-1820-07 LUSC LUSC 0.052 0.948
488 | TCGA-51-4079-01A-01R-1100-07 LUSC LUSC 0 1
489 | TCGA-51-4080-01A-01R-1100-07 LUSC LUSC 0.002 0.998
490 | TCGA-51-4081-01A-01R-1100-07 LUSC LUSC 0 1
491 | TCGA-56-1622-01A-01R-0692-07 LUSC LUSC 0.008 0.992
492 | TCGA-56-5897-01A-11R-1635-07 LUSC LUSC 0 1
493 | TCGA-56-5898-01A-11R-1635-07 LUSC LUSC 0.008 0.992
494 | TCGA-56-6545-01A-11R-1820-07 LUSC LUSC 0.036 0.964
495 | TCGA-56-6546-01A-11R-1820-07 LUSC LUSC 0.004 0.996
496 | TCGA-60-2695-01A-01R-0851-07 LUSC LUSC 0.004 0.996
497 | TCGA-60-2696-01A-01R-0851-07 LUSC LUSC 0.006 0.994
498 | TCGA-60-2698-01A-01R-0851-07 LUSC LUSC 0.038 0.962
499 | TCGA-60-2706-01A-01R-0851-07 LUSC LUSC 0.032 0.968
500 | TCGA-60-2707-01A-01R-0851-07 LUSC LUSC 0.002 0.998
501 | TCGA-60-2708-01A-01R-0851-07 LUSC LUSC 0 1
502 | TCGA-60-2709-01A-21R-1820-07 LUSC LUSC 0.004 0.996
503 | TCGA-60-2710-01A-01R-0851-07 LUSC LUSC 0.004 0.996
504 | TCGA-60-2711-01A-01R-0851-07 LUSC LUSC 0 1
505 | TCGA-60-2712-01A-01R-0851-07 LUSC LUSC 0 1
506 | TCGA-60-2713-01A-01R-0851-07 LUSC LUSC 0 1
507 | TCGA-60-2714-01A-01R-0851-07 LUSC LUSC 0.258 0.742
508 | TCGA-60-2715-01A-01R-0851-07 LUSC LUSC 0.002 0.998
509 | TCGA-60-2716-01A-01R-0851-07 LUSC LUSC 0.026 0.974
510 | TCGA-60-2719-01A-01R-0851-07 LUSC LUSC 0 1
511 | TCGA-60-2720-01A-01R-0851-07 LUSC LUSC 0.016 0.984
512 | TCGA-60-2721-01A-01R-0851-07 LUSC LUSC 0 1
513 | TCGA-60-2722-01A-01R-0851-07 LUSC LUSC 0 1
514 | TCGA-60-2723-01A-01R-0851-07 LUSC LUSC 0 1
515 | TCGA-60-2724-01A-01R-0851-07 LUSC LUSC 0.008 0.992
516 | TCGA-60-2725-01A-01R-1201-07 LUSC LUSC 0.002 0.998
517 | TCGA-60-2726-01A-01R-0851-07 LUSC LUSC 0.008 0.992
518 | TCGA-63-5128-01A-01R-1443-07 LUSC LUSC 0.014 0.986
519 | TCGA-63-5131-01A-01R-1443-07 LUSC LUSC 0.012 0.988
520 | TCGA-63-6202-01A-11R-1820-07 LUSC LUSC 0.068 0.932
521 | TCGA-63-7020-01A-11R-1949-07 LUSC LUSC 0.006 0.994
522 | TCGA-63-7021-01A-11R-1949-07 LUSC LUSC 0.028 0.972
523 | TCGA-63-7022-01A-11R-1949-07 LUSC LUSC 0 1
524 | TCGA-63-7023-01A-11R-1949-07 LUSC LUSC 0 1
525 | TCGA-66-2727-01A-01R-0980-07 LUSC LUSC 0 1
526 | TCGA-66-2734-01A-01R-0980-07 LUSC LUSC 0 1
527 | TCGA-66-2737-01A-01R-0980-07 LUSC LUSC 0 1
528 | TCGA-66-2742-01A-01R-0980-07 LUSC LUSC 0 1
529 | TCGA-66-2744-01A-01R-0980-07 LUSC LUSC 0.004 0.996
530 | TCGA-66-2753-01A-01R-0980-07 LUSC LUSC 0.01 0.99
531 | TCGA-66-2754-01A-01R-0980-07 LUSC LUSC 0.02 0.98
532 | TCGA-66-2755-01A-01R-0851-07 LUSC LUSC 0.002 0.998
533 | TCGA-66-2756-01A-01R-0851-07 LUSC LUSC 0.088 0.912
534 | TCGA-66-2757-01A-01R-0851-07 LUSC LUSC 0.004 0.996
535 | TCGA-66-2758-01A-02R-0851-07 LUSC LUSC 0 1
536 | TCGA-66-2759-01A-01R-0851-07 LUSC LUSC 0 1
537 | TCGA-66-2763-01A-01R-0851-07 LUSC LUSC 0.006 0.994
538 | TCGA-66-2765-01A-01R-0851-07 LUSC LUSC 0.02 0.98
539 | TCGA-66-2766-01A-01R-0851-07 LUSC LUSC 0.01 0.99
540 | TCGA-66-2767-01A-01R-0851-07 LUSC LUSC 0 1
541 | TCGA-66-2768-01A-01R-0851-07 LUSC LUSC 0 1
542 | TCGA-66-2769-01A-02R-0851-07 LUSC LUSC 0.002 0.998
543 | TCGA-66-2770-01A-01R-0851-07 LUSC LUSC 0 1
544 | TCGA-66-2771-01A-01R-0980-07 LUSC LUSC 0.002 0.998
545 | TCGA-66-2773-01A-01R-1201-07 LUSC LUSC 0.002 0.998
546 | TCGA-66-2777-01A-01R-1201-07 LUSC LUSC 0 1
547 | TCGA-66-2778-01A-02R-0851-07 LUSC LUSC 0.014 0.986
548 | TCGA-66-2780-01A-01R-0851-07 LUSC LUSC 0.002 0.998
549 | TCGA-66-2781-01A-01R-0851-07 LUSC LUSC 0 1
550 | TCGA-66-2782-01A-01R-0851-07 LUSC LUSC 0 1
551 | TCGA-66-2783-01A-01R-1201-07 LUSC LUSC 0 1
552 | TCGA-66-2785-01A-01R-0851-07 LUSC LUSC 0.024 0.976
553 | TCGA-66-2786-01A-01R-0851-07 LUSC LUSC 0 1
554 | TCGA-66-2787-01A-01R-0980-07 LUSC LUSC 0 1
555 | TCGA-66-2788-01A-01R-0980-07 LUSC LUSC 0.002 0.998
556 | TCGA-66-2789-01A-01R-0980-07 LUSC LUSC 0.004 0.996
557 | TCGA-66-2790-01A-01R-0980-07 LUSC LUSC 0.008 0.992
558 | TCGA-66-2791-01A-01R-0980-07 LUSC LUSC 0 1
559 | TCGA-66-2792-01A-01R-0980-07 LUSC LUSC 0 1
560 | TCGA-66-2793-01A-01R-1201-07 LUSC LUSC 0.016 0.984
561 | TCGA-66-2794-01A-01R-1201-07 LUSC LUSC 0 1
562 | TCGA-66-2795-01A-02R-0980-07 LUSC LUSC 0.008 0.992
563 | TCGA-66-2800-01A-01R-1201-07 LUSC LUSC 0.002 0.998
564 | TCGA-70-6722-01A-11R-1820-07 LUSC LUSC 0.012 0.988
565 | TCGA-70-6723-01A-11R-1820-07 LUSC LUSC 0.026 0.974
566 | TCGA-77-6842-01A-11R-1949-07 LUSC LUSC 0.002 0.998
567 | TCGA-77-6843-01A-11R-1949-07 LUSC LUSC 0.006 0.994
568 | TCGA-77-6844-01A-11R-1949-07 LUSC LUSC 0.004 0.996
569 | TCGA-77-6845-01A-11R-1949-07 LUSC LUSC 0 1
570 | TCGA-79-5596-01A-31R-1949-07 LUSC LUSC 0 1
571 | TCGA-85-6175-01A-11R-1820-07 LUSC LUSC 0.138 0.862
572 | TCGA-85-6560-01A-11R-1820-07 LUSC LUAD 0.882 0.118
573 | TCGA-85-6561-01A-11R-1820-07 LUSC LUSC 0.02 0.98
574 | TCGA-85-6798-01A-11R-1949-07 LUSC LUSC 0 1
575 | TCGA-90-6837-01A-11R-1949-07 LUSC LUSC 0 1
576 | TCGA-94-7033-01A-11R-1949-07 LUSC LUSC 0.008 0.992
577 |
--------------------------------------------------------------------------------
/Codes/BuildMatrixFile.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 | from utilities import *
3 |
# Combine per-sample, two-column (feature <TAB> value) files into one matrix
# with features as rows and samples as columns.
#
# Usage: python BuildMatrixFile.py <inFilePattern> <outFilePath>
#   inFilePattern  glob pattern matching the per-sample input files
#   outFilePath    path of the matrix file to create
inFilePattern = sys.argv[1]
outFilePath = sys.argv[2]

# Written for any feature that a given sample file does not contain.
MISSING_VALUE = "NA"

def iterRows(filePath):
    """Yield every line of filePath as a list of tab-separated fields.

    Trailing whitespace is stripped before splitting. The file is closed as
    soon as iteration finishes.
    """
    with open(filePath) as inFile:
        for line in inFile:
            yield line.rstrip().split("\t")

# Sorted so that the column order of the output is deterministic.
inFilePaths = sorted(glob.glob(inFilePattern))

# Pass 1: collect the union of feature names across all input files.
features = set()
for inFilePath in inFilePaths:
    print("Identifying features in %s" % inFilePath)
    for lineItems in iterRows(inFilePath):
        features.add(lineItems[0])
features = sorted(features)

# Pass 2: build one row per sample (header row first); transposed on output.
outData = [[""] + features]
for inFilePath in inFilePaths:
    print("Parsing and saving values for %s" % inFilePath)
    sampleID = os.path.basename(inFilePath)

    valueDict = {}
    for lineItems in iterRows(inFilePath):
        valueDict[lineItems[0]] = lineItems[1]

    # The feature list is a union, so an individual file may lack some of the
    # features; fill those cells rather than failing with a KeyError.
    numMissing = len([feature for feature in features if feature not in valueDict])
    if numMissing > 0:
        print("%i features not found in %s; using %s" % (numMissing, inFilePath, MISSING_VALUE))

    values = [valueDict.get(feature, MISSING_VALUE) for feature in features]
    outData.append([sampleID] + values)

print("Transposing and saving to %s" % outFilePath)
writeMatrixToFile(transposeMatrix(outData), outFilePath)
32 |
--------------------------------------------------------------------------------
/Codes/CalcAUC.R:
--------------------------------------------------------------------------------
1 | library(pROC)
2 |
# Draw a ROC curve with a sensitivity confidence band for one set of
# predictions and print the AUC with its confidence bounds.
#
# Usage: Rscript CalcAUC.R <inFilePath> <actualColumnName> <probabilitiesColumnName> <outFilePath> <main>
#   inFilePath               tab-delimited file with a header row
#   actualColumnName         column holding the true class
#   probabilitiesColumnName  column holding the predicted probability
#   outFilePath              PDF file to create
#   main                     plot title
#
# trailingOnly=TRUE returns only the script's own arguments, so the positions
# no longer depend on how many interpreter options precede them.
args = commandArgs(trailingOnly=TRUE)
inFilePath = args[1]
actualColumnName = args[2]
probabilitiesColumnName = args[3]
outFilePath = args[4]
main = args[5]

data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)

actual = as.factor(data[,actualColumnName])
probabilities = as.numeric(data[,probabilitiesColumnName])

pdf(outFilePath)
par(mar=c(4.5, 4.7, 0.0, 0.5), lwd=4)

# ci=TRUE stores the AUC confidence interval on the result; its three elements
# are used below as lower bound, AUC and upper bound.
roc_result = roc(actual ~ probabilities, ci=TRUE, plot=TRUE, print.auc=FALSE)
lowerBoundAuc = format(roc_result$ci[1], digits=3)
midAuc = format(roc_result$ci[2], digits=3)
upperBoundAuc = format(roc_result$ci[3], digits=3)

# Top-level expression: its value is printed to standard output.
ci(roc_result)
sens.ci <- ci.se(roc_result)
plot(sens.ci, type="shape", col="gray95")
plot(sens.ci, type="bars")
# Redraw the curve on top of the confidence band.
plot(roc_result, add=TRUE)

text(0.5, 0.00, labels=paste("AUC: ", midAuc, " (", lowerBoundAuc, "-", upperBoundAuc, ")", sep=""))
title(main)

par(mar=c(5.1, 4.1, 2.1, 2.1))
graphics.off()

print(c(lowerBoundAuc, midAuc, upperBoundAuc))
35 |
--------------------------------------------------------------------------------
/Codes/CalcAccuracy.R:
--------------------------------------------------------------------------------
1 | library(pROC)
2 |
# Print the fraction of rows whose predicted class equals the actual class.
#
# Usage: Rscript CalcAccuracy.R <inFilePath> <actualColumnName> <predColumnName>
#   inFilePath        tab-delimited file with a header row
#   actualColumnName  column holding the true class
#   predColumnName    column holding the predicted class
#
# trailingOnly=TRUE returns only the script's own arguments, so the positions
# no longer depend on how many interpreter options precede them.
args = commandArgs(trailingOnly=TRUE)
inFilePath = args[1]
actualColumnName = args[2]
predColumnName = args[3]

data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)

actual = data[,actualColumnName]
pred = data[,predColumnName]

# Matching rows divided by all rows in the file.
accuracy = sum(actual == pred) / nrow(data)

print(accuracy)
15 |
--------------------------------------------------------------------------------
/Codes/Classify_luad_vs_lusc.R:
--------------------------------------------------------------------------------
1 | library(caret)
2 |
# Cross-validated random-forest classification of LUAD versus LUSC samples,
# run separately on the "12" and the "20" versions of the data, with the
# per-sample predictions of each run written to its own file.
#
# The output paths are relative, and the working directory is changed to
# "Analysis_datasets" below, so the files end up in that directory.
outFilePath12 = "Classification_12_LUAD_LUSC_Predictions.txt"
outFilePath20 = "Classification_20_LUAD_LUSC_Predictions.txt"

# Read the four input tables (first column = row names) and place the LUAD and
# LUSC columns side by side for each data version.
setwd("Analysis_datasets")
luad12 = read.table("12_LUAD_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
lusc12 = read.table("12_LUSC_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
lu12 = cbind(luad12,lusc12)
luad20 = read.table("20_LUAD_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
lusc20 = read.table("20_LUSC_t.txt", sep="\t", stringsAsFactors=F, header=TRUE, row.names=1, check.names=F)
lu20 = cbind(luad20,lusc20)

# Keep only the "20" samples (columns) that also occur in the "12" data.
# NOTE(review): lu12 is not filtered the other way round, so the two data sets
# only contain the same samples if every lu12 sample is also in lu20 - confirm.
lu20_f = lu20[,colnames(lu20)%in%colnames(lu12)]

# The last row of each table is used as the class label of each sample; the
# remaining rows become the feature matrix, transposed so samples are rows.
# NOTE(review): with the label row present the columns are presumably read as
# character; check that data.matrix() yields the numeric values and not factor
# codes for the installed R version.
classes12 = as.factor(as.character(lu12[nrow(lu12),]))
data12 = t(data.matrix(lu12[-nrow(lu12),]))
classes20 = as.factor(as.character(lu20_f[nrow(lu20_f),]))
data20 = t(data.matrix(lu20_f[-nrow(lu20_f),]))

# Retain features that do not have zero variance (evaluated per data set, so
# the two feature sets may differ)
data12 = data12[,which(apply(data12, 2, var) > 0)]
data20 = data20[,which(apply(data20, 2, var) > 0)]

# Set random seed so results are same each time
set.seed(0)

# Build the classification models: random forest ("rf"), three candidate
# tuning values, cross-validation with predictions and class probabilities
# requested. The seed is reset so both runs start from the same state.
# NOTE(review): savePred / classProb are abbreviated argument names that
# presumably resolve to savePredictions / classProbs - confirm.
mod12 <- train(classes12~., data=data12, method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T))
set.seed(0)
mod20 <- train(classes20~., data=data20, method = "rf", tuneLength = 3, trControl = trainControl(method = "cv", savePred=T, classProb=T))

# Determine which mtry parameter value performed best (highest Accuracy)
tuneResults12 = mod12$results[order(mod12$results$Accuracy, decreasing=TRUE),]
bestMtry12 = tuneResults12[1,]$mtry
tuneResults20 = mod20$results[order(mod20$results$Accuracy, decreasing=TRUE),]
bestMtry20 = tuneResults20[1,]$mtry

# Select predictions that coincide with best mtry parameter
predictions12 = mod12$pred[which(mod12$pred$mtry == bestMtry12),]
predictions20 = mod20$pred[which(mod20$pred$mtry == bestMtry20),]

# Sort predictions by the original order
predictions12 = predictions12[order(predictions12$rowIndex),]
predictions20 = predictions20[order(predictions20$rowIndex),]

# Build output matrix: sample ID, then prediction column 2 (labelled
# ActualClass), column 1 (labelled PredictedClass), then columns 3 up to the
# fourth-from-last, which are labelled "<name>_Probability".
# Pairing with rownames() assumes exactly one prediction row per sample.
# NOTE(review): the positional column selection depends on the layout of
# caret's $pred table - confirm against the installed caret version.
output12 = cbind(rownames(data12), predictions12[,2], predictions12[,1], predictions12[,3:(ncol(predictions12) - 3)])
colnames(output12) = c("SampleID", "ActualClass", "PredictedClass", paste(colnames(output12)[4:ncol(output12)], "Probability", sep="_"))
output20 = cbind(rownames(data20), predictions20[,2], predictions20[,1], predictions20[,3:(ncol(predictions20) - 3)])
colnames(output20) = c("SampleID", "ActualClass", "PredictedClass", paste(colnames(output20)[4:ncol(output20)], "Probability", sep="_"))

# Save predictions to output file (tab-delimited, header row, no quoting)
write.table(output12, outFilePath12, sep="\t", col.names=T, row.names=F, quote=F)
write.table(output20, outFilePath20, sep="\t", col.names=T, row.names=F, quote=F)
59 |
--------------------------------------------------------------------------------
/Codes/CombineScalarValues.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 | from utilities import *
3 |
# Write one "<file name> <TAB> <scalar value>" line for every file matching a
# glob pattern.
#
# Usage: python CombineScalarValues.py <inFilePattern> <outFilePath>
inFilePattern = sys.argv[1]
outFilePath = sys.argv[2]

# The with-block closes the output file even if reading an input file fails.
with open(outFilePath, 'w') as outFile:
    # glob.glob() returns matches in arbitrary order; sort them so the output
    # is the same from run to run.
    for inFilePath in sorted(glob.glob(inFilePattern)):
        outFile.write("%s\t%s\n" % (os.path.basename(inFilePath), readScalarFromFile(inFilePath)))
13 |
--------------------------------------------------------------------------------
/Codes/FileContainsText.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 | from utilities import *
3 |
# Print True or False depending on whether the text of a file contains a
# search string.
#
# Usage: python FileContainsText.py <inFilePath> <searchPattern>
inFilePath = sys.argv[1]

# The 'string-escape' codec (Python 2) interprets backslash escapes written
# literally on the command line, such as \t or \n.
searchPattern = sys.argv[2].decode('string-escape')

fileText = readTextFromFile(inFilePath)
isPresent = searchPattern in fileText

print(isPresent)
8 |
--------------------------------------------------------------------------------
/Codes/GetFileExtension.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 |
# Print the extension (including the leading dot) of the given path.
#
# Usage: python GetFileExtension.py <inFilePath>
inFilePath = sys.argv[1]

# os.path.splitext() returns (root, extension). Only the extension is needed;
# indexing avoids binding the unused root to the name `file`, which shadowed
# the Python 2 builtin of the same name.
ext = os.path.splitext(inFilePath)[1]

print(ext)
8 |
--------------------------------------------------------------------------------
/Codes/IdentifyDiscordantPredictions.R:
--------------------------------------------------------------------------------
# Count the samples that were predicted incorrectly and how many of those are
# on a list of potentially discordant samples.
#
# Usage: Rscript IdentifyDiscordantPredictions.R <inFilePath> <actualColumnName> <predictedColumnName> [potentiallyDiscordantFilePath]
#   inFilePath                     tab-delimited predictions file with a header row
#   actualColumnName               column holding the true class
#   predictedColumnName            column holding the predicted class
#   potentiallyDiscordantFilePath  file listing sample IDs; optional
#
# trailingOnly=TRUE returns only the script's own arguments, so the positions
# no longer depend on how many interpreter options precede them.
args = commandArgs(trailingOnly=TRUE)
inFilePath = args[1]
actualColumnName = args[2]
predictedColumnName = args[3]
potentiallyDiscordantFilePath = args[4]

# The path argument used to be read but ignored in favour of this fixed file
# name; the fixed name is kept as the default when no path is supplied.
if (is.na(potentiallyDiscordantFilePath))
  potentiallyDiscordantFilePath = "Potentially_Discordant_LUSC_Samples.txt"

data = read.table(inFilePath, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)

incorrect = data[which(data[,actualColumnName]!=data[,predictedColumnName]),]

potentiallyDiscordantSamples = scan(potentiallyDiscordantFilePath, what=character(), quiet=TRUE)

print("Samples predicted incorrectly:")
print(nrow(incorrect))

# Sample IDs are looked up in the column named "row.names".
# NOTE(review): that column presumably only exists when the header row has one
# field fewer than the data rows - confirm the input format.
print("Samples predicted incorrectly that were identified previously as potentially discordant:")
print(nrow(incorrect[which(incorrect$row.names %in% potentiallyDiscordantSamples),]))
17 |
--------------------------------------------------------------------------------
/Codes/IdentifyInconsistentPredictions.R:
--------------------------------------------------------------------------------
# Compare two prediction files and print the samples that were predicted
# incorrectly in exactly one of them.
#
# Usage: Rscript IdentifyInconsistentPredictions.R <inFilePath1> <inFilePath2> <actualColumnName> <predictedColumnName>
#   inFilePath1, inFilePath2  tab-delimited prediction files with a header row
#   actualColumnName          column holding the true class (same in both files)
#   predictedColumnName       column holding the predicted class (same in both files)
#
# trailingOnly=TRUE returns only the script's own arguments, so the positions
# no longer depend on how many interpreter options precede them.
args = commandArgs(trailingOnly=TRUE)
inFilePath1 = args[1]
inFilePath2 = args[2]
actualColumnName = args[3]
predictedColumnName = args[4]

data1 = read.table(inFilePath1, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)
data2 = read.table(inFilePath2, sep="\t", stringsAsFactors=F, header=TRUE, row.names=NULL, check.names=F)

incorrect1 = data1[which(data1[,actualColumnName]!=data1[,predictedColumnName]),]
incorrect2 = data2[which(data2[,actualColumnName]!=data2[,predictedColumnName]),]

# Number of misclassified samples in each file.
print(nrow(incorrect1))
print(nrow(incorrect2))

# Samples misclassified in one file but not in the other, in either direction.
# Sample IDs are taken from the column named "row.names".
diff12 = setdiff(incorrect1$row.names, incorrect2$row.names)
diff21 = setdiff(incorrect2$row.names, incorrect1$row.names)
diffs = c(diff12, diff21)

print("Samples predicted inconsistently between two data sets:")
# Join the two tables on their first column so both predictions appear on one row.
data = merge(data1, data2, by=1)
print(data[which(data$row.names %in% diffs),])
22 |
--------------------------------------------------------------------------------
/Codes/LUSC_vs_LUAD.R:
--------------------------------------------------------------------------------
1 | library(data.table)
2 | library(stringr)
3 | library(heatmap3)
4 | library(caret)
5 | library(pROC)
6 |
# Load a delimited file into a numeric matrix.
#
# The final row of the file is discarded (presumably the class-label row -
# confirm against the input files), the first column supplies the row names,
# and the remaining columns are converted with data.matrix().
#
# filePath      path of the file to read with fread()
# logTransform  when TRUE, return log2(value + 1) instead of the raw values
#
# NOTE(review): data.frame() is called with its default name checking, so
# column names may be altered (e.g. "-" to "."). Also, if fread() reads the
# columns as character because of the final row, check that data.matrix()
# yields the numeric values rather than factor codes.
readData = function(filePath, logTransform=FALSE)
{
  raw = fread(filePath)

  # Everything except the last row.
  frame = data.frame(raw[-nrow(raw),])

  rownames(frame) = frame[,1]
  values = data.matrix(frame[,-1])

  if (!logTransform)
    return(values)

  return(log2(values + 1))
}
21 |
# Join two tables on their row names and return the combined table.
#
# Only rows present in both inputs are kept (merge() default), row order is
# not re-sorted, and the key column added by merge() is moved back into the
# row names.
#
# data1, data2  matrices or data frames with row names
# Returns a data frame with the columns of data1 followed by those of data2.
mergeData = function(data1, data2)
{
  merged = merge(data1, data2, by=0, sort=FALSE)

  # With by=0 the first column of the result holds the row names.
  rownames(merged) = merged[,1]
  merged = merged[,-1]

  # Return explicitly instead of relying on the invisible value of the
  # assignment above.
  return(merged)
}
28 |
# Cross-validate a random-forest classifier and write its results to files.
#
# data       matrix or data frame with genes as rows and samples as columns
# outPrefix  prefix for the three output files:
#              <outPrefix>_Dimensions.txt         dimensions after filtering
#              <outPrefix>_Predictions.txt        per-sample predictions
#              <outPrefix>_FeatureImportance.txt  variable importance, descending
#
# The class labels are not a parameter: the model formula refers to a variable
# named `classes` that must exist in the calling environment, with one entry
# per column of `data`.
crossValidate = function(data, outPrefix)
{
  # Remove any genes with no variance
  data = data[which(apply(data, 1, var) > 0),]
  write.table(dim(data), paste(outPrefix, "_Dimensions.txt", sep=""))

  library(doParallel)
  registerDoParallel(cores=12)

  # From http://stackoverflow.com/questions/13403427/fully-reproducible-parallel-models-using-caret
  # One vector of seeds per resample (10 folds x 3 tuning values) plus a single
  # seed for the final model.
  set.seed(0)
  seeds <- vector(mode = "list", length = 11) # length is = (n_repeats*nresampling)+1
  for(i in 1:10) seeds[[i]] <- sample.int(n=1000, 3) # 3 = number of tuning parameter values (mtry for rf)
  seeds[[11]] <- sample.int(1000, 1) # for the last model

  # The seeds belong to trainControl(). They were previously passed to train(),
  # which forwards unknown arguments to the model function where they had no
  # effect - the reason repeated runs did not give the same results.
  control <- trainControl(method = "cv", savePredictions=TRUE, classProbs=TRUE, seeds=seeds)

  model <- train(classes~., data=t(data), method = "rf", tuneLength = 3, trControl = control)

  tuneResults = model$results[order(model$results$Accuracy, decreasing=TRUE),]
  bestMtry = tuneResults[1,]$mtry

  # Select predictions that coincide with best mtry parameter
  predictions = model$pred[which(model$pred$mtry == bestMtry),]

  # Sort predictions by the original order
  predictions = predictions[order(predictions$rowIndex),]

  # Label each prediction with its sample name, turning "." back into "-".
  rownames(predictions) = gsub("\\.", "-", colnames(data))

  write.table(predictions, paste(outPrefix, "_Predictions.txt", sep=""), sep="\t", quote=F, row.names=T, col.names=T)

  featureImportance <- varImp(model, scale = TRUE)$importance
  featureImportance <- featureImportance[order(featureImportance$Overall, decreasing=TRUE),,drop=FALSE]
  write.table(featureImportance, paste(outPrefix, "_FeatureImportance.txt", sep=""), quote=FALSE, row.names=T, col.names=NA, sep="\t")
}
64 |
# Pick the genes whose mean expression differs most between two groups.
#
# For every row the ratio (mean in data1 + 1) / (mean in data2 + 1) is
# computed; the n rows with the largest ratios and the n rows with the
# smallest ratios are reported.
#
# data1, data2  matrices with genes as rows, in the same row order
# n             number of genes to take from each end of the ranking
# Returns a character vector of row names: the top n followed by the bottom n.
identifyDiffExpressedGenes = function(data1, data2, n)
{
  meanRatios = (apply(data1, 1, mean) + 1) / (apply(data2, 1, mean) + 1)
  ranked = sort(meanRatios, decreasing=TRUE)

  highest = names(head(ranked, n=n))
  lowest = names(tail(ranked, n=n))

  return(c(highest, lowest))
}
75 |
# ---------------------------------------------------------------------------
# Main analysis: load the LUAD and LUSC matrices of both data versions,
# restrict them to shared samples, cross-validate classifiers on several gene
# subsets, then export data and class labels for the potentially discordant
# LUSC samples.
# ---------------------------------------------------------------------------

# The "12" files are loaded as the TCGA data and log2(x + 1) transformed; the
# "20" files are loaded as the Rsubread data without transformation.
tcgaLuad = readData("12_LUAD_t.txt", logTransform=TRUE)
tcgaLusc = readData("12_LUSC_t.txt", logTransform=TRUE)
rsubreadLuad = readData("20_LUAD_t.txt")
rsubreadLusc = readData("20_LUSC_t.txt")

# Extract gene symbols from row names: keep the text before the first "|".
# NOTE(review): if several row names share the same text before "|", the
# name-based indexing below selects only the first match - confirm.
rownames(tcgaLuad) = sapply(rownames(tcgaLuad), function(x) { str_split(x, "\\|")[[1]][1] })
rownames(tcgaLusc) = sapply(rownames(tcgaLusc), function(x) { str_split(x, "\\|")[[1]][1] })

# Find genes that are common across both data sets
#   commonTcgaGenes      in both TCGA matrices
#   commonRsubreadGenes  in both Rsubread matrices
#   commonGenes          in all four matrices
#   nonOverlappingGenes  in the Rsubread matrices but not in the TCGA ones
commonTcgaGenes = intersect(rownames(tcgaLuad), rownames(tcgaLusc))
commonRsubreadGenes = intersect(rownames(rsubreadLuad), rownames(rsubreadLusc))
commonGenes = intersect(commonTcgaGenes, commonRsubreadGenes)
nonOverlappingGenes = setdiff(commonRsubreadGenes, commonTcgaGenes)

# Find samples that are common across both data sets
commonLuadSamples = intersect(colnames(tcgaLuad), colnames(rsubreadLuad))
commonLuscSamples = intersect(colnames(tcgaLusc), colnames(rsubreadLusc))

# Select common genes, samples of interest. Each platform keeps all genes
# shared by its own LUAD and LUSC matrices (not only commonGenes).
tcgaLuad = tcgaLuad[commonTcgaGenes,commonLuadSamples]
tcgaLusc = tcgaLusc[commonTcgaGenes,commonLuscSamples]
rsubreadLuad = rsubreadLuad[commonRsubreadGenes,commonLuadSamples]
rsubreadLusc = rsubreadLusc[commonRsubreadGenes,commonLuscSamples]

# Class labels in column order (LUAD samples first, then LUSC). crossValidate()
# reads this global through its model formula.
classesLuad = rep("LUAD", ncol(tcgaLuad))
classesLusc = rep("LUSC", ncol(tcgaLusc))
classes = as.factor(c(classesLuad, classesLusc))

tcga = mergeData(tcgaLuad, tcgaLusc)
rsubread = mergeData(rsubreadLuad, rsubreadLusc)

# Remove any genes with no variance (crossValidate() applies the same filter
# again to whatever subset it receives)
tcga = tcga[which(apply(tcga, 1, var) > 0),]
rsubread = rsubread[which(apply(rsubread, 1, var) > 0),]

# Cross-validate on all genes of each platform, on the genes shared by both
# platforms, and on the Rsubread-only genes.
# NOTE(review): the gene lists were computed before the variance filter above,
# so they may name rows that are no longer present - confirm this is harmless.
crossValidate(tcga, "TCGA_AllGenes")
crossValidate(rsubread, "RSubread_AllGenes")
crossValidate(tcga[commonGenes,], "TCGA_CommonGenes")
crossValidate(rsubread[commonGenes,], "RSubread_CommonGenes")
crossValidate(rsubread[nonOverlappingGenes,], "RSubread_NonOverlappingGenes")

# Identify top differentially expressed genes (100 from each end of the
# ranking). Neither result is used again in the rest of this script.
tcgaDiffExpressedGenes = identifyDiffExpressedGenes(tcgaLuad, tcgaLusc, 100)
rsubreadNonOverlappingDiffExpressedGenes = identifyDiffExpressedGenes(rsubreadLuad[nonOverlappingGenes,], rsubreadLusc[nonOverlappingGenes,], 100)

# Get potentially discordant samples. IDs in the file use "-"; they are
# converted to "." before being matched against the column names, and only
# IDs present in the shared LUSC samples are kept.
luscDiscordantSamples = scan("Potentially_Discordant_LUSC_Samples.txt", what=character(), quiet=TRUE)
luscDiscordantSamples = str_replace_all(luscDiscordantSamples, "\\-", ".")
luscDiscordantSamples = intersect(luscDiscordantSamples, commonLuscSamples)

# Rebuild the TCGA table with columns ordered: LUAD, remaining LUSC, then the
# discordant LUSC samples.
tcgaLuscDiscordant = tcgaLusc[,luscDiscordantSamples]
tcgaLusc = tcgaLusc[,setdiff(colnames(tcgaLusc), luscDiscordantSamples)]
tcga = mergeData(tcgaLuad, tcgaLusc)
tcga = mergeData(tcga, tcgaLuscDiscordant)

# Same reordering for the Rsubread table.
rsubreadLuscDiscordant = rsubreadLusc[,luscDiscordantSamples]
rsubreadLusc = rsubreadLusc[,setdiff(colnames(rsubreadLusc), luscDiscordantSamples)]
rsubread = mergeData(rsubreadLuad, rsubreadLusc)
rsubread = mergeData(rsubread, rsubreadLuscDiscordant)

# The gene list is fixed by hand; the computed alternative is left disabled.
#discordantDiffExpressedGenes = identifyDiffExpressedGenes(rsubreadLuad[nonOverlappingGenes,], rsubreadLuscDiscordant[nonOverlappingGenes,], 5)
discordantDiffExpressedGenes = c("MIR320A", "MIR1234", "MIR4461", "MIR186")

# Restore "-" in the sample names before writing the selected genes.
colnames(rsubread) = str_replace_all(colnames(rsubread), "\\.", "-")
write.table(rsubread[discordantDiffExpressedGenes,], "RSubread_Discordant_DiffExpressedGenes_Data.txt", sep="\t", quote=F, col.names=NA, row.names=T)

# Class label per column, in the column order built above; this replaces the
# global `classes` used earlier by crossValidate().
classes = c(classesLuad, rep("LUSC", ncol(rsubreadLusc)), rep("Discordant LUSC", ncol(rsubreadLuscDiscordant)))
classes = cbind(colnames(rsubread), classes)
write.table(classes, "RSubread_Discordant_Classes.txt", sep="\t", quote=F, col.names=F, row.names=F)
146 |
--------------------------------------------------------------------------------
/Codes/ParseCgHubQueryResults.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 |
3 | inFilePath = sys.argv[1]
4 | sampleFilePath = sys.argv[2]
5 | outDownloadSamplesDirPath = sys.argv[3]
6 | outCancerTypesDirPath = sys.argv[4]
7 |
def parseTagValue(lines, key):
    """Return the text of the first "<key>value</key>" line, or None.

    lines -- iterable of text lines; each is stripped of surrounding
             whitespace before it is examined
    key   -- tag name without angle brackets

    Only lines that start with the opening tag are considered. The opening and
    closing tags are removed and everything else on the line is returned, so a
    value that itself contains "/" is preserved.
    """
    openTag = "<%s>" % key
    closeTag = "</%s>" % key

    for line in lines:
        line = line.strip()

        if line.startswith(openTag):
            # Remove only the tags; deleting every "/" would also damage
            # slashes that are part of the value.
            return line.replace(closeTag, "").replace(openTag, "")

    return None
16 |
def saveOutput(outLines):
    """Write the analysis ID and cancer type of one query result to files.

    outLines -- the lines of a single result entry

    The entry is processed when no sample file was given (sampleFilePath is
    empty) or when its legacy sample ID is in samplesToKeep. Two files named
    after the legacy sample ID are written: one in outDownloadSamplesDirPath
    holding the analysis ID and one in outCancerTypesDirPath holding the
    disease abbreviation.

    Returns the legacy sample ID when the files were written, otherwise None
    (sample not selected, or no analysis ID in the entry). Exits the program
    with status 1 when the entry has an analysis ID but no cancer type.
    """
    legacyID = parseTagValue(outLines, "legacy_sample_id")

    if sampleFilePath != "" and legacyID not in samplesToKeep:
        return None

    analysisID = parseTagValue(outLines, "analysis_id")
    if analysisID is None:
        return None

    outFilePath = "%s/%s" % (outDownloadSamplesDirPath, legacyID)
    if os.path.exists(outFilePath):
        # Informational only: the existing file is overwritten below.
        print("%s already exists" % outFilePath)
    with open(outFilePath, 'w') as outFile:
        outFile.write("%s\n" % analysisID)

    cancerType = parseTagValue(outLines, "disease_abbr")
    if cancerType is None:
        print("Cancer type was not specified for %s." % analysisID)
        # sys.exit() is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(1)
    with open("%s/%s" % (outCancerTypesDirPath, legacyID), 'w') as outFile:
        outFile.write("%s\n" % cancerType)

    return legacyID
42 |
# Read the whole query result; open() replaces the Python-2-only file()
# builtin and the context manager closes the handle.
with open(inFilePath) as inFile:
    inFileLines = inFile.readlines()

# Two header lines always precede the records.
headerLine1 = inFileLines.pop(0)
headerLine2 = inFileLines.pop(0)

# When the next line mentions "Query", two further lines are discarded.
if "Query" in inFileLines[0]:
    inFileLines.pop(0)
    inFileLines.pop(0)

# The last line is a footer, not part of any record.
footerLine = inFileLines.pop()

# Optional sample filter: one ID per line.
if sampleFilePath != "":
    with open(sampleFilePath) as sampleFile:
        samplesToKeep = set(line.rstrip() for line in sampleFile)

samplesSaved = set()

outLines = []
60 |
61 | for line in inFileLines:
62 | if " 0),]
29 |
30 | accent = brewer.pal(8, "Accent")
31 | set3 = brewer.pal(12, "Set3")
32 | cols = c(accent[1], set3[12], accent[7])
33 |
34 | ColSideColors = as.character(classes[,1])
35 | ColSideColors[ColSideColors=="LUAD"] = cols[1]
36 | ColSideColors[ColSideColors=="LUSC"] = cols[2]
37 | ColSideColors[ColSideColors=="Discordant LUSC"] = cols[3]
38 | par(lwd=4)
39 | if (nrow(data) <= 10)
40 | for (gene in rownames(data))
41 | plotHist(gene, data, classes[,1], paste("RSubread_", gene, "_Histogram.pdf", sep=""))
42 |
43 | pdf("RSubread_Discordant_Heatmap.pdf")
44 | colnames(data) = rep("", ncol(data))
45 | if (nrow(data) > 20)
46 | rownames(data) = rep("", nrow(data))
47 | heatmap3(data, Colv=NA, Rowv=TRUE, showRowDendro=T, showColDendro=F, cexRow=3, margins=c(5, 12), ColSideColors=ColSideColors, ColSideLabs="", cex=1.5)
48 | legend("top", legend=c("LUAD", "LUSC", "Discordant LUSC"), col=cols, cex=1.1, lty=1, lwd=4, inset=-0.07, xpd=TRUE, box.lwd=0, box.lty=0, horiz=F)
49 | graphics.off()
50 |
--------------------------------------------------------------------------------
/Codes/PrintMatrixDimensions.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 | import utilities
3 |
# Report the dimensions of a tab-delimited file: columns are counted on the
# first line, rows are the total number of lines (first line included).
inFilePath = sys.argv[1]

with open(inFilePath) as inFile:
    numCols = len(inFile.readline().rstrip().split("\t"))
    # Start at 1 for the line already consumed above.
    numRows = 1 + sum(1 for _ in inFile)

print("Number Rows: %i" % numRows)
print("Number Columns: %i" % numCols)
15 |
--------------------------------------------------------------------------------
/Codes/ProcessClinicalData.R:
--------------------------------------------------------------------------------
# Combine the per-cancer clinical patient tables into one matrix holding every
# clinical variable, re-key the rows by the long sample IDs used in the
# RNA-Seq data, and write the result with samples as columns.
if (!require("plyr")) {
  install.packages("plyr", dependencies = TRUE)
  library(plyr)
}

# Accumulators. (The original `data==identifiers=...` was an invalid
# assignment and stopped the script here.)
data = identifier = tmp_data = tmp_identifier = NULL
dirname = '.'
setwd(dirname) # Set the directory where the clinical data is located, one folder per cancer
filenames <- system("ls */nationwidechildrens.org_clinical_patient*", intern = TRUE)

for (i in seq_along(filenames)) { # iterate over the clinical files, growing one matrix with ALL clinical variables
  print(i)
  # The listed paths are relative to the working directory set above, so they
  # are read as-is rather than glued onto dirname without a separator.
  f <- read.delim(filenames[i])
  tmp_data <- f[3:nrow(f), ]  # rows 3.. are the per-patient records
  tmp_identifier <- f[1:2, ]  # rows 1-2 are kept separately as identifier rows
  if (i == 1) {
    data <- tmp_data
    identifier <- tmp_identifier
  } else {
    # Stack old and new identifier rows (4 rows), let the new file's values
    # (rows 3-4) override wherever it provides them, then keep 2 rows.
    identifier <- rbind.fill.matrix(list(identifier, tmp_identifier))
    for (j in 1:ncol(identifier)) {
      if (!is.na(identifier[3, j])) {
        identifier[1, j] <- identifier[3, j]
        identifier[2, j] <- identifier[4, j]
      }
    }
    identifier <- identifier[1:2, ]
    data <- rbind.fill.matrix(list(data, tmp_data))
  }
}
rownames(data) <- data[, 2]

# Convert the short IDs used in the clinical data to the long IDs used in the
# RNA-Seq dataset via partial matching.
sample_names <- rownames(as.matrix(read.table("PANCAN24_CancerType_Samples.txt", row.names = 1, sep = '\t', check.names = F)))
partial_sample_names <- rownames(data)
counter = 0 # number of IDs that were replaced
for (j in seq_along(partial_sample_names)) {
  hit <- pmatch(partial_sample_names[j], sample_names, duplicates.ok = F)
  if (!is.na(hit)) {
    partial_sample_names[j] <- sample_names[hit]
    counter = counter + 1
  }
}

rownames(data) <- partial_sample_names
# NA-filled result sized from the inputs instead of a fixed 9264 x 548.
clinical_data <- matrix(NA, nrow = length(sample_names), ncol = ncol(data))
rownames(clinical_data) <- sample_names
colnames(clinical_data) <- colnames(data)
for (sample_id in rownames(clinical_data)) {
  if (sample_id %in% rownames(data)) {
    clinical_data[sample_id, ] <- data[sample_id, ]
  }
}
clinical_data_identifier <- cbind(t(identifier), t(clinical_data))
write.table(clinical_data_identifier, file = "TCGA_clinical_data_ordered_all_clinical_variables_samples_as_columns.txt", sep = '\t', col.names = NA, quote = F)
59 |
60 |
--------------------------------------------------------------------------------
/Codes/ProcessRnaSeqFeatureCounts.R:
--------------------------------------------------------------------------------
# Align one RNA-Seq sample with Rsubread, count reads per gene with
# featureCounts, and write the counting statistics, raw counts, FPKM and TPM.
library(Rsubread)
library(limma)
library(edgeR)
library(tools)
options(digits=2)

# Arguments are read by absolute position in commandArgs().
# NOTE(review): positions 7-15 presumably match one specific way of launching
# R for this script - confirm against the caller.
cliArgs = commandArgs()
referenceGenomeFastaFilePath = cliArgs[7]
inFilePath1 = cliArgs[8]
inFilePath2 = cliArgs[9] # "NULL" for single-end analyses or when a BAM file has been specified
gtfFilePath = cliArgs[10]
tempFilePrefix = cliArgs[11]
outFpkmFilePath = cliArgs[12]
outTpmFilePath = cliArgs[13]
outCountsFilePath = cliArgs[14]
outStatsFilePath = cliArgs[15]

memory = 4000
nthreads = 1

# Input format follows the extension of the first input file.
inputExtension = file_ext(inFilePath1)
if (inputExtension == "bam") {
  input_format = "BAM"
} else if (inputExtension %in% c("fastq", "fq")) {
  input_format = "FASTQ"
} else {
  input_format = "gzFASTQ"
}

outBamFilePath = paste(tempFilePrefix, "bam", sep=".")

referenceGenomeIndexFilePrefix = paste(referenceGenomeFastaFilePath, "__reference_index", sep="")

# Build the index only when its ".reads" file is not there yet.
indexReadsFilePath = paste(referenceGenomeIndexFilePrefix, ".reads", sep="")
if (!file.exists(indexReadsFilePath))
  buildindex(basename=referenceGenomeIndexFilePrefix, reference=referenceGenomeFastaFilePath, memory=memory)

# The literal string "NULL" on the command line means "no second read file".
if (inFilePath2 == "NULL")
  inFilePath2 = NULL

if (!file.exists(outBamFilePath))
  align(index=referenceGenomeIndexFilePrefix, readfile1=inFilePath1, readfile2=inFilePath2, output_file=outBamFilePath, nthreads=nthreads, input_format=input_format, tieBreakHamming=TRUE, unique=TRUE, indels=5)

isPaired = !is.null(inFilePath2)
fCountsList = featureCounts(outBamFilePath, annot.ext=gtfFilePath, isGTFAnnotationFile=TRUE, nthreads=nthreads, isPairedEnd=isPaired)
dgeList = DGEList(counts=fCountsList$counts, genes=fCountsList$annotation)
fpkm = rpkm(dgeList, dgeList$genes$Length)
# Rescale FPKM so the values sum to one million.
tpm = exp(log(fpkm) - log(sum(fpkm)) + log(1e6))

# Every output is a headerless, unquoted, tab-separated table.
writeTable = function(tab, path) {
  write.table(tab, path, sep="\t", col.names=FALSE, row.names=FALSE, quote=FALSE)
}
geneIDs = fCountsList$annotation[,1]

writeTable(fCountsList$stat, outStatsFilePath)
writeTable(cbind(geneIDs, fCountsList$counts), outCountsFilePath)
writeTable(cbind(geneIDs, fpkm), outFpkmFilePath)
writeTable(cbind(geneIDs, tpm), outTpmFilePath)

# Remove the temporary alignment and its indel side file.
unlink(outBamFilePath)
unlink(paste(outBamFilePath, ".indel", sep=""))
56 |
--------------------------------------------------------------------------------
/Codes/Split.py:
--------------------------------------------------------------------------------
1 | import os, sys, glob
2 |
# Split a gene-by-sample matrix into one "gene<TAB>value" file per sample.
# The first line names the samples (first cell skipped); every later line is
# a gene followed by one value per sample.  Output files are opened in append
# mode, so rerunning into a non-empty directory adds to existing files.
inFilePath = sys.argv[1]
outDirPath = sys.argv[2]

with open(inFilePath) as inFile:
    sampleIDs = inFile.readline().rstrip().split("\t")[1:]

    lineCount = 0

    for line in inFile:
        lineItems = line.rstrip().split("\t")
        gene = lineItems[0]
        values = lineItems[1:]

        # Fail before writing anything for this gene instead of part-way
        # through the samples.
        if len(values) < len(sampleIDs):
            raise ValueError("Row for %s has %i values but there are %i samples" % (gene, len(values), len(sampleIDs)))

        # Each file is opened and closed per write so the number of
        # simultaneously open handles stays at one.
        for sampleID, value in zip(sampleIDs, values):
            with open(outDirPath + "/" + sampleID, 'a') as outFile:
                outFile.write("%s\t%s\n" % (gene, value))

        lineCount += 1
        if lineCount % 1000 == 0:
            print(lineCount)
25 |
--------------------------------------------------------------------------------
/Codes/TransposeData.py:
--------------------------------------------------------------------------------
1 | import os, sys
2 | import utilities
3 |
# Transpose a tab-delimited matrix file.
inFilePath, outFilePath = sys.argv[1], sys.argv[2]

data = utilities.readMatrixFromFile(inFilePath)

# A header row exactly one cell shorter than the first data row is padded on
# the left with a single space before transposing.
headerIsShort = len(data) > 1 and len(data[0]) == len(data[1]) - 1
if headerIsShort:
    data[0].insert(0, " ")

transposed = utilities.transposeMatrix(data)
utilities.writeMatrixToFile(transposed, outFilePath)
13 |
--------------------------------------------------------------------------------
/Codes/biological_rep.R:
--------------------------------------------------------------------------------
# Return the columns of `matrix` whose sample ID shares its first three
# dash-separated fields with at least one other column.
#
# matrix: table with sample IDs as column names.
# Prints each derived key, a notice plus the sample ID for every replicate
# found, and a final count. Returns the replicate columns (all rows kept).
find_biological_replicate<-function(matrix){
  samples=colnames(matrix)
  # Split every ID once instead of re-splitting the whole vector per column.
  keys=vapply(strsplit(samples,'-'),function(parts) paste(parts[1:3],sep='',collapse='-'),character(1))
  replicated_keys=keys[duplicated(keys)]
  is_replicate=keys%in%replicated_keys

  for(i in seq_along(samples)){
    print(keys[i])
    if(is_replicate[i]){
      print("biological replicate found!!")
      # Previously print(rownames(samples))[i], which always printed NULL.
      print(samples[i])
    }
  }

  print(paste(sum(is_replicate),"samples are duplicated for biological replicates"))
  return (matrix[,is_replicate,drop=FALSE])
}
26 |
27 |
28 |
29 | library(data.table)
30 |
# Load the sample annotation and the tumor / normal TPM tables, then derive
# each sample's key from the first three dash-separated fields of its ID.
samples<-read.table("~/Downloads/GSE62944_TCGA_20_CancerType_Samples.txt",row.names=1)
# tcga20<-data.frame(fread("~/Desktop/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_FeatureCounts.txt"),row.names=1,check.names = F)
# dim(tcga20)
# filt_20<-find_biological_replicate(rownames(samples),tcga20)
# dim(filt_20)
# tcga20_zero<-apply(tcga20==0,2,sum)
tcga24_tpm<-data.frame(fread("~/Desktop/PANCAN24/PANCAN24.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_TPM.txt"),row.names=1,check.names = F)
# Was dim(tcga20_tpm): that object is only created further down the script.
dim(tcga24_tpm)

normal_tpm<-data.frame(fread("~/Desktop/PANCAN24/TCGA24.IlluminaHiSeq_RNASeqV2.normal_Rsubread_TPM.txt"),row.names=1,check.names = F)
dim(normal_tpm)
colnames(normal_tpm)

# Key = first three dash-separated fields of a sample ID.
sample_key<-function(ids){
  vapply(strsplit(ids,'-'),function(parts) paste(parts[1:3],sep='',collapse='-'),character(1))
}
s=sample_key(colnames(normal_tpm))
# `samples` ends up holding the tumor column names, as before.
samples=colnames(tcga24_tpm)
tumor_s=sample_key(samples)
53 |
54 |
55 |
# Build matching replicate tables from the Rsubread TPM data ("20") and the
# PANCAN12 gene expression data ("12"): same samples, same genes, columns
# sorted by sample ID.
tcga20_tpm<-data.frame(fread("~/Desktop/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_TPM_10_9.txt"),row.names=1,check.names = F)
dim(tcga20_tpm)
filt_20_tpm<-find_biological_replicate(tcga20_tpm)
dim(filt_20_tpm)
tcga20_zero<-apply(tcga20_tpm==0,2,sum)


rsem<-data.frame(fread("~/Desktop/PANCAN12.IlluminaHiSeq_RNASeqV2.geneExp.tumor_whitelist"),row.names=1,check.names = F)
filt_12<-find_biological_replicate(rsem)
dim(filt_12)##16 samples with 2 replicates
rsem_zero<-apply(rsem==0,2,sum)
# NOTE(review): the first 29 rows are dropped here; the reason is not visible
# in this script - confirm.
filt_12<-filt_12[30:nrow(filt_12),]
biological_rep_12<-subset(filt_12,select=colnames(filt_12)%in%colnames(filt_20_tpm))
# Row names are cut at the first "|" before being compared with the TPM table.
rownames_biological_rep_12<-gsub("[|].*","",rownames(biological_rep_12))

# biological_rep_20 has to exist before the "12" table is filtered against
# it; it used to be created only after that first use.
#biological_rep_20<-subset(filt_20,select=colnames(filt_20)%in%colnames(filt_12))
#biological_rep_20_o<-biological_rep_20[,order(colnames(biological_rep_20))]
biological_rep_20<-subset(filt_20_tpm,select=colnames(filt_20_tpm)%in%colnames(filt_12))

biological_rep_12_o<-biological_rep_12[rownames_biological_rep_12%in%rownames(biological_rep_20),order(colnames(biological_rep_12))]
dim(biological_rep_12_o)

biological_rep_20_o<-biological_rep_20[rownames(biological_rep_20)%in%rownames_biological_rep_12,order(colnames(biological_rep_20))]
dim(biological_rep_20_o)
79 |
80 |
# Per-sample log2 column totals for both replicate tables.
total_20<-log2(apply(biological_rep_20_o,2,sum))
total_12<-log2(apply(biological_rep_12_o,2,sum))


#plot(total_12,ylim=c(22.75,26.5),main="PANCAN12 Level 3",ylab="log2(Gene Counts)")
# NOTE(review): this PDF device is never closed anywhere in the visible
# script, so every later plot is also drawn into it - confirm where it
# should end.
pdf("~/Dropbox/Bioinformatics submission/Resubmission/scatter_plot.pdf")

cor_res_12=cor_res_20=NULL
# Columns are taken in adjacent pairs (1-2, 3-4, ...) for 13 pairs; for each
# pair draw a scatter plot per data set and record the Pearson correlation.
for(i in 1:13){
  first=i*2-1
  second=i*2

  ids_12=colnames(biological_rep_12_o)[c(first,second)]
  plot(log2(biological_rep_12_o[,first]+1),log2(biological_rep_12_o[,second]+1),xlim=c(0,20),ylim=c(0,20),
       xlab=paste(ids_12[1],"log2(Normalized gene counts)",sep='\n'),
       ylab=paste(ids_12[2],"log2(Normalized gene counts)",sep=' '))
  fit_12=cor.test(biological_rep_12_o[,first],biological_rep_12_o[,second])#,method="spearman")
  cor_res_12=rbind(cor_res_12,c(ids_12[1],round(total_12[first],digits = 3),ids_12[2],round(total_12[second],digits = 3),round(fit_12$estimate,digits = 3)))
  title(paste("TCGA Level 3 \nPearson's correlation=",round(fit_12$estimate,digits = 2),sep=""))
  print(i)
  print(colnames(biological_rep_12_o)[i])

  ids_20=colnames(biological_rep_20_o)[c(first,second)]
  plot(log2(biological_rep_20_o[,first]+1),log2(biological_rep_20_o[,second]+1),xlim=c(0,20),ylim=c(0,20),
       xlab=paste(ids_20[1],"log2(TPM)",sep='\n'),
       ylab=paste(ids_20[2],"log2(TPM)",sep=''))
  fit_20=cor.test(biological_rep_20_o[,first],biological_rep_20_o[,second])#,method="spearman")
  cor_res_20=rbind(cor_res_20,c(ids_20[1],round(total_20[first],digits = 3),ids_20[2],round(total_20[second],digits=3),round(fit_20$estimate,digits = 3)))
  title(paste("Rsubread TPM \nPearson's correlation=",round(fit_20$estimate,digits = 2),sep=""))
}
colnames(cor_res_12)=c("Replicate_1","log2 Level 3 gene counts","Replicate_2","log2 Level 3 gene counts","Pearson's correlation between replicates(Level 3)")
colnames(cor_res_20)=c("Replicate_1","log2 Rsubread gene counts","Replicate_2","log2 Rsubread gene counts","Pearson's correlation between replicates(Rsubread)")

# Histograms of the correlations; red line = mean, blue line = median.
par( mfrow = c(2, 1 ) ,lwd=4)
r_12=as.numeric(cor_res_12[,5])
hist(r_12,main = "TCGA Level 3 Two Replicates\n Each for 13 Samples",xlab = "Pearson's Correlation ", xlim=c(0.88,1),breaks = 5)
abline(v=mean(r_12),col="red")
abline(v=median(r_12),col="blue")
r_20=as.numeric(cor_res_20[,5])
hist(r_20,main = "Rsubread Replicates Two Replicates\n Each for 13 Samples",xlab = "Pearson's Correlation ", xlim=c(0.88,1),breaks = 5)
abline(v=mean(r_20),col="red")
abline(v=median(r_20),col="blue")

write.table(cbind(cor_res_12,cor_res_20),"~/Desktop/correlations.txt",sep='\t',col.names=NA,quote=1)
118 | #************************************************************************
119 |
120 |
121 |
122 | ####
# Empirical CDFs of log2(x+1) expression for one replicate pair, first from
# the TCGA Level 3 table and then from the Rsubread TPM table
# (blue = first replicate, brown = second).
ecdf_all_ex<-apply(log2(biological_rep_12_o[,c("TCGA-50-5066-01A-01R-1628-07","TCGA-50-5066-02A-11R-2090-07")]+1),2,ecdf)
plot(ecdf_all_ex[[1]],xlab="log2 Level 3 reads", ylab = NA,xlim=c(0,20),col="blue",main="TCGA Level 3",ylim=c(0,1),cex.axis=1.5, cex.lab=1.5)
lines(ecdf_all_ex[[2]],xlab=NA, ylab = NA,col="brown")

###using Rsubread pipeline aligned data
# ecdf_all<-apply(rsub_fpkmlog,2,ecdf)
# plot(ecdf_all[[1]],col="blue",main="Rsubread FPKM",ylim=c(0,1),xlim = c(0,20),cex.axis=1.5, cex.lab=1.5,xlab="log2(normalized expression)",ylab="Cumulative proportion")
# for(i in 2:12){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="blue")}
# for(i in 13:17){lines(ecdf_all[[i]],xlab=NA,ylab = NA,col="brown")}

ecdf_all_ex<-apply(log2(biological_rep_20_o[,c("TCGA-50-5066-01A-01R-1628-07","TCGA-50-5066-02A-11R-2090-07")]+1),2,ecdf)
# The stray trailing comma (an empty extra argument) was removed from this call.
plot(ecdf_all_ex[[1]],xlab="log2TPM reads", ylab = NA,xlim=c(0,20),col="blue",main="Rsubread",ylim=c(0,1),cex.axis=1.5, cex.lab=1.5)
lines(ecdf_all_ex[[2]],xlab=NA, ylab = NA,col="brown")
136 |
137 |
138 |
139 |
############zero
# Compare zero-expression calls between the TCGA pipeline output and the
# Rsubread output for the GFP18/HER2 data: first on the normalized/TPM
# tables, then on the raw count tables.
setwd("~/Dropbox/TCGA_RNASeq_Clinical/Analysis_datasets/")
rsem_her2_expected_counts<-read.table("GFP18_HER2_TCGA_Pipeline_Expected_Gene_Counts.txt", sep='\t', header=1, row.names=1, check.names=F)
feature<-read.table("GFP18_HER2_Rsubread_geneCounts.txt", sep='\t',header=1, row.names=1, check.names = F)
TCGA_her2<-read.table("GFP18_HER2_TCGA_Pipeline_Normalized_Genes_Results.txt", sep='\t', header=1, check.names=F)
rsub_tpm<-log2(read.table("GFP18_HER2_Rsubread_TPM.txt", sep='\t',header=1, row.names=1, check.names = F)+1)
TCGA_her2_filtered<-TCGA_her2[!duplicated(TCGA_her2$Gene),]
rownames(TCGA_her2_filtered)<-TCGA_her2_filtered$Gene
TCGA_her2<-subset(TCGA_her2_filtered,select=-Gene)
TCGA_her2_log2<-log2(subset(TCGA_her2_filtered,select=-Gene)+1)

# Count tables restricted to shared genes and sorted by gene. These objects
# are used further down but previously existed only in commented-out code,
# so the script stopped with "object not found".
feature_f<-feature[rownames(feature)%in%rownames(rsem_her2_expected_counts),]
rsem_f<-rsem_her2_expected_counts[rownames(rsem_her2_expected_counts)%in%rownames(feature),]
rsem_f_o<-rsem_f[order(rownames(rsem_f)),]
feature_f_o<-feature_f[order(rownames(feature_f)),]
zero_sum_r<-apply(feature_f==0,2,sum)
pro_r<-zero_sum_r/nrow(feature_f)
pro_r<-pro_r[order(names(pro_r))]
# NOTE(review): zero_sum was never defined anywhere; assumed to be the
# per-sample number of zero-count genes in rsem_f, mirroring zero_sum_r.
# It is sorted by sample name like pro_r so cbind() lines the rows up - confirm.
zero_sum<-apply(rsem_f==0,2,sum)
zero_sum<-zero_sum[order(names(zero_sum))]

# TRUE for rows where the condition holds in every column.
all_columns<-function(flags) apply(flags,1,mean)==1

# Number of genes that are zero in all of the first 12 columns of zero_tab
# while all of the first 12 columns of expr_tab lie in (lower, upper].
count_discordant<-function(zero_tab,expr_tab,lower,upper=Inf){
  keep<-all_columns(zero_tab[,1:12]==0)&all_columns(expr_tab[,1:12]>lower)&all_columns(expr_tab[,1:12]<=upper)
  nrow(expr_tab[keep,])
}

# Print one line per expression bin: any, 1-100, 101-1000, 1001-10000, 10000+.
report_discordant<-function(zero_tab,expr_tab,messages){
  lower<-c(0,0,100,1000,10000)
  upper<-c(Inf,100,1000,10000,Inf)
  for(k in seq_along(messages)){
    print(paste(messages[k],count_discordant(zero_tab,expr_tab,lower[k],upper[k]),sep=" "))
  }
}

rsubread_nonzero_messages<-c("Total number of nonzero rsubread but zero expressing TCGA genes:",
                             "Total number of 1-100 reads in rsubread but zero expressing TCGA genes:",
                             "Total number of 101-1000 rsubreads but zero expressing TCGA genes:",
                             "Total number of 1001-10000 rsubreads but zero expressing TCGA genes:",
                             "Total number of 10000+ rsubreads but zero expressing TCGA genes:")
tcga_nonzero_messages<-c("Total number of nonzero TCGA reads but zero expressing Rsubread genes:",
                         "Total number of 1-100 reads in TCGA but zero expressing Rsubread genes:",
                         "Total number of 101-1000 TCGA but zero expressing Rsubread genes:",
                         "Total number of 1001-10000 TCGA but zero expressing Rsubread genes:",
                         "Total number of 10000+ TCGA but zero expressing Rsubread genes:")

# Two histograms of zero-gene counts per sample (median as a red dashed
# line), followed by the table of zero proportions.
plot_and_write_zero_summary<-function(layout,xlab,main_tcga,main_rsubread,prop_names){
  par( mfrow = layout ,lwd=4)
  hist(apply(zero_genes_rsem[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab=xlab,main=main_tcga,breaks=12)
  abline(v=median(apply(zero_genes_rsem[,1:12]==0,2,sum)),col="red",lty=2)

  hist(apply(zero_genes_f[,1:12]==0,2,sum),ylim=c(0,10),xlim=c(3900,6500),xlab=xlab,main=main_rsubread,breaks=2)
  abline(v=median(apply(zero_genes_f[,1:12]==0,2,sum)),col="red",lty=2)
  pro_t<-zero_sum/nrow(rsem_f)
  prop<-cbind(pro_t,pro_r)
  colnames(prop)<-prop_names
  write.table(prop,"~/Dropbox/Bioinformatics submission/Resubmission/zero_prop.txt",sep='\t',col.names = NA,quote=F)
}

com_genes_TCGA<-TCGA_her2[rownames(TCGA_her2)%in%rownames(rsub_tpm),]
com_genes_TCGA<-com_genes_TCGA[order(rownames(com_genes_TCGA)),]
com_genes_tpm<-rsub_tpm[rownames(rsub_tpm)%in%rownames(com_genes_TCGA),]
com_genes_tpm<-com_genes_tpm[order(rownames(com_genes_tpm)),]
zero_genes_rsem<-com_genes_TCGA[apply(com_genes_TCGA[,1:12]==0,1,mean)!=0,1:12]#atleast one zero in 12 GFP replicates
sum_zero_genes_rsem<-mean(apply(zero_genes_rsem==0,1,sum))##average of how many replicates have same zero expression


nrow(zero_genes_rsem)
nrow(zero_genes_rsem)/nrow(com_genes_TCGA)
mean(apply(zero_genes_rsem,1,mean))#228.859 if TCGA counts are used
zero_genes_f<-com_genes_tpm[apply(com_genes_tpm[,1:12]==0,1,mean)!=0,1:12]##at least one zero in 12 GFP replicates
sum_zero_genes_feature<-mean(apply(zero_genes_f==0,1,sum))##average of how many replicates have same zero expression

nrow(zero_genes_f)
nrow(zero_genes_f)/nrow(com_genes_tpm)
mean(apply(zero_genes_f,1,mean))#0.55 if Rsubread counts are used.
plot_and_write_zero_summary(c(1, 2 ),"Total number of zero expressed \ngene counts per sample","TCGA Level 3","Rsubread TPM",c("TCGA Level 3","Rsubread TPM"))

# NOTE(review): com_genes_tpm holds log2(TPM+1) values while the bins below
# are worded as read counts - confirm the thresholds are meant for this table.
report_discordant(com_genes_TCGA,com_genes_tpm,rsubread_nonzero_messages)

report_discordant(com_genes_tpm,com_genes_TCGA,tcga_nonzero_messages)





# print(paste("Total number of zero expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]==0,1,mean)==1),]),sep=" "))
# print(paste("Total number of zero expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]==0,1,mean)==1),]),sep=" "))
# # print(paste("Total number of 1-100 expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]<100&feature_f[,1:17]<0,1,mean)==1),]),sep=" "))
# # print(paste("Total number of 1-100 expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]<100&rsem_f[,1:17]!=0,1,mean)==1),]),sep=" "))
# # print(paste("Total number of 1-100 expressing genes in Rsubread data=",nrow(feature_f[(apply(feature_f[,1:17]<100&feature_f[,1:17]<0,1,mean)==1),]),sep=" "))
# # print(paste("Total number of 1-100 expressing genes in TCGA data=",nrow(rsem_f[(apply(rsem_f[,1:17]<100&rsem_f[,1:17]!=0,1,mean)==1),]),sep=" "))
#
# dim(rsem_f)
# dim(feature_f)
# head(rownames(rsem_f_o))
# head(rownames(feature_f_o))
# zero_genes_rsem<-rsem_f_o[apply(rsem_f_o[,1:17]==0,1,mean)==1&apply(feature_f_o[,1:17]==0,1,mean)==1,]#common zero expressing genes
# nrow(zero_genes_rsem)
# nrow(feature_f_o[apply(feature_f_o[,1:17]==0,1,mean)==1&apply(rsem_f_o[,1:17]==0,1,mean)==1,])
# zero_genes_f<-feature_f_o[apply(rsem_f_o[,1:17]==0,1,mean)!=1&apply(feature_f_o[,1:17]==0,1,mean)==1,]##gene that are zero expressing in feature counts but nonzero in TCGA
# nrow(zero_genes_f)
# zero_genes_r<-rsem_f_o[apply(rsem_f_o[,1:17]==0,1,mean)==1&apply(feature_f_o[,1:17]==0,1,mean)!=1,]##gene that are zero expressing in Level 3 but nonzero in feature
# nrow(zero_genes_r)
# -------
# zero_genes_rsem<-rsem_f_o[apply(rsem_f_o[,1:12]==0,1,mean)!=0,1:12]#atleast one zero in 12 GFP replicates
# nrow(zero_genes_rsem)
# nrow(zero_genes_rsem)/nrow(rsem_f_o)
# mean(apply(zero_genes_rsem,1,mean))
# zero_genes_f<-feature_f_o[apply(feature_f_o[,1:12]==0,1,mean)!=0,1:12]##at least one zero in 12 GFP replicates
# nrow(zero_genes_f)
# nrow(zero_genes_f)/nrow(feature_f_o)
# mean(apply(zero_genes_f,1,mean))

# Same report on the raw count tables.
report_discordant(rsem_f_o,feature_f_o,rsubread_nonzero_messages)

report_discordant(feature_f_o,rsem_f_o,tcga_nonzero_messages)





#interesting_genes<-feature_f[rownames(feature_f)%in%rownames(zero_genes),]
#zero_genes_feature<-interesting_genes[apply(interesting_genes[,1:17]==0,1,mean)==1,]#
# The histograms still use zero_genes_rsem / zero_genes_f from the TPM
# section above; this second call overwrites zero_prop.txt.
plot_and_write_zero_summary(c(2, 1 ),"Total number of zero expressed \ngenes counts per sample","TCGA","Rsubread",c("TCGA","Rsubread"))
#########LUSC but LUAD-like analysis##
# Collect every sample misclassified in either prediction table, put the
# TCGA (class_12) and Rsubread (class_20) predictions side by side, and
# compare them with the external list of LUSC-but-LUAD-like samples.
class_12<-read.table("~/Desktop/TCGA_RNASeq_Clinical/Analysis_datasets/Classification_12_LUAD_LUSC_Predictions.txt", header=1,row.names=1)
class_20<-read.table("~/Desktop/TCGA_RNASeq_Clinical/Analysis_datasets/Classification_20_LUAD_LUSC_Predictions.txt", header=1,row.names=1)
# A mismatch is a row whose first two columns differ.
mismatch12<-class_12[class_12[,1]!=class_12[,2],]
mismatch20<-class_20[class_20[,1]!=class_20[,2],]
mismatch_ids<-union(rownames(mismatch12),rownames(mismatch20))
# The right-hand table is class_20; it used to be class_12 merged with
# itself, so the ".Rsubread" columns were a copy of the ".TCGA" ones.
mismatches_all<-merge(class_12[rownames(class_12)%in%mismatch_ids,],class_20[rownames(class_20)%in%mismatch_ids,],by=0)
rownames(mismatches_all)<-gsub("01A-.*-07","01",mismatches_all$Row.names)
mismatches_all<-mismatches_all[,2:ncol(mismatches_all)]
colnames(mismatches_all)<-c("ActualClass.TCGA","PredictedClass.TCGA","LUAD_Probability.TCGA", "LUSC_Probability.TCGA", "ActualClass.Rsubread","PredictedClass.Rsubread",
                            "LUAD_Probability.Rsubread","LUSC_Probability.Rsubread")
lusc_but_luad<-read.table("~/Dropbox/TCGA_RNASeq_Clinical/Analysis_datasets/LUSC_but_LUAD_like.txt",sep='\t', header=1)
discord<-merge(mismatches_all,lusc_but_luad,by.x=0,by.y=1,all.y=T)
# The Row.names column was dropped above, so the shortened IDs now live in
# rownames(mismatches_all); reading $Row.names here always matched nothing.
mismatches_all[rownames(mismatches_all)%in%lusc_but_luad$sample,]#identifies the missclassified LUSC, but LUAD-like samples identified by
lusc_but_luad[!lusc_but_luad$sample%in%rownames(mismatches_all),]
lusc_but_luad[lusc_but_luad$sample%in%rownames(mismatches_all),]
lusc_but_luad[lusc_but_luad$sample%in%gsub("01A-.*-07","01",rownames(mismatch20)),]
--------------------------------------------------------------------------------
/Codes/numZero.R:
--------------------------------------------------------------------------------
1 |
## Inputs must be downloaded manually before running:
##  - the Pancan12 RNA-Seq dataset from https://www.synapse.org/#!Synapse:syn1695324, filtered for gene symbols
##  - the Rsubread TPM RNA-Seq data from GEO accession number GSM1536837
pan12<-read.table("PANCAN12.IlluminaHiSeq_RNASeqV2.geneExp.tumor_whitelist", header=1,row.names=1)
pan20<-read.table("GSM1536837_TCGA_20.Illumina.tumor_Rsubread_TPM.txt",header=1,row.names=1)

# Keep only the genes (rows) and samples (columns) present in both tables.
genes_in_20<-rownames(pan12)%in%rownames(pan20)
samples_in_20<-colnames(pan12)%in%colnames(pan20)
pan12_f<-pan12[genes_in_20,samples_in_20]

genes_in_12<-rownames(pan20)%in%rownames(pan12)
samples_in_12<-colnames(pan20)%in%colnames(pan12)
pan20_f<-pan20[genes_in_12,samples_in_12]

# Write, per sample, how many genes have an expression value of exactly zero.
write_zero_counts<-function(tab,path){
  zeros_per_sample<-apply((tab==0),2,sum)
  write.table(zeros_per_sample,path,sep='\t',col.names=F,quote=F)
}
write_zero_counts(pan12_f,"PANCAN12_19583_by_3380_numZeroes.txt")
write_zero_counts(pan20_f,"PANCAN20_19583_by_3380_numZeroes.txt")
13 |
--------------------------------------------------------------------------------
/Codes/utilities.py:
--------------------------------------------------------------------------------
1 | import glob, os, posix, sys, math, collections, json, difflib
2 | #import scipy
3 | #from scipy.stats import *
4 | from operator import itemgetter, attrgetter
5 | import itertools
6 | from random import uniform, sample
7 | #import numpy
8 | from collections import defaultdict
9 | #from fisher import *
10 | #from transcendental import stdtr
11 |
def printFlush(text, outFilePath=None):
    """Print text, flush stdout, and optionally append the text to a file.

    text: string to print (a newline is added when writing to the file).
    outFilePath: when not None, text is appended to this file as one line.
    """
    print(text)
    sys.stdout.flush()

    if outFilePath is not None:
        with open(outFilePath, 'a') as outFile:
            outFile.write(text + "\n")
20 |
def printMatrix(data):
    """Print every element of data on its own line, followed by a blank line."""
    for x in data:
        print(x)
    print("")
25 |
def smartDivide(numerator, denominator):
    """Divide as floats, yielding NaN instead of raising when the divisor is zero."""
    divisor = float(denominator)

    if divisor != 0.0:
        return float(numerator) / divisor

    return float('nan')
31 |
def getProbes(probeTabFilePath):
    """Read a tab-delimited probe file and return "probeset#x_y" identifiers.

    The header (matched case-insensitively) must contain "probe x",
    "probe y", and either "probe set name" or "probe set id"; the name
    column is preferred when both exist.

    Returns one identifier per data row, in file order ([] when the file
    has no data rows).
    Raises ValueError when a required column is missing.
    """
    probes = []

    with open(probeTabFilePath) as probeTabFile:
        headerItems = [x.lower() for x in probeTabFile.readline().rstrip().split("\t")]
        columns = None

        for line in probeTabFile:
            if columns is None:
                # Resolve the column positions once, on the first data row,
                # so a header-only file still returns [] as before.
                if "probe set name" in headerItems:
                    probesetIndex = headerItems.index("probe set name")
                elif "probe set id" in headerItems:
                    probesetIndex = headerItems.index("probe set id")
                else:
                    # Previously this message was printed and the function
                    # then failed on the unbound probeset variable.
                    raise ValueError("No probe set name or probe set id column in %s" % probeTabFilePath)
                columns = (probesetIndex, headerItems.index("probe x"), headerItems.index("probe y"))

            lineItems = line.rstrip().split("\t")
            probeset, probeX, probeY = [lineItems[i] for i in columns]
            probes.append(probeset + "#" + probeX + "_" + probeY)

    return probes
54 |
def getProbesetProbesDict(probes):
    """Group probe identifiers by probeset.

    The probeset is the part of each identifier before the first "#".  An
    identifier without "#" is used whole as its own probeset (it used to
    lose its last character).

    Returns a dict mapping probeset -> list of probes in input order.
    """
    probesetProbesDict = {}

    for probe in probes:
        probeset = probe.partition("#")[0]
        # Append in place instead of rebuilding the list for every probe.
        probesetProbesDict.setdefault(probeset, []).append(probe)

    return probesetProbesDict
63 |
def getPatientIDs(normDirPath, normFileSuffix):
    """Return the sorted IDs of files matching normDirPath + "*" + normFileSuffix.

    An ID is the matched path with every occurrence of normDirPath and then
    of normFileSuffix removed.
    """
    pattern = normDirPath + "*" + normFileSuffix
    return sorted(
        path.replace(normDirPath, "").replace(normFileSuffix, "")
        for path in glob.glob(pattern)
    )
74 |
def readScalarFromFile(filePath):
    """Return the first tab-delimited field of the first line of filePath."""
    rows = readMatrixFromFile(filePath)
    firstRow = rows[0]
    return firstRow[0]
77 |
def writeScalarToFile(x, filePath):
    """Overwrite filePath with the string x (no newline is appended)."""
    # The with-block closes the handle even if the write raises.
    with open(filePath, 'w') as outFile:
        outFile.write(x)
82 |
def readVectorFromFile(filePath):
    """Return the lines of filePath as a list, with trailing whitespace stripped."""
    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        return [line.rstrip() for line in inFile]
85 |
def writeVectorToFile(data, filePath):
    """Overwrite filePath with str() of each item of data, one per line."""
    # The with-block closes the handle even if a write raises.
    with open(filePath, 'w') as outFile:
        for x in data:
            outFile.write(str(x) + "\n")
91 |
def readMatrixFromFile(filePath, numLines=None):
    """Read a tab-delimited file into a list of rows (lists of strings).

    Args:
        filePath: Path of the file to read.
        numLines: Optional maximum number of rows to read; None reads all.

    Returns:
        One list of fields per line; trailing whitespace is stripped first.
    """
    matrix = []

    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        for line in inFile:
            if numLines is not None and len(matrix) >= numLines:
                break

            matrix.append(line.rstrip().split("\t"))

            # Progress indicator for very large files.
            if len(matrix) % 100000 == 0:
                print(len(matrix))

    return matrix
104 |
def writeMatrixToFile(x, filePath, writeMode='w'):
    """Write the rows of x to filePath as tab-delimited lines.

    Args:
        x: Iterable of rows; each row is an iterable of values.
        filePath: Destination path.
        writeMode: Mode passed to open(); 'w' overwrites, 'a' appends.
    """
    # The with-block closes the handle even if writing a row raises.
    with open(filePath, writeMode) as outFile:
        writeMatrixToOpenFile(x, outFile)
109 |
def writeMatrixToOpenFile(x, outFile):
    """Write each row of x to the open file as one tab-joined line of str() values."""
    for row in x:
        fields = [str(value) for value in row]
        outFile.write("\t".join(fields) + "\n")
113 |
def appendMatrixToFile(x, filePath):
    """Append the rows of x to filePath by calling writeMatrixToFile in 'a' mode."""
    appendMode = 'a'
    writeMatrixToFile(x, filePath, writeMode=appendMode)
116 |
def readTextFromFile(filePath):
    """Return the entire contents of filePath as one string."""
    # A single read() replaces line-by-line concatenation, and the with-block
    # closes the handle.
    with open(filePath) as inFile:
        return inFile.read()
124 |
def writeDictToFile(dictionary, filePath):
    """Serialize dictionary with json.dumps and write it via writeScalarToFile."""
    serialized = json.dumps(dictionary)
    writeScalarToFile(serialized, filePath)
127 |
def readDictFromFile(filePath):
    """Load a JSON object from filePath, restoring integer-like keys to int.

    JSON object keys are always strings, so keys that are an optional leading
    "-" followed only by digits are converted back to int. All other keys
    (including decimal strings such as "1.5") are kept as they are; previously
    such keys raised ValueError from int().

    Returns:
        A new dict with converted keys and the original values.
    """
    with open(filePath) as inFile:
        dictionary = json.loads(inFile.read())

    dictionary2 = {}

    for key in dictionary:
        value = dictionary[key]

        digits = key[1:] if key.startswith("-") else key
        if digits.isdigit():
            try:
                key = int(key)
            except ValueError:
                # Digit-like characters int() cannot parse: keep the string key.
                pass

        dictionary2[key] = value

    return dictionary2
143 |
def calculateMean(values):
    """Return the arithmetic mean of values, or NaN for an empty sequence."""
    if len(values) == 0:
        return float('nan')

    # Dividing by a float avoids Python 2 floor division when every value is an int.
    return sum(values) / float(len(values))
149 |
def calculateVarianceMean(values):
    """Return the mean squared deviation from the mean, divided by (n - 1)."""
    center = calculateMean(values)

    squaredDeviations = []
    for value in values:
        squaredDeviations.append((value - center) ** 2)

    return calculateMean(squaredDeviations) / (len(squaredDeviations) - 1)
154 |
def calculateWeightedMean(values, weights):
    """Return sum(value * weight) / sum(weights).

    The previous body only validated the lengths and then fell through,
    returning None; it now computes the weighted mean.

    Returns:
        The weighted mean as a float, or NaN when there are no values or the
        weights sum to zero.

    Raises:
        ValueError: if values and weights differ in length.
    """
    if len(values) != len(weights):
        raise ValueError("When calculating a weighted mean, the values must be the same length as the weights.")

    totalWeight = float(sum(weights))
    if len(values) == 0 or totalWeight == 0.0:
        return float('nan')

    return sum(value * weight for value, weight in zip(values, weights)) / totalWeight
159 |
def calculateStandardDeviation(values):
    """Return the square root of the sum of squared deviations over (n - 1)."""
    center = calculateMean(values)
    sumOfSquares = sum([(value - center) ** 2 for value in values])
    return math.sqrt(sumOfSquares / (len(values) - 1))
165 |
def calculateZscore(x):
    """Return each value of x minus the mean, divided by the standard deviation."""
    center = calculateMean(x)
    spread = calculateStandardDeviation(x)

    scores = []
    for value in x:
        scores.append((value - center) / spread)
    return scores
170 |
def calculateTrimmedMean(values, trimProportion=0.10):
    """Return the mean of values after trimming both tails.

    Values are converted to float and sorted. Small inputs are special-cased:
    fewer than 3 values are averaged as-is, and for 3, 4 or 5 values the
    smallest and largest are dropped. For 6 or more values,
    int(trimProportion * n) items are removed from each end. This is the
    slicing scipy.stats.trimboth performs on sorted data, done inline
    because this file does not import scipy.

    Returns:
        The trimmed mean as a float, or None for None or empty input.

    Raises:
        ValueError: if trimProportion would remove every value.
    """
    if values is None or len(values) == 0:
        return None

    values = sorted([float(x) for x in values])
    count = len(values)

    if count < 3:
        kept = values
    elif count <= 5:
        # Drop only the minimum and the maximum.
        kept = values[1:count - 1]
    else:
        cut = int(trimProportion * count)
        if cut >= count - cut:
            raise ValueError("Proportion too big.")
        kept = values[cut:count - cut]

    # kept holds floats only, so this is true division on Python 2 as well.
    return sum(kept) / len(kept)
189 |
def calculateEuclideanDistance(xList, yList):
    """Return the Euclidean distance between two equal-length numeric lists."""
    squaredDifferences = [
        math.pow(xList[position] - yList[position], 2)
        for position in range(len(xList))
    ]
    return math.sqrt(sum(squaredDifferences, 0.0))
200 |
def calculateCorrelationCoefficient(xList, yList):
    """Return the off-diagonal entry [0, 1] of numpy.corrcoef(xList, yList)."""
    # numpy's module-level import is commented out in this file, so import it
    # here to avoid a NameError.
    import numpy

    return numpy.corrcoef(xList, yList)[0,1]
203 |
def calculatePearsonCoefficient(xList, yList):
    """Return the first element of scipy.stats.pearsonr(xList, yList)."""
    # The scipy imports at the top of this file are commented out, so `stats`
    # was undefined; import it locally.
    from scipy import stats

    return stats.pearsonr(xList, yList)[0]
206 |
def calculateSpearmanCoefficient(xList, yList):
    """Return the first element of scipy.stats.spearmanr(xList, yList)."""
    # The scipy imports at the top of this file are commented out, so `stats`
    # was undefined; import it locally.
    from scipy import stats

    return stats.spearmanr(xList, yList)[0]
209 |
def calculateTTest(xList, yList):
    """Return the p-value (element [1]) of a t-test between two samples.

    NaN entries are dropped from both inputs first. If exactly one sample has
    a single value and the other has more than one, a one-sample t-test of the
    larger sample against that value is used; otherwise
    scipy.stats.ttest_ind is called along axis 0.
    """
    # numpy and scipy are not imported at module level in this file (those
    # imports are commented out), so import them locally.
    import numpy
    from scipy import stats

    xList = numpy.array([x for x in xList if not math.isnan(x)])
    yList = numpy.array([y for y in yList if not math.isnan(y)])

    if len(xList) == 1 and len(yList) > 1:
        return stats.ttest_1samp(yList, xList[0])[1]
    if len(xList) > 1 and len(yList) == 1:
        return stats.ttest_1samp(xList, yList[0])[1]

    # axis is passed by keyword so the call does not depend on it being
    # accepted positionally.
    return stats.ttest_ind(xList, yList, axis=0)[1]
220 |
# From http://stackoverflow.com/questions/10038543/tracking-down-the-assumptions-made-by-scipys-ttest-ind-function
def calculateWelchTTest(pop1, pop2):
    """Return a two-tailed p-value for an unequal-variance t-test of pop1 vs pop2.

    The t statistic is the difference of means over sqrt(var1/n1 + var2/n2);
    the degrees of freedom are computed from the same variance terms, and the
    p-value is read from scipy.stats.t.cdf.
    """
    # numpy and scipy are not imported at module level in this file (those
    # imports are commented out), so import them locally.
    import numpy
    import scipy.stats

    num1 = numpy.array(pop1).shape[0]
    num2 = numpy.array(pop2).shape[0]

    # NOTE(review): numpy.var defaults to ddof=0; Welch's test is usually
    # defined with sample variances (ddof=1). Left unchanged to keep results
    # reproducible -- confirm the intent.
    term1 = numpy.var(pop1) / num1
    term2 = numpy.var(pop2) / num2

    t_stat = (numpy.mean(pop1) - numpy.mean(pop2)) / numpy.sqrt(term1 + term2)
    df = ((term1 + term2) ** (2.0)) / (term1 ** (2.0) / (num1 - 1) + term2 ** (2.0) / (num2 - 1))

    #one_tailed_p_value = 1.0 - scipy.stats.t.cdf(t_stat,df)
    two_tailed_p_value = 1.0 - (scipy.stats.t.cdf(numpy.abs(t_stat), df) - scipy.stats.t.cdf(-numpy.abs(t_stat), df))

    return two_tailed_p_value
233 |
def calculateOneSampleTTest(x, yList):
    """Return the p-value (element [1]) of scipy.stats.ttest_1samp(yList, x)."""
    # The scipy imports at the top of this file are commented out, so `stats`
    # was undefined; import it locally.
    from scipy import stats

    return stats.ttest_1samp(yList, x)[1]
236 |
def isValueAberrant(x, yList, numStandardDeviations):
    """Return True when x is more than numStandardDeviations SDs from the mean of yList."""
    spread = calculateStandardDeviation(yList)
    margin = float(numStandardDeviations) * spread

    if x < calculateMean(yList) - margin:
        return True
    return x > calculateMean(yList) + margin
243 |
def calculateMedian(values):
    """Return the median of values.

    For an odd count the middle element is returned unchanged; for an even
    count the two middle elements are averaged as a float. An empty input
    returns NaN instead of raising IndexError.
    """
    sortedValues = sorted(values)
    count = len(sortedValues)

    if count == 0:
        return float('nan')

    # Floor division keeps the index an int on both Python 2 and Python 3.
    middle = count // 2

    if count % 2 == 1:
        return sortedValues[middle]

    return float(sortedValues[middle - 1] + sortedValues[middle]) / 2
253 |
def calculateFoldChange(values1, values2):
    """Return mean(values1) / mean(values2) after shifting both so the overall minimum is 1."""
    lowest = min(min(values1), min(values2))

    shifted1 = []
    for value in values1:
        shifted1.append(value - lowest + 1)

    shifted2 = []
    for value in values2:
        shifted2.append(value - lowest + 1)

    numerator = calculateMean(shifted1)
    denominator = calculateMean(shifted2)
    return numerator / denominator
264 |
def calculateAbsoluteFoldChange(values1, values2):
    """Return the smaller of mean1/mean2 and mean2/mean1 after shifting both inputs.

    Both lists are shifted so that the overall minimum becomes 1 before the
    means are taken.
    """
    lowest = min(min(values1), min(values2))

    def shift(items):
        return [item - lowest + 1 for item in items]

    firstMean = calculateMean(shift(values1))
    secondMean = calculateMean(shift(values2))

    forward = firstMean / secondMean
    backward = secondMean / firstMean
    return min(forward, backward)
278 |
def getNormalizedProbes(normFilePath):
    """Return the text before the first space of every line in normFilePath."""
    print("Getting normalized probes")
    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(normFilePath) as normFile:
        return [line.split(" ")[0] for line in normFile]
282 |
def getKeyProbeDict(filePath, probesToKeep=None, minProbesPerKey=1):
    """Map each key (column 1) to the comma-separated probes listed in column 2.

    Lines with fewer than two tab-delimited fields are skipped. Empty probe
    names are dropped, and probes from repeated keys are concatenated in file
    order.

    NOTE(review): probesToKeep and minProbesPerKey are accepted for
    compatibility but, as in the previous implementation, are never applied
    as filters -- confirm whether filtering was intended.
    """
    # The old body called set(probesToKeep), which raised TypeError for the
    # default None; the resulting set was never used, so it is dropped.
    keyProbeDict = {}

    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            key = lineItems[0]

            if len(lineItems) > 1:
                fileProbes = [x for x in lineItems[1].split(",") if x != ""]
                keyProbeDict.setdefault(key, []).extend(fileProbes)

    return keyProbeDict
298 |
def getTranscriptProbeDict(filePath, normFilePath):
    """Map each transcript to those of its probes that appear in normFilePath.

    Each line of filePath holds a transcript and a comma-separated probe list
    separated by a tab. The probe list is intersected with the probes returned
    by getNormalizedProbes(normFilePath); the order of the result is not
    defined because it passes through a set.
    """
    normalizedProbes = set(getNormalizedProbes(normFilePath))

    print("Getting transcript-probe dictionary")
    transcriptProbeDict = {}

    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            transcript = lineItems[0]
            probes = lineItems[1].split(",")

            transcriptProbeDict[transcript] = list(set(probes) & normalizedProbes)

    return transcriptProbeDict
313 |
def getPatientsKeyValuesDict(sourceDir, patientIDs, fileSuffix, dataValueIndex, keys=None):
    """Collect one column of values per key from every patient's file.

    The first patient's file determines the line number of each key (column 1).
    Every patient file is then read sequentially, jumping to those line
    numbers, so all files are assumed to list keys on the same lines as the
    first one.

    Args:
        sourceDir: Directory holding the patient files.
        patientIDs: Patient identifiers; each file is sourceDir + ID + fileSuffix.
        fileSuffix: Suffix appended to every patient ID.
        dataValueIndex: Column index of the value to store.
        keys: Optional keys to restrict to; keys absent from the first file
            are ignored. None selects every key.

    Returns:
        defaultdict mapping patient ID -> {key: value string}.
    """
    patientsKeyValuesDict = collections.defaultdict(dict)

    if len(patientIDs) == 0:
        return patientsKeyValuesDict

    keyLineIndicesDict = {}
    with open(sourceDir + patientIDs[0] + fileSuffix) as firstFile:
        for lineCount, line in enumerate(firstFile):
            keyLineIndicesDict[line.rstrip().split("\t")[0]] = lineCount

    if keys is None:
        keyLineIndices = list(keyLineIndicesDict.items())
    else:
        # Membership is tested on the dict itself (O(1)) rather than on .keys().
        keyLineIndices = [(key, keyLineIndicesDict[key]) for key in keys if key in keyLineIndicesDict]

    # Visit keys in file order so each file is read in one forward pass.
    keyLineIndices.sort(key=itemgetter(1))

    # The old body also opened one extra handle per patient up front that was
    # never read or closed; that loop is removed.
    sourceDirPath = checkDirPath(sourceDir)

    for patientID in patientIDs:
        with open(sourceDirPath + patientID + fileSuffix) as patientFile:
            previousLineIndex = 0
            for keyLineIndex in keyLineIndices:
                # Skip the lines between the previous hit and this key's line.
                for i in range(previousLineIndex, keyLineIndex[1]):
                    patientFile.readline()
                previousLineIndex = keyLineIndex[1] + 1

                lineItems = patientFile.readline().rstrip().split("\t")
                patientsKeyValuesDict[patientID][lineItems[0]] = lineItems[dataValueIndex]

    return patientsKeyValuesDict
360 |
def getPatientKeyValuesDict(filePath, dataColumnIndex, probes=None):
    """Map column 1 of a tab-delimited file to the value in dataColumnIndex.

    Args:
        filePath: Tab-delimited input file.
        dataColumnIndex: Column index of the value to keep (as a string).
        probes: Optional keys to restrict the result to; a falsy value
            returns every key. A requested key missing from the file raises
            KeyError.
    """
    probeValues = {}

    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            probeValues[lineItems[0]] = lineItems[dataColumnIndex]

    if not probes:
        return probeValues

    modProbeValues = {}
    for probe in probes:
        modProbeValues[probe] = probeValues[probe]
    return modProbeValues
378 |
def savePatientKeyValuesDict(patientDict, outFilePath):
    """Write patientDict to outFilePath as "key<TAB>value" lines, sorted by key."""
    # The with-block closes the handle even if a write raises.
    with open(outFilePath, 'w') as outFile:
        for key in sorted(patientDict.keys()):
            outFile.write("%s\t%s\n" % (key, patientDict[key]))
386 |
def checkDirPath(dirPath):
    """Ensure dirPath exists and return it with a trailing "/".

    Missing directories, including missing parents, are created.
    """
    if not os.path.exists(dirPath):
        try:
            # os.makedirs replaces posix.mkdir: it is portable and creates parents.
            os.makedirs(dirPath)
        except OSError:
            # Another process may have created the directory after the
            # existence check; only re-raise if it is still missing.
            if not os.path.isdir(dirPath):
                raise

    if not dirPath.endswith("/"):
        dirPath = dirPath + "/"

    return dirPath
395 |
def lastIndexOf(theList, value):
    """Return the index of the last occurrence of value; raises ValueError if absent."""
    reversedCopy = theList[::-1]
    offsetFromEnd = reversedCopy.index(value)
    return len(theList) - 1 - offsetFromEnd
398 |
def getTranscriptGeneDict(filePath):
    """Map each transcript (column 1) to its gene.

    The gene is taken from column 2, or from column 3 when the line has
    exactly three tab-delimited fields. A later line for the same transcript
    overwrites an earlier one.
    """
    transcriptGeneDict = {}

    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            transcript = lineItems[0]

            gene = lineItems[1]
            if len(lineItems) == 3:
                gene = lineItems[2]

            transcriptGeneDict[transcript] = gene

    return transcriptGeneDict
413 |
def getGeneTranscriptDict(filePath):
    """Map each gene to the list of its transcripts, in file order.

    The transcript is column 1; the gene is column 2, or column 3 when the
    line has exactly three tab-delimited fields.
    """
    geneTranscriptDict = {}

    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        for line in inFile:
            lineItems = line.rstrip().split("\t")
            transcript = lineItems[0]

            gene = lineItems[1]
            if len(lineItems) == 3:
                gene = lineItems[2]

            # append() avoids rebuilding the whole list for every transcript.
            geneTranscriptDict.setdefault(gene, []).append(transcript)

    return geneTranscriptDict
428 |
def transposeMatrix(x):
    """Return the transpose of x as a list of lists.

    As with zip(), rows are truncated to the length of the shortest row.
    """
    # Building the lists directly does not rely on zip() returning an
    # indexable list, which is only true on Python 2.
    return [list(column) for column in zip(*x)]
436 |
# Copied from: http://code.activestate.com/recipes/491268-ordering-and-ranking-for-lists/
def order(x, NoneIsLast = True, decreasing = False):
    """
    Returns the ordering of the elements of x. The list
    [ x[j] for j in order(x) ] is a sorted version of x.

    Missing values in x are indicated by None. If NoneIsLast is true,
    then missing values are ordered to be at the end.
    Otherwise, they are ordered at the beginning.
    Passing NoneIsLast=None orders missing values last and then drops
    their indices from the result.
    """
    omitNone = False
    if NoneIsLast is None:
        NoneIsLast = True
        omitNone = True

    n = len(x)
    # list(...) so that in-place sort() also works where range() is lazy.
    ix = list(range(n))
    if None not in x:
        ix.sort(reverse = decreasing, key = lambda j : x[j])
    else:
        # Handle None values properly.
        def key(i, x = x):
            elem = x[i]
            # Valid values are True or False only.
            # The leading boolean pushes None entries to the requested end,
            # taking into account that reverse=decreasing flips the order.
            if decreasing == NoneIsLast:
                return not(elem is None), elem
            else:
                return elem is None, elem
        ix.sort(key=key, reverse=decreasing)

    if omitNone:
        # None entries were sorted last; trim them off the tail.
        n = len(x)
        for i in range(n-1, -1, -1):
            if x[ix[i]] == None:
                n -= 1
        return ix[:n]
    return ix
475 |
# Copied from: http://code.activestate.com/recipes/491268-ordering-and-ranking-for-lists/
def rankSmart(x, NoneIsLast=True, decreasing = False, ties = "first"):
    """
    Returns the ranking of the elements of x. The position of the first
    element in the original vector is rank[0] in the sorted vector.

    Missing values are indicated by None. Calls the order() function.
    Ties are NOT averaged by default. Choices are:
         "first" "average" "min" "max" "random"
    Any other value of ties behaves like "first".
    Ranks are 0-based positions in the sorted order.
    """
    omitNone = False
    if NoneIsLast is None:
        NoneIsLast = True
        omitNone = True
    O = order(x, NoneIsLast = NoneIsLast, decreasing = decreasing)
    R = O[:]
    n = len(O)
    for i in range(n):
        R[O[i]] = i
    if ties == "first" or ties not in ["first", "average", "min", "max", "random"]:
        return R

    # Collect runs of sorted positions that hold equal values.
    blocks = []
    newblock = []
    for i in range(1, n):
        if x[O[i]] == x[O[i-1]]:
            if i-1 not in newblock:
                newblock.append(i-1)
            newblock.append(i)
        else:
            if len(newblock) > 0:
                blocks.append(newblock)
                newblock = []
    if len(newblock) > 0:
        blocks.append(newblock)

    for block in blocks:
        # Don't process blocks of None values.
        if x[O[block[0]]] == None:
            continue
        if ties == "average":
            s = 0.0
            for j in block:
                s += j
            s /= float(len(block))
            for j in block:
                R[O[j]] = s
        elif ties == "min":
            s = min(block)
            for j in block:
                R[O[j]] = s
        elif ties == "max":
            s = max(block)
            for j in block:
                R[O[j]] = s
        elif ties == "random":
            # Shuffle the tied rank positions themselves. The previous code
            # sampled the original indices O[i], which assigned element
            # indices as ranks.
            shuffled = sample(block, len(block))
            for k, j in enumerate(block):
                R[O[j]] = shuffled[k]
    if omitNone:
        R = [ R[j] for j in range(n) if x[j] != None]
    return R
542 |
# The following function came from http://stackoverflow.com/questions/3071415/efficient-method-to-calculate-the-rank-vector-of-a-list-in-python
def rank2(a):
    """Return the 1-based ranks of the items of a, averaging ranks across ties."""
    n = len(a)
    # Indices of a in ascending order of value. This was rank_simple(a) in the
    # original snippet, a helper that is not defined in this file.
    ivec = sorted(range(n), key=a.__getitem__)
    svec = [a[rank] for rank in ivec]
    sumranks = 0
    dupcount = 0
    newarray = [0]*n
    for i in range(n):
        sumranks += i
        dupcount += 1
        # At the end of a run of equal values, give each member the mean rank.
        if i==n-1 or svec[i] != svec[i+1]:
            averank = sumranks / float(dupcount) + 1
            for j in range(i-dupcount+1,i+1):
                newarray[ivec[j]] = averank
            sumranks = 0
            dupcount = 0

    return newarray
562 |
def globFilesSortedByModTime(pattern):
    """Return the paths matching pattern, ordered from oldest to newest st_mtime."""
    matches = glob.glob(pattern)
    return sorted(matches, key=lambda path: os.stat(path).st_mtime)
568 |
## From http://stackoverflow.com/questions/34518/natural-sorting-algorithm
def naturalSort(x, reverse=False):
    """Sort strings so that runs of digits compare as integers ("a2" before "a10")."""
    def splitIntoChunks(text):
        chunks = []
        for isNumber, group in itertools.groupby(text, str.isdigit):
            joined = ''.join(group)
            chunks.append(int(joined) if isNumber else joined)
        return tuple(chunks)

    return sorted(x, key=splitIntoChunks, reverse=reverse)
578 |
def getItemFrequencyMap(x):
    """Count the occurrences of each item of x; returns a defaultdict(int)."""
    counts = defaultdict(int)

    for item in x:
        counts[item] = counts[item] + 1

    return counts
585 |
586 | from math import modf, floor
587 |
def quantile(x, q, qtype = 7, issorted = False):
    """
    Args:
       x - input data
       q - quantile
       qtype - algorithm (1 to 9, see table below)
       issorted- True if x already sorted.

    Compute quantiles from input array x given q.For median,
    specify q=0.5.

    Returns None for an invalid qtype or empty input. Positions that fall
    below the first or at/after the last order statistic are clamped to the
    minimum or maximum value respectively.

    References:
       http://reference.wolfram.com/mathematica/ref/Quantile.html
       http://wiki.r-project.org/rwiki/doku.php?id=rdoc:stats:quantile

    Author:
        Ernesto P.Adorio Ph.D.
        UP Extension Program in Pampanga, Clark Field.
    """
    if not issorted:
        y = sorted(x)
    else:
        y = x
    if not (1 <= qtype <= 9):
        return None  # error!

    n = len(x)
    if n == 0:
        return None  # nothing to interpolate

    # Parameters for the Hyndman and Fan algorithm
    abcd = [(0,   0, 1, 0), # inverse empirical distrib.function., R type 1
            (0.5, 0, 1, 0), # similar to type 1, averaged, R type 2
            (0.5, 0, 0, 0), # nearest order statistic,(SAS) R type 3

            (0,   0, 0, 1), # California linear interpolation, R type 4
            (0.5, 0, 0, 1), # hydrologists method, R type 5
            (0,   1, 0, 1), # mean-based estimate(Weibull method), (SPSS,Minitab), type 6
            (1,  -1, 0, 1), # mode-based method,(S, S-Plus), R type 7
            (1.0/3, 1.0/3, 0, 1), # median-unbiased ,  R type 8
            (3/8.0, 0.25, 0, 1)   # normal-unbiased, R type 9.
           ]

    a, b, c, d = abcd[qtype-1]
    position = a + (n+b) * q - 1

    # Test the raw position: modf() turns values in (-1, 0) into an integer
    # part of -0.0, which is not < 0 and used to extrapolate below y[0].
    if position < 0:
        return y[0]

    g, j = modf(position)
    j = int(floor(j))

    # From the last order statistic onward there is no y[j+1] to interpolate
    # towards (the old check, j >= n, allowed an IndexError at j == n-1).
    if j >= n - 1:
        return y[n-1]

    if g == 0:
        return y[j]
    else:
        return y[j] + (y[j+1]- y[j])* (c + d * g)
640 |
def calculateInterquartileRange(x):
    """Return quantile(x, 0.75) minus quantile(x, 0.25)."""
    lowerQuartile, upperQuartile = quantile(x, 0.25), quantile(x, 0.75)
    return upperQuartile - lowerQuartile
646 |
def isNumeric(x):
    """Return True if str(x) looks like a plain decimal number.

    Accepted form: an optional leading "-", then digits with at most one ".".
    Previously every "." and "-" was stripped before the digit check, so
    strings such as "1.2.3", "1-2" or "--5" were reported as numeric.
    """
    text = str(x)

    if text.startswith("-"):
        text = text[1:]

    # A second sign, or more than one decimal point, is not a number.
    if "-" in text or text.count(".") > 1:
        return False

    return text.replace(".", "").isdigit()
649 |
def getUniqueMatrixColumnValues(filePath, columnIndex):
    """Return the sorted distinct values found in one column of a tab-delimited file."""
    uniqueValues = set()

    # open() + with replaces the Python 2-only file() builtin and closes the handle.
    with open(filePath) as inFile:
        for line in inFile:
            uniqueValues.add(line.rstrip().split("\t")[columnIndex])

    return sorted(uniqueValues)
657 |
def fisherExactTest(x):
    # Passes x to FishersExactTest.probability_of_table and returns the result unchanged.
    # NOTE(review): FishersExactTest is not defined in this file; the
    # "from fisher import *" line near the top is commented out, so this call
    # presumably raises NameError unless that import is restored -- confirm.
    return FishersExactTest.probability_of_table(x)
660 |
def complementGenomicSequence(sequence):
    """Return the sequence with complementGenomicBase applied to each character."""
    return "".join([complementGenomicBase(base) for base in sequence])
668 |
def complementGenomicBase(base):
    """Return the upper-case complement of a single DNA base.

    A<->T and C<->G are swapped. Any other character (for example "N" or "-")
    is returned upper-cased and otherwise unchanged; it used to be turned
    into "C".
    """
    complements = {"A": "T", "T": "A", "C": "G", "G": "C"}

    base = base.upper()
    return complements.get(base, base)
679 |
def reverseComplementGenomicSequence(dnaSequence):
    """Complement dnaSequence with complementGenomicSequence, then reverse it."""
    complemented = complementGenomicSequence(dnaSequence)
    return reverseString(complemented)
682 |
def reverseString(string):
    """Return string with its items in reverse order."""
    backwards = slice(None, None, -1)
    return string[backwards]
685 |
def getDictValue(dictionary, key, default=""):
    """Return dictionary[key] when key is present, otherwise default."""
    return dictionary[key] if key in dictionary else default
690 |
def getDiffPositions(string1, string2):
    """Return running totals of the matching-block sizes between two strings.

    Blocks come from difflib.SequenceMatcher.get_matching_blocks(); blocks of
    size 0 or of size len(string1) are skipped.
    """
    fullLength = len(string1)
    matchingBlocks = difflib.SequenceMatcher(a=string1, b=string2).get_matching_blocks()

    runningTotals = []
    for block in matchingBlocks:
        size = block[2]
        if size == 0 or size == fullLength:
            continue

        previousTotal = runningTotals[-1] if runningTotals else 0
        runningTotals.append(size + previousTotal)

    return runningTotals
706 |
def getSimilarityPercent(string1, string2):
    """Return the total matched character count as a percentage of len(string1).

    Matches are the blocks reported by difflib.SequenceMatcher.
    """
    matcher = difflib.SequenceMatcher(None, a=string1, b=string2)
    matchedCharacters = float(sum(block[2] for block in matcher.get_matching_blocks()))
    return (matchedCharacters / float(len(string1))) * 100.0
715 |
def getLineItems(line, separator="\t"):
    """Strip trailing whitespace from line and split it on separator."""
    trimmed = line.rstrip()
    return trimmed.split(separator)
718 |
def sortMatrix(data, columnIndex, reverse=False):
    """Sort the rows of data in place by the item at columnIndex; returns the same list."""
    def columnValue(row):
        return row[columnIndex]

    data.sort(key=columnValue, reverse=reverse)
    return data
722 |
def uniqueSort(values):
    """Return the distinct items of values in first-seen order.

    Membership is checked with a linear scan of the result list, which is
    slow for long inputs.
    """
    seen = []

    for candidate in values:
        if candidate in seen:
            continue
        seen.append(candidate)

    return seen
732 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 mumtahena
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This repository includes code for processing RNA-Seq FASTQ files and clinical data from The Cancer Genome Atlas. In addition, we have included the code used for analyzing data in our manuscript, "Alternative preprocessing of RNA-Sequencing data in The Cancer Genome Atlas leads to improved analysis results" (Rahman, Mumtahena, et al. _Bioinformatics_ [2015:10.1093/bioinformatics/btv377](http://bioinformatics.oxfordjournals.org/content/early/2015/08/14/bioinformatics.btv377.full)).
2 |
3 | ## What is this repository for?
4 |
5 | * We used the 'Rsubread' R package to align and summarize reads at the gene level for 9264 tumor and 741 normal TCGA RNA-Seq samples. The R scripts we provide here can also be used to process samples that did not come from TCGA. We have also included the code for compiling clinical data available for these tumors into a matrix format and matching the clinical IDs with the RNA-Seq IDs.
6 | * We have provided the code and various intermediate data files that we produced in performing the analyses we describe in the manuscript.
7 |
8 | ## How to normalize raw RNA-Seq data and process clinical data from TCGA
9 |
10 | This pipeline is designed to be executed on Unix-based systems. Most of the code is written in the R programming language. But it also requires "bash" scripts to be executed at the command line.
11 |
12 | 1. Install the [R statistical package](http://r-project.org). We used version 3.1.0.
13 |
14 | 2. Install the following R packages, which can be obtained using either the ```install.packages``` function in R or via the [Bioconductor framework](http://www.bioconductor.org):
15 | * Rsubread
16 | * limma
17 | * edgeR
18 | * tools
19 |
20 | 3. Clone this git repository to your local computer.
21 |
22 | 4. Via [dbGAP](http://www.ncbi.nlm.nih.gov/gap), obtain access to the raw TCGA data. Then obtain a private key that allows you to download raw data via the [Cancer Genomics Hub](https://cghub.ucsc.edu/access/get_access.html). Store this key file as ```cghub.key``` in the current directory.
23 |
24 | 5. In the ```Genome``` directory, store the reference genome file and GTF file that can be obtained from [here](http://support.illumina.com/sequencing/sequencing_software/igenome.html). We used version hg19. After extracting these files, you will find the reference genome in Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa and the GTF file in Homo_sapiens/UCSC/hg19/Annotation/Archives/archive-2012-03-09-03-24-41/Genes/genes.gtf. Move these directly to the local Genome directory. **Update [08/04/2020]: These files are no longer available. You can find a copy of them [here](https://osf.io/cqkfp/). You will need to decompress the files using the `gunzip` utility. However, if you are going to run this pipeline now, you might consider using a newer version of the human reference genome.**
25 |
26 | 6. Execute Scripts/process_tcga_rsubread at the command line to begin downloading and normalizing samples.
27 |
28 | All the RNA-Seq and clinical data files that we have processed are available from Gene Expression Omnibus (accession numbers: GSE62820 and GSE62944).
29 |
30 | For informational purposes, we have also provided a bash script (Scripts/process_tcga_level_3) that contains the steps for producing "Level 3" values using the same steps that are performed by the TCGA consortium. These steps are described in more detail here: https://cghub.ucsc.edu/docs/tcga/UNC_mRNAseq_summary.pdf.
31 |
32 | ### Process clinical data
33 |
34 | 1. Install R package 'plyr' using the ```install.packages``` function in R.
35 |
36 | 2. Download the clinical data for each cancer type from the [TCGA Data Portal](https://tcga-data.nci.nih.gov/tcga/dataAccessMatrix.htm) in Biotab format.
37 |
38 | 3. Download [GSE62944_06_01_15_TCGA_24_CancerType_Samples.txt.gz](http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE62944&format=file&file=GSE62944%5F06%5F01%5F15%5FTCGA%5F24%5FCancerType%5FSamples%2Etxt%2Egz) from GEO (Accession number GSM1536837) and save the unzipped file to 'Datasets' folder.
39 |
40 | 4. Set the working directory to the location where the clinical data folders for all cancer types are stored.
41 |
42 | 5. Run the R script at Codes/ProcessClinicalData.R.
43 |
44 | ## How to reanalyze our findings
45 |
46 | We also provide an R Markdown file (Analysis/TCGA_24_manuscript_analysis.Rmd) that contains the analysis code that we used for our manuscript. If you desire to reexecute this analysis, please complete the following steps:
47 |
48 | 1. Install the [R statistical package](http://r-project.org). We used version 3.1.0.
49 |
50 | 2. Install the following R packages, which can be obtained using either the ```install.packages``` function in R or via the [Bioconductor framework](http://www.bioconductor.org):
51 | * stats
52 | * ROCR
53 | * pROC
54 | * caret
55 | * knitr
56 | * data.table
57 | * heatmap3
58 | * RColorBrewer
59 |
60 | 3. We used the [BinReg 2](http://www.biomedcentral.com/1471-2105/12/443) algorithm to make HER2 signature predictions on TCGA breast cancer samples. BinReg 2 runs on the MatLab platform. We used our HER2 signature datasets as training samples and the TCGA breast cancer datasets as test samples. We used the following parameters: 200 genes, 2 metagenes, quantile normalization (-g 200 -m 2 -q) to minimize the batch effects between training and test samples. The original outputs from BinReg2 are located within the ```Analysis_datasets/10_14_predictions_raw``` directory. A rerun of the HER2 pathway prediction that excludes the two less consistent HER2 training samples is located at ```Analysis_datasets/5_01_predictions_raw```. These output predictions are summarized in the Analysis_datasets directory for further evaluation.
61 |
62 | 4. The code we used to classify TCGA lung adenocarcinoma and squamous carcinoma samples is in Codes/Classify_luad_vs_lusc.R. The outputs of this analysis are located in the ```Analysis_datasets``` directory. The bash script describing additional analysis to identify discordant LUAD samples and differentially expressed genes is located at Scripts/LUSC_LUAD_discordant_analysis.
63 |
64 | 5. Use the ```knitr``` package to compile Analysis/TCGA_24_manuscript_analysis.Rmd. (It is convenient to complete this step within the [RStudio environment](http://www.rstudio.com/).) Also be sure to set the working directory to ```Analysis_datasets```. Our results are stored in the TCGA_24_manuscript_analysis.html file.
65 |
66 | 6. Our analysis datasets and outputs are available [here](https://www.dropbox.com/sh/4e0c8u7jke694tu/AADEQnB5LbCWihb3A5f04O9va?dl=0).
67 |
68 | ## Contact information
69 |
70 | * Mumtahena Rahman. [moom.rahman@utah.edu](mailto:moom.rahman@utah.edu)
71 | * Stephen R Piccolo. [https://piccolo.byu.edu](https://piccolo.byu.edu)
72 |
--------------------------------------------------------------------------------
/Scripts/LUSC_LUAD_discordant_analysis:
--------------------------------------------------------------------------------
#!/bin/bash

# Runs the LUAD-vs-LUSC follow-up analyses: classification, ROC/AUC plots,
# discordant- and inconsistent-prediction reports, and the final plot.
# All paths are relative (../Codes, ../Analysis_datasets), so run this script
# from a directory that is a sibling of Codes and Analysis_datasets.

Rscript --vanilla --max-ppsize=500000 ../Codes/LUSC_vs_LUAD.R

Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt obs LUAD ../Analysis_datasets/TCGA_AllGenes_ROC.pdf "TCGA Level 3 - All Genes"
Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_AllGenes_ROC.pdf "RSubread - All Genes"

Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt obs LUAD ../Analysis_datasets/TCGA_CommonGenes_ROC.pdf "TCGA Level 3 - Common Genes"
Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_CommonGenes_ROC.pdf "RSubread - Common Genes"

# The directory prefix used to be attached to the class label
# ("../Analysis_datasets/LUAD RSubread_..._ROC.pdf"); the arguments now follow
# the same pattern as the four CalcAUC calls above.
Rscript --vanilla ../Codes/CalcAUC.R ../Analysis_datasets/RSubread_NonOverlappingGenes_Predictions.txt obs LUAD ../Analysis_datasets/RSubread_NonOverlappingGenes_ROC.pdf "RSubread - Non-Overlapping Genes"

# NOTE(review): all five calls below name the same output file; whether the
# R script appends to or overwrites it is not visible here -- confirm.
Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt
Rscript --vanilla ../Codes/IdentifyDiscordantPredictions.R ../Analysis_datasets/RSubread_NonOverlappingGenes_Predictions.txt obs pred ../Analysis_datasets/Potentially_Discordant_LUSC_Samples.txt

Rscript --vanilla ../Codes/IdentifyInconsistentPredictions.R ../Analysis_datasets/TCGA_AllGenes_Predictions.txt ../Analysis_datasets/RSubread_AllGenes_Predictions.txt obs pred
Rscript --vanilla ../Codes/IdentifyInconsistentPredictions.R ../Analysis_datasets/TCGA_CommonGenes_Predictions.txt ../Analysis_datasets/RSubread_CommonGenes_Predictions.txt obs pred

Rscript --vanilla ../Codes/PlotDiscordant.R
24 |
--------------------------------------------------------------------------------
/Scripts/normalize_tcga_rsubread:
--------------------------------------------------------------------------------
#!/bin/bash

# Downloads one TCGA RNA-Seq sample via cgquery/gtdownload, extracts its FASTQ
# files, and runs Codes/ProcessRnaSeqFeatureCounts.R on them.
#
# Usage: normalize_tcga_rsubread <sampleIDFile>
#   The base name of <sampleIDFile> is used as the TCGA ID and the file's
#   contents as the analysis ID passed to cgquery.
# Paths (Temp/FASTQ, FPKM, TPM, FeatureCounts, Stats, InProgress, XmlFiles,
# Genome, Codes, cghub.key) are resolved against the current directory.

set -o errexit

sampleIDFile=$1

if [ -z "$sampleIDFile" ] || [ ! -f "$sampleIDFile" ]
then
  echo "Usage: $0 <sampleIDFile>" >&2
  exit 1
fi

tcgaID=$(basename "$sampleIDFile")
analysisID=$(cat "$sampleIDFile")

# An empty analysis ID would turn the cleanup glob below into "$fastqDir/*"
# and delete the downloads of every other sample.
if [ -z "$analysisID" ]
then
  echo "No analysis ID found in $sampleIDFile" >&2
  exit 1
fi

currentDir=$(pwd)
fastqDir=$currentDir/Temp/FASTQ
outFpkmDir=$currentDir/FPKM
outTpmDir=$currentDir/TPM
outFeatureCountsDir=$currentDir/FeatureCounts
outStatsDir=$currentDir/Stats
inProgressFile=$currentDir/InProgress/$tcgaID

rm -fv "$inProgressFile"
touch "$inProgressFile"

# Remove this sample's temporary files and its in-progress marker.
function cleanup {
  rm -rfv "$fastqDir/${analysisID}"*
  rm -rfv "$fastqDir/${tcgaID}"*
  rm -fv "$inProgressFile"
}

trap 'cleanup' TERM INT EXIT

# $outFpkmLogDir and $outTpmLogDir were listed here but never defined, so they
# expanded to nothing; they are dropped.
mkdir -pv "$fastqDir/$tcgaID" "$outFpkmDir" "$outTpmDir" "$outFeatureCountsDir" "$outStatsDir"

echo Downloading $tcgaID
mkdir -p "$currentDir/XmlFiles"
cgquery -o "$currentDir/XmlFiles/$tcgaID.xml" -a "state=live&library_strategy=RNA-Seq&filetype=fasta&analysis_id=${analysisID}"
gtdownload -vv -d "$currentDir/XmlFiles/$tcgaID.xml" -c "$currentDir/cghub.key" --max-children 1 -p "$fastqDir"

echo Rename and extract files $tcgaID
# Expand the glob into an array first: "[ -f glob ]" errors out when the
# pattern matches more than one file.
tarGzFiles=( "$fastqDir/$analysisID"/*.tar.gz )
if [ -f "${tarGzFiles[0]}" ]
then
  mv -v "${tarGzFiles[0]}" "$fastqDir/$tcgaID.tar.gz"
  tar -zxvf "$fastqDir/$tcgaID.tar.gz" -C "$fastqDir/$tcgaID"
  rm -fv "$fastqDir/$tcgaID.tar.gz"
else
  mv -v "$fastqDir/$analysisID"/*.tar "$fastqDir/$tcgaID.tar"
  tar -xvf "$fastqDir/$tcgaID.tar" -C "$fastqDir/$tcgaID"
  rm -fv "$fastqDir/$tcgaID.tar"
fi

# List the extracted FASTQ paths, followed by the literal NULL so that a
# sample with a single FASTQ file gets NULL as its second path.
fastqFileNamesFile=$fastqDir/$tcgaID/FASTQFiles
for f in "$fastqDir/$tcgaID"/*fastq* NULL
do
  echo "$f" >> "$fastqFileNamesFile"
done

fastqFilePath1=$(head -n 1 "$fastqFileNamesFile")
fastqFilePath2=$(head -n 2 "$fastqFileNamesFile" | tail -n 1)

Rscript --vanilla "$currentDir/Codes/ProcessRnaSeqFeatureCounts.R" "$currentDir/Genome/genome.fa" "$fastqFilePath1" "$fastqFilePath2" "$currentDir/Genome/genes.gtf" "$fastqDir/$tcgaID" "$outFpkmDir/$tcgaID" "$outTpmDir/$tcgaID" "$outFeatureCountsDir/$tcgaID" "$outStatsDir/$tcgaID"

rm -fv "$currentDir/XmlFiles/$tcgaID.xml"
60 |
61 |
--------------------------------------------------------------------------------
/Scripts/process_tcga_level_3:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Runs the TCGA RNA-Seq pipeline steps below (MapSplice alignment, BAM
4 | # post-processing, RSEM quantification, quartile normalization) on one FASTQ file.
5 | #
6 | # Usage: process_tcga_level_3 <fastqFile>
7 | #
8 | # Tool and genome paths are relative to the current directory (Software/,
9 | # Genomes/). Intermediate files go to <fastqFile without ".fastq">_rsem/ and the
10 | # final quantification files are moved to Level_3_Temp/.
11 |
12 | fastqFile=$1
13 |
14 | softwareDir=Software/TCGA_RNA_Seq_Pipeline
15 | samTools=$softwareDir/rsem-1.1.13/sam/samtools
16 | bwaDir=$softwareDir/MapSplice_multithreads_12_07/bowtie-0.12.7_fusion
17 | mapspliceDir=$softwareDir/MapSplice_multithreads_12_07/bin
18 | picardDir=$softwareDir/picard-tools-1.82
19 | ubu=$softwareDir/ubu-1.2-jar-with-dependencies.jar
20 | rsemDir=$softwareDir/rsem-1.2.12
21 | bedToolsDir=$softwareDir/bedtools-2.17.0/bin
22 | referenceGenomeRef=Genomes/hg19_M_rCRS_ref
23 | referenceGenomeFile=Genomes/hg19_M_rCRS.fa
24 | # NOTE(review): the "index" variable points at .../chromosomes while the
25 | # "chromosomes dir" variable points at .../ebwt; the two values look swapped.
26 | # Left unchanged here -- confirm against the genome directory layout on disk.
27 | referenceGenomeIndexFile=Genomes/hg19_M_rCRS/chromosomes
28 | referenceChromosomesDir=Genomes/hg19_M_rCRS/ebwt
29 | referenceBedFile=Genomes/unc_hg19.bed
30 | referenceTranscriptsFile=Genomes/hg19_M_rCRS_ref.transcripts.fa
31 |
32 | # Final per-sample result files are collected here.
33 | workingDir=Level_3_Temp
34 |
35 | mkdir -p $workingDir
36 |
37 | # Derive the sample ID and the per-sample output directory from the FASTQ path.
38 | sampleID=`basename $fastqFile`
39 | sampleID=${sampleID/\.fastq/}
40 | outDir=${fastqFile/\.fastq/_rsem}
41 | mkdir -p $outDir
42 | mkdir -p $outDir/working
43 | tmpFastqFile=$outDir/`basename $fastqFile`
44 | outBamFile1=$outDir/alignments.bam
45 | outBamFile2=$outDir/rg_alignments.bam
46 | outBamFile3=$outDir/phred33_alignments.bam
47 | # Passed to "samtools sort" without extension; later steps read ${outBamFile4}.bam.
48 | outBamFile4=$outDir/sorted_genome_alignments
49 | echo processing $sampleID
50 |
51 | #1. Format fastq 1 for Mapsplice
52 | java -Xmx512M -jar $ubu fastq-format --phred33to64 --strip --suffix /1 --in $fastqFile --out $tmpFastqFile> $outDir/working/mapsplice_prep.log
53 | echo preprocessing is done
54 |
55 | #2.Mapsplice
56 | # --chromosome-files-dir takes $referenceChromosomesDir; the former, misspelled $referenceChromsomesFile was never defined and expanded to nothing.
57 | python $mapspliceDir/mapsplice_multi_thread.py --fusion --all-chromosomes-files $referenceGenomeFile -X 8 -Q fq --chromosome-files-dir $referenceChromosomesDir --Bowtieidx $referenceGenomeIndexFile -1 $tmpFastqFile -o $outDir
58 | #echo initial bam file is created now.. deleting the processed FASTQ file
59 | rm $tmpFastqFile
60 |
61 | #3.Add read groups
62 | java -Xmx2G -jar $picardDir/AddOrReplaceReadGroups.jar INPUT=$outBamFile1 OUTPUT=$outBamFile2 RGSM=$sampleID RGID=$sampleID RGLB=TruSeq RGPL=illumina RGPU=barcode VALIDATION_STRINGENCY=SILENT TMP_DIR=$outDir/working/add_rg_tag_tmp > $outDir/working/add_rg_tag.log
63 | echo read groups added or replaced now!
64 |
65 | #4.Convert back to phred33
66 | java -Xmx512M -jar $ubu sam-convert --phred64to33 --in $outBamFile2 --out $outBamFile3 > $outDir/working/sam_convert.log
67 | echo bam file converted back to phred33
68 |
69 | #5.Sort by coordinate
70 | $samTools sort $outBamFile3 $outBamFile4
71 | echo converted Bam file is sorted now
72 |
73 | #6.Flagstat
74 | $samTools flagstat ${outBamFile4}.bam > ${outBamFile4}.flagstat
75 | echo flagstat file created now!
76 |
77 | #7.Index
78 | $samTools index ${outBamFile4}.bam
79 | echo Bam file is indexed now
80 |
81 | #8. Sort By chromosome, then read id
82 | echo using perl script from $softwareDir
83 | perl $softwareDir/sort_bam_by_reference_and_name.pl --input ${outBamFile4}.bam --output $outDir/sorted_by_chr_read.bam --temp-dir ${outDir}.tmp --samtools $samTools > $outDir/working/sorted_by_chr_read.log
84 | echo sorted by chromosome then id
85 |
86 | #9. Translate to transcriptome coors
87 | echo in directory $outDir
88 | java -Xmx3G -jar $ubu sam-xlate --single --bed $referenceBedFile --in $outDir/sorted_by_chr_read.bam --out $outDir/transcriptome_alignments.bam --order $referenceTranscriptsFile --xgtags --reverse > $outDir/working/genome_to_transcriptome.log
89 | echo translation to transcriptome coors done!
90 |
91 | #10. Filter indels, large inserts, zero mapping quality from transcriptome bam $ubu 1.2 version needed for this step to use '--single' parameter
92 | java -Xmx512M -jar $ubu sam-filter --single --in $outDir/transcriptome_alignments.bam --out $outDir/transcriptome_alignments_filtered.bam --strip-indels --max-insert 10000 --mapq 1 > $outDir/working/sam_filter.log
93 | echo Filtered indels, large inserts, zero mapping quality from transcriptome bam
94 |
95 | #11. RSEM
96 | echo starting rsem normalization in $outDir for $sampleID
97 |
98 | $rsemDir/rsem-calculate-expression --bam -p 8 --estimate-rspd --temporary-folder ${outDir}.temp_rsem --no-bam-output $outDir/transcriptome_alignments_filtered.bam $referenceGenomeRef $sampleID > $outDir/working/rsem.log
99 |
100 |
101 | echo data is RSEM normalized
102 |
103 | #12. Strip trailing tabs from rsem.isoforms.results
104 | echo moving output files for $sampleID for final processing...
105 | # Moves everything in the current directory whose name starts with the sample ID (presumably the RSEM output of step 11 -- confirm nothing else matches).
106 | mv ${sampleID}* $workingDir/
107 |
108 | perl $softwareDir/strip_trailing_tabs.pl --input $workingDir/${sampleID}.isoforms.results --temp $outDir/working/${sampleID}.orig.isoforms.results
109 |
110 | #13. Prune isoforms from gene quant file
111 | mv $workingDir/${sampleID}.genes.results $outDir/working/${sampleID}.orig.genes.results; sed /^uc0/d $outDir/working/${sampleID}.orig.genes.results >$workingDir/${sampleID}.genes.results
112 |
113 | #14. Normalize gene quant
114 | perl $softwareDir/quartile_norm.pl -c 5 -q 75 -t 1000 -o $workingDir/${sampleID}.rsem.genes.normalized_results $workingDir/${sampleID}.genes.results
115 |
116 | #16. Normalize isoform quant
117 | perl $softwareDir/quartile_norm.pl -c 5 -q 75 -t 300 -o $workingDir/${sampleID}.rsem.isoforms.normalized_results $workingDir/${sampleID}.isoforms.results
118 |
119 | #********************************************************
120 | #outDir=/data2/u01_hmec_batch01/fastq/f1/FASTQ/f1
121 | #********************************************************
122 | #17. Junction counts
123 | #java -Xmx512M -jar $ubu sam-junc --junctions $softwareDir/splice_junctions.txt --in $outDir/$outDir/sorted_genome_alignments.bam --out $workingDir/${sampleID}.junction_quantification.txt > $outDir/working/${sampleID}_junction_quantification.log
124 |
125 | #18. Exon counts
126 | #$bedToolsDir/coverageBed -split -abam $outDir/sorted_genome_alignments.bam -b $softwareDir/composite_exons.bed | perl $softwareDir/normalizeBedToolsExonQuant.pl $softwareDir/composite_exons.bed > $outDir/${sampleID}.bt.exon_quantification.txt
127 |
128 | #19. Cleanup large intermediate output
129 | #rm alignments.bam logs/* working/phred33_alignments.bam working/rg_alignments.bam working/sorted_by_chr_read.bam working/transcriptome_alignments.bam working/transcriptome_alignments_filtered.bam working/prep_1.fastq working/prep_2.fastq > working/cleanup.log
130 |
--------------------------------------------------------------------------------
/Scripts/process_tcga_rsubread:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Driver for the Rsubread pipeline. Runs a cgquery metadata query, passes the
4 | # result to Codes/ParseCgHubQueryResults.py together with the DownloadSamples
5 | # and CancerTypes directories, then calls Scripts/normalize_tcga_rsubread for
6 | # every entry of DownloadSamples/ that has neither a FeatureCounts/<sampleID>
7 | # file nor an InProgress/<sampleID> marker.
8 | # Paths are relative to the directory the script is started from.
9 |
10 | set -o errexit
11 |
12 | # Working directories used below.
13 | mkdir -p DownloadSamples CancerTypes Temp FeatureCounts InProgress
14 |
15 | cgquery -o Query.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&sample_type=0*&study=phs000178"
16 | #cgquery -o Query.xml -a "state=live&library_strategy=RNA-Seq&filetype=fasta&sample_type=0*&study=phs000178&disease_abbr=DLBC"
17 |
18 | # Drop the output of any previous run before the query result is parsed again.
19 | rm -rfv DownloadSamples/* CancerTypes/*
20 | python Codes/ParseCgHubQueryResults.py Query.xml "" DownloadSamples CancerTypes
21 |
22 | rm -rf Temp/*
23 |
24 | for sampleFile in $(pwd)/DownloadSamples/*
25 | do
26 |     # Sample name = file name with the first ".xml" removed.
27 |     sampleName=$(basename $sampleFile)
28 |     sampleName=${sampleName/\.xml/}
29 |
30 |     if [ -f FeatureCounts/$sampleName ]
31 |     then
32 |         echo $sampleName already processed
33 |     elif [ -f InProgress/$sampleName ]
34 |     then
35 |         echo $sampleName currently being processed
36 |     else
37 |         $(pwd)/Scripts/normalize_tcga_rsubread $sampleFile
38 |     fi
39 | done
40 |
--------------------------------------------------------------------------------
/Scripts/summarize_tcga_rsubread:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | function buildCancerTypesFile {
4 | rm -rf Temp/CancerTypes
5 | mkdir -p Temp/CancerTypes
6 |
7 | for f in FeatureCounts/*
8 | do
9 | sampleID=$(basename $f)
10 | sampleCancerType=$(cat CancerTypes/$sampleID)
11 | cancerTypeMatch=$(grep $sampleCancerType TCGA_CancerTypes_Publishable.txt)
12 |
13 | # Make sure we can publish on this sample
14 | if [ "$cancerTypeMatch" == "$sampleCancerType" ]
15 | then
16 | cp -v CancerTypes/$sampleID Temp/CancerTypes/
17 | fi
18 | done
19 |
20 | python Codes/CombineScalarValues.py "Temp/CancerTypes/*" PANCAN20_CancerType_Samples.txt
21 |
22 | rm -rf Temp/CancerTypes
23 | }
24 |
25 | function matricize {
26 | subDir=$1
27 |
28 | tempSummDir=Temp/Summarize_${subDir}
29 | rm -rf $tempSummDir
30 | mkdir -p $tempSummDir
31 |
32 | for f in $subDir/*
33 | do
34 | sampleID=$(basename $f)
35 | sampleCancerType=$(cat CancerTypes/$sampleID)
36 | cancerTypeMatch=$(grep $sampleCancerType TCGA_CancerTypes_Publishable.txt)
37 |
38 | # Make sure we can publish on this sample
39 | if [ "$cancerTypeMatch" == "$sampleCancerType" ]
40 | then
41 | cp -v $f $tempSummDir/
42 | fi
43 | done
44 |
45 | outFile=matrices/PANCAN20.IlluminaHiSeq_RNASeqV2.tumor_Rsubread_${subDir}.txt
46 |
47 | python Codes/BuildMatrixFile.py "$tempSummDir/*" $outFile
48 | python Codes/PrintMatrixDimensions.py $outFile
49 |
50 | rm -f $outFile.gz
51 |
52 | echo Zipping $outFile
53 | gzip -v $outFile
54 |
55 | rm -rf $tempSummDir
56 | }
57 |
58 | buildCancerTypesFile
59 |
60 | matricize RPKMlog &
61 | matricize RPKM &
62 | matricize FeatureCounts &
63 | wait
64 |
--------------------------------------------------------------------------------
/TCGA_CancerType_Abbreviations.txt:
--------------------------------------------------------------------------------
1 | LAML Acute Myeloid Leukemia
2 | ACC Adrenocortical carcinoma
3 | BLCA Bladder Urothelial Carcinoma
4 | LGG Brain Lower Grade Glioma
5 | BRCA Breast invasive carcinoma
6 | CESC Cervical squamous cell carcinoma and endocervical adenocarcinoma
7 | CHOL Cholangiocarcinoma
8 | LCML Chronic Myelogenous Leukemia
9 | COAD Colon adenocarcinoma
10 | CNTL Controls
11 | ESCA Esophageal carcinoma
12 | GBM Glioblastoma multiforme
13 | HNSC Head and Neck squamous cell carcinoma
14 | KICH Kidney Chromophobe
15 | KIRC Kidney renal clear cell carcinoma
16 | KIRP Kidney renal papillary cell carcinoma
17 | LIHC Liver hepatocellular carcinoma
18 | LUAD Lung adenocarcinoma
19 | LUSC Lung squamous cell carcinoma
20 | DLBC Lymphoid Neoplasm Diffuse Large B-cell Lymphoma
21 | MESO Mesothelioma
22 | MISC Miscellaneous
23 | OV Ovarian serous cystadenocarcinoma
24 | PAAD Pancreatic adenocarcinoma
25 | PCPG Pheochromocytoma and Paraganglioma
26 | PRAD Prostate adenocarcinoma
27 | READ Rectum adenocarcinoma
28 | SARC Sarcoma
29 | SKCM Skin Cutaneous Melanoma
30 | STAD Stomach adenocarcinoma
31 | TGCT Testicular Germ Cell Tumors
32 | THYM Thymoma
33 | THCA Thyroid carcinoma
34 | UCS Uterine Carcinosarcoma
35 | UCEC Uterine Corpus Endometrial Carcinoma
36 | UVM Uveal Melanoma
37 |
--------------------------------------------------------------------------------
/TCGA_CancerType_Publishable.txt:
--------------------------------------------------------------------------------
1 | ACC
2 | BLCA
3 | BRCA
4 | CESC
5 | COAD
6 | DLBC
7 | GBM
8 | HNSC
9 | KICH
10 | KIRC
11 | KIRP
12 | LAML
13 | LGG
14 | LIHC
15 | LUAD
16 | LUSC
17 | OV
18 | PRAD
19 | READ
20 | SKCM
21 | STAD
22 | THCA
23 | UCEC
24 | UCS
25 |
--------------------------------------------------------------------------------