├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── NAMESPACE
├── NEWS.md
├── R
├── CNV.R
├── Merge_methylation.R
├── RNA_seq.R
├── SNP.R
├── TCGA_id_conversion.R
├── arrayDiff.R
├── calculate_mean_module.R
├── calculate_mean_profile.R
├── data.R
├── fpkm_count_conversion.r
├── metap.R
├── prepareChi.r
├── rep.R
└── sysdata.rda
├── README.Rmd
├── README.md
├── data
├── GSE66705_sample2.rda
├── geneExpress.rda
├── gene_cov.rda
├── kegg_liver.rda
├── module.rda
├── profile.rda
└── ventricle.rda
├── docs
├── news
│ └── index.html
├── reference
│ ├── arrayDiff.html
│ └── index.html
└── sitemap.xml
├── inst
└── extdata
│ ├── build_data.R
│ ├── cnv
│ ├── 00de3
│ │ └── HONGS_p_TCGAb3_75_76_77_NSP_G.txt
│ ├── 00e8
│ │ └── MICHE_p_TCGAb_428_429_NS.txt
│ ├── 00f9
│ │ └── MINAE_p_TCGA_200_202_203_S.txt
│ └── 0a01
│ │ └── KNELT_p_TCGA_b123_131_S.txt
│ ├── methy
│ ├── 0a0b4
│ │ ├── jhu-usc.e.H.4.lvl-3.TCGA-13-1405-01A-01D-0460-05.g.txt
│ │ └── logs
│ │ │ └── file1.parcel
│ ├── 0a6b
│ │ ├── jhu-usc.e.H.10.lvl-3.TCGA-30-1880-01A-01D-0652-05.g.txt
│ │ └── logs
│ │ │ └── file2.parcel
│ ├── 0ae7
│ │ ├── jhu-usc.H.8.l.I.TCGA-30-1714-01A-02D-0563-05.g.txt
│ │ └── logs
│ │ │ └── file3.parcel
│ └── 0b32
│ │ ├── jhu-usc.e.H.5.l.TCGA-13-1510-01A-02D-0475-05.gdc_hg38.txt
│ │ └── logs
│ │ └── file4.parcel
│ └── tcga_cli
│ ├── at.orl.TCGA-2V-A95S.xml
│ ├── ati.org.TCGA-2Y-A9GT.xml
│ └── ats.org.TCGA-2Y-A9GS.xml
├── man
├── GSE66705_sample2.Rd
├── Merge_methy_tcga.Rd
├── SNP_QC.Rd
├── array_preprocess.Rd
├── cal_mean_module.Rd
├── cluster_array.Rd
├── combine_pvalue.Rd
├── countToFpkm.Rd
├── countToTpm.Rd
├── differential_RNA.Rd
├── differential_SNP.Rd
├── differential_SNP_GEO.Rd
├── differential_SNP_tcga.Rd
├── differential_array.Rd
├── differential_cnv.Rd
├── differential_limma.Rd
├── differential_methy.Rd
├── fpkmToTpm.Rd
├── geneExpress.Rd
├── gene_ave.Rd
├── gene_cov.Rd
├── get_geo_array.Rd
├── id_conversion_TCGA.Rd
├── kegg_liver.Rd
├── module.Rd
├── prepare_chi.Rd
├── profile.Rd
├── repAssign.Rd
├── repRemove.Rd
└── ventricle.Rd
├── tests
├── testthat.R
└── testthat
│ ├── fpkmToTpm.R
│ ├── test_Merge_methy_tcga.R
│ ├── test_arrayDiff.R
│ ├── test_cal_mean_module.R
│ ├── test_countToFpkm.R
│ ├── test_countToTpm.R
│ ├── test_diff_RNA_ucsc.R
│ ├── test_differential_CNV.R
│ ├── test_differential_RNA.R
│ ├── test_differential_limma.R
│ ├── test_gene_ave.R
│ ├── test_prepare_chi.R
│ ├── test_repAssign.R
│ └── test_repRemove.R
└── vignettes
├── .gitignore
└── GeoTcgaData.Rmd
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^data-raw$
4 | ^README\.Rmd$
5 | ^cran-comments\.md$
6 | ^NEWS\.md$
7 | ^.*\.jpg$
8 | ^\.github$
9 | ^\.json$
10 | ^Thumbs\.db$
11 | ^build_site.R$
12 | ^pkgdown$
13 | ^\.DS_Store
14 | ^_drake\.R$
15 | ^\.drake$
16 | ^\.drake_history$
17 | ^\.future$
18 | ^\.git$
19 | ^\.github$
20 | ^\.gitignore$
21 | ^\.Rapp.history$
22 | ^\.RData$
23 | ^\.Rbuildignore$
24 | ^\.Rhistory$
25 | ^\.lintr$
26 | ^CODE_OF_CONDUCT.md$
27 | ^CONTRIBUTING.md$
28 | ^pkgdown\.R$
29 | ^pkgdown\.sh$
30 | ^paper\.bib$
31 | ^paper\.md$
32 | ^LICENSE$
33 | ^NOTICE$
34 | vignettes/.*\.html
35 | vignettes/.*\.md
36 | vignettes/.*\.log
37 | vignettes/.*_files
38 | ^.*\.css$
39 | ^.*\.gcda$
40 | ^.*\.gcno$
41 | ^.*\.js$
42 | ^.*\.log$
43 | ^.*\.out$
44 | ^.*\.nfs.*$
45 | ^.*\.svg$
46 | ^.*\.yaml$
47 | ^.*\.yml$
48 | ^docs$
49 | ^_pkgdown\.yml$
50 | ^codemeta\.json$
51 | ^\.httr-oauth$
52 | ^CRAN-RELEASE$
53 | CONDUCT.md
54 |
55 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | inst/doc
6 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: GeoTcgaData
2 | Type: Package
3 | Title: Processing Various Types of Data on GEO and TCGA
4 | Version: 1.99.2
5 | Authors@R: person(given = "Erqiang", family = "Hu", email = "13766876214@163.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-1798-7513"))
6 | Description: Gene Expression Omnibus(GEO) and The Cancer Genome Atlas (TCGA)
7 | provide us with a wealth of data, such as RNA-seq, DNA Methylation, SNP
8 | and Copy number variation data. It's easy to download data from TCGA using the
9 | gdc tool, but processing these data into a format suitable for bioinformatics
10 | analysis requires more work. This R package was developed to handle these data.
11 | Depends: R (>= 4.2.0)
12 | License: Artistic-2.0
13 | Encoding: UTF-8
14 | RoxygenNote: 7.2.3
15 | Suggests:
16 | knitr,
17 | rmarkdown,
18 | DESeq2,
19 | S4Vectors,
20 | ChAMP,
21 | impute,
22 | tidyr,
23 | clusterProfiler,
24 | org.Hs.eg.db,
25 | edgeR,
26 | limma,
27 | quantreg,
28 | minfi,
29 | IlluminaHumanMethylation450kanno.ilmn12.hg19,
30 | dearseq,
31 | NOISeq,
32 | testthat (>= 3.0.0),
33 | CATT,
34 | TCGAbiolinks,
35 | enrichplot,
36 | GEOquery,
37 | BiocGenerics
38 | VignetteBuilder: knitr
39 | Imports:
40 | utils,
41 | data.table,
42 | plyr,
43 | cqn,
44 | topconfects,
45 | stats,
46 | SummarizedExperiment,
47 | methods
48 | Language: en-US
49 | URL: https://github.com/YuLab-SMU/GeoTcgaData
50 | BugReports: https://github.com/YuLab-SMU/GeoTcgaData/issues
51 | biocViews: GeneExpression, DifferentialExpression, RNASeq,
52 | CopyNumberVariation, Microarray, Software, DNAMethylation,
53 | DifferentialMethylation, SNP, ATACSeq, MethylationArray
54 | Config/testthat/edition: 3
55 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | export(Merge_methy_tcga)
4 | export(SNP_QC)
5 | export(array_preprocess)
6 | export(cal_mean_module)
7 | export(cluster_array)
8 | export(combine_pvalue)
9 | export(countToFpkm)
10 | export(countToTpm)
11 | export(differential_CNV)
12 | export(differential_RNA)
13 | export(differential_SNP)
14 | export(differential_SNP_GEO)
15 | export(differential_SNP_tcga)
16 | export(differential_array)
17 | export(differential_limma)
18 | export(differential_methy)
19 | export(fpkmToTpm)
20 | export(gene_ave)
21 | export(get_geo_array)
22 | export(id_conversion_TCGA)
23 | export(prepare_chi)
24 | export(repAssign)
25 | export(repRemove)
26 | import(cqn)
27 | import(methods)
28 | importFrom(SummarizedExperiment,assays)
29 | importFrom(SummarizedExperiment,colData)
30 | importFrom(plyr,rename)
31 | importFrom(stats,as.dist)
32 | importFrom(stats,cor)
33 | importFrom(stats,cutree)
34 | importFrom(stats,hclust)
35 | importFrom(stats,na.fail)
36 | importFrom(stats,p.adjust)
37 | importFrom(stats,pchisq)
38 | importFrom(stats,pnorm)
39 | importFrom(stats,qnorm)
40 | importFrom(utils,methods)
41 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # GeoTcgaData 0.99.2
2 |
3 | + use SummarizedExperiment input (2023-1-29, Sun)
4 | + fix return value of `differential_array` (2022-10-8, Sat)
5 | + fix gene length bug in `countToTpm()` and `countToFpkm()`(2022_9_22, Tue)
6 | + fix a bug in `id_conversion` (2022-8-27, Sat)
7 | + fix a bug in `differential_RNA(useTopconfects = TRUE)` (2022-8-12, Fri)
8 | + add function `methydifferential_ucsc` and `methydifferential_limma`(2021-10-24, Sun)
9 | + update hgnc_file data(2021-10-24, Sun)
10 | + add function `differential_RNA` to do difference analysis of RNA-seq data(2021-7-20, Tue)
11 | + add data hgnc_file
12 | + update function: id_ava()
13 | + add functions: ann_merge(), countToFpkm(), countToTpm()
14 |
15 |
--------------------------------------------------------------------------------
/R/CNV.R:
--------------------------------------------------------------------------------
1 | #' Do difference analysis of gene level copy number variation data
2 | #'
3 | #' @param cnvData data.frame of CNV data, each column is a sample,
4 | #' and each row is a CNV.
5 | #' @param sampleGroup vector of sample group
6 | #' @param method method to do differential analysis,
7 | #' one of "Chisquare", "fisher",
8 | #' and "CATT"(Cochran-Armitage trend test)
9 | #' @param adjust.method adjust.method, one of "holm", "hochberg", "hommel",
10 | #' "bonferroni", "BH", "BY", "fdr", and "none".
11 | #' @param ... parameters for "Chisquare", "fisher",
12 | #' and "CATT"(Cochran-Armitage trend test)
13 | #' @return data.frame with pvalue and estimate
14 | #' @export
15 | #'
16 | #' @examples
17 | #' \donttest{
18 | #' # use TCGAbiolinks data as example
19 | #' library(TCGAbiolinks)
20 | #' query <- GDCquery(
21 | #' project = "TCGA-ACC",
22 | #' data.category = "Copy Number Variation",
23 | #' data.type = "Gene Level Copy Number",
24 | #' access = "open"
25 | #' )
26 | #' GDCdownload(query)
27 | #' cnvData <- GDCprepare(query)
28 | #' aa <- assays(cnvData)$copy_number
29 | #' bb <- aa
30 | #' aa[bb == 2] <- 0
31 | #' aa[bb < 2] <- -1
32 | #' aa[bb > 2] <- 1
33 | #' sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE)
34 | #' diffCnv <- differential_CNV(aa, sampleGroup)
35 | #'
36 | #' # Use sangerbox CNV data as example
37 | #' cnvData <- fread("Merge_GeneLevelCopyNumber.txt")
38 | #' class(cnvData) <- "data.frame"
39 | #' rownames(cnvData) <- cnvData[, 1]
40 | #' cnvData <- cnvData[, -c(1, 2, 3)]
41 | #' sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE)
42 | #' diffCnv <- differential_CNV(cnvData, sampleGroup)
43 | #' }
44 | #' # use random data as example
45 | #' aa <- matrix(sample(c(0, 1, -1), 200, replace = TRUE), 25, 8)
46 | #' rownames(aa) <- paste0("gene", 1:25)
47 | #' colnames(aa) <- paste0("sample", 1:8)
48 | #' sampleGroup <- sample(c("A", "B"), ncol(aa), replace = TRUE)
49 | #' diffCnv <- differential_CNV(aa, sampleGroup)
differential_CNV <- function(cnvData, sampleGroup,
                            method = "Chisquare",
                            adjust.method = "BH", ...) {
    # Per-gene test of association between CNV state (-1 / 0 / 1) and the
    # first two distinct labels found in `sampleGroup`.
    # Returns a data.frame with columns gene, P.Value, adj.P.Val and
    # estimate, one row per row of `cnvData`.
    groups <- unique(sampleGroup)
    type1 <- which(sampleGroup == groups[1])
    type2 <- which(sampleGroup == groups[2])

    # Values reported for genes that cannot be tested.
    pvalue <- rep(1, nrow(cnvData))
    estimate <- rep(0, nrow(cnvData))
    states <- c("-1", "0", "1")

    for (i in seq_len(nrow(cnvData))) {
        type1_freq <- table(as.character(cnvData[i, type1]))
        type2_freq <- table(as.character(cnvData[i, type2]))
        # Contingency table: one row per CNV state, one column per group.
        # States absent from a group index as NA and are counted as 0.
        df <- data.frame(
            type1 = as.numeric(type1_freq[states]),
            type2 = as.numeric(type2_freq[states])
        )
        df[is.na(df)] <- 0
        df <- df[rowSums(df) > 0, ]

        # At least two observed states are required for a test. The guard
        # used to be `> 2`, which skipped all two-state genes and made the
        # odds-ratio branch below unreachable.
        if (nrow(df) > 1) {
            if (method == "fisher") {
                fish <- stats::fisher.test(df, ...)
                pvalue[i] <- fish$p.value
                # fisher.test only reports an odds ratio for 2 x 2 tables.
                if (nrow(df) == 2) {
                    estimate[i] <- fish$estimate
                }
            }

            if (method == "Chisquare") {
                pvalue[i] <- stats::chisq.test(df, ...)$p.value
            }

            if (method == "CATT") {
                pvalue[i] <- CATT::CATT(table = t(df), ...)$p.value
            }
        }
    }

    adj.P.Val <- stats::p.adjust(pvalue, method = adjust.method)
    # Drop everything after the first dot of the row name
    # (e.g. a version suffix).
    gene <- gsub("\\..*", "", rownames(cnvData))
    result <- data.frame(gene = gene, P.Value = pvalue,
        adj.P.Val = adj.P.Val, estimate = estimate)
    rownames(result) <- gene
    result
}
92 |
93 |
--------------------------------------------------------------------------------
/R/Merge_methylation.R:
--------------------------------------------------------------------------------
1 | #' Get methylation difference gene
2 | #'
3 | #' @title differential_methy
4 | #' @rdname differential_methy
5 | #' @param cpgData data.frame of cpg beta value, or SummarizedExperiment object
6 | #' @param sampleGroup vector of sample group
7 | #' @param groupCol group column
8 | #' @param combineMethod method to combine the cpg pvalues,
9 | #' a function or one of "stouffer", "fisher" and "rhoScores".
10 | #' @param missing_value Method to impute missing expression data,
11 | #' one of "zero" and "knn".
12 | #' @param cpg2gene data.frame to annotate cpg locus to gene
13 | #' @param normMethod Method to do normalization: "PBC" or "BMIQ".
14 | #' @param region region of genes, one of "Body", "TSS1500", "TSS200",
15 | #' "3'UTR", "1stExon", "5'UTR", and "IGR". Only used when cpg2gene is NULL.
16 | #' @param model if "cpg", step1: calculate difference cpgs;
17 | #' step2: calculate difference genes.
18 | #' if "gene", step1: calculate the methylation level of genes;
19 | #' step2: calculate difference genes.
20 | #' @param adjust.method character string specifying the method
21 | #' used to adjust p-values for multiple testing.
22 | #' See \link{p.adjust} for possible values.
23 | #' @param ucscData Logical, whether the data comes from UCSC Xena.
24 | #' @param adjPvalCutoff adjusted pvalue cutoff
25 | #' @importFrom stats p.adjust
26 | #' @return data.frame
27 | #' @export
28 | #' @examples
29 | #' \donttest{
30 | #' # use TCGAbiolinks data
31 | #' library(TCGAbiolinks)
32 | #' query <- GDCquery(project = "TCGA-ACC",
33 | #' data.category = "DNA Methylation",
34 | #' data.type = "Methylation Beta Value",
35 | #' platform = "Illumina Human Methylation 450")
36 | #' GDCdownload(query, method = "api", files.per.chunk = 5,
37 | #' directory = Your_Path)
38 | #' merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data)
39 | #' library(ChAMP) # To avoid reporting errors
40 | #' differential_gene <- differential_methy(cpgData = merge_result,
41 | #' sampleGroup = sample(c("C","T"),
42 | #' ncol(merge_result[[1]]), replace = TRUE))
43 | #' }
44 | #' # use user defined data
45 | #' library(ChAMP)
46 | #' cpgData <- matrix(runif(2000), nrow = 200, ncol = 10)
47 | #' rownames(cpgData) <- paste0("cpg", seq_len(200))
48 | #' colnames(cpgData) <- paste0("sample", seq_len(10))
49 | #' sampleGroup <- c(rep("group1", 5), rep("group2", 5))
50 | #' names(sampleGroup) <- colnames(cpgData)
51 | #' cpg2gene <- data.frame(cpg = rownames(cpgData),
52 | #' gene = rep(paste0("gene", seq_len(20)), 10))
53 | #' result <- differential_methy(cpgData, sampleGroup,
54 | #' cpg2gene = cpg2gene, normMethod = NULL)
55 | #' # use SummarizedExperiment object input
56 | #' library(ChAMP)
57 | #' cpgData <- matrix(runif(2000), nrow = 200, ncol = 10)
58 | #' rownames(cpgData) <- paste0("cpg", seq_len(200))
59 | #' colnames(cpgData) <- paste0("sample", seq_len(10))
60 | #' sampleGroup <- c(rep("group1", 5), rep("group2", 5))
61 | #' names(sampleGroup) <- colnames(cpgData)
62 | #' cpg2gene <- data.frame(cpg = rownames(cpgData),
63 | #' gene = rep(paste0("gene", seq_len(20)), 10))
64 | #' colData <- S4Vectors::DataFrame(
65 | #' row.names = colnames(cpgData),
66 | #' group = sampleGroup
67 | #' )
68 | #' data <- SummarizedExperiment::SummarizedExperiment(
69 | #' assays=S4Vectors::SimpleList(counts=cpgData),
70 | #' colData = colData)
71 | #' result <- differential_methy(cpgData = data,
72 | #' groupCol = "group", normMethod = NULL,
73 | #' cpg2gene = cpg2gene)
differential_methy <- function(cpgData, sampleGroup,
                                groupCol,
                                combineMethod = "stouffer",
                                missing_value = "knn",
                                cpg2gene = NULL,
                                normMethod = "PBC",
                                region = "TSS1500",
                                model = "gene",
                                adjust.method = "BH",
                                adjPvalCutoff = 0.05,
                                ucscData = FALSE) {
    # Gene-level differential methylation between two sample groups.
    # Steps: unwrap the input, impute missing values, optionally normalise,
    # map CpGs to genes, then either
    #   model = "gene": average CpGs per gene and test genes with limma, or
    #   model = "cpg" : test CpGs with ChAMP and combine p-values per gene.
    # Returns the rows whose adjusted p-value is below `adjPvalCutoff`.
    region <- match.arg(region, c("Body", "TSS1500", "TSS200",
        "3'UTR", "1stExon", "5'UTR", "IGR"))
    model <- match.arg(model, c("cpg", "gene"))

    if (inherits(cpgData, "SummarizedExperiment")) {
        cpgData2 <- cpgData
        cpgData <- assays(cpgData2)$counts
        sampleGroup <- colData(cpgData2)[, groupCol]
        names(sampleGroup) <- rownames(colData(cpgData2))
    } else if (inherits(cpgData, "list")) {
        # Output of Merge_methy_tcga(): the beta matrix is the first element.
        # (This branch used to re-test for SummarizedExperiment and was
        # therefore unreachable.)
        cpgData <- cpgData[[1]]
    }

    if (ucscData) {
        # The first column holds the CpG identifiers; move it to row names.
        # (This block used to operate on an undefined object `methy`.)
        class(cpgData) <- "data.frame"
        rownames(cpgData) <- cpgData[, 1]
        cpgData <- cpgData[, -1]
        if (missing(sampleGroup) || is.null(sampleGroup)) {
            # Fall back to the first character of the 4th "-"-separated
            # field of each column name.
            # NOTE(review): presumably the TCGA barcode sample-type code;
            # confirm against the UCSC Xena column naming.
            sampleType <- vapply(strsplit(colnames(cpgData), "-"),
                function(x) x[4], character(1))
            sampleGroup <- substring(sampleType, 1, 1)
        }
    }

    cpgData <- as.matrix(cpgData)
    # Fill in missing values: zeros on request, otherwise KNN imputation.
    if (missing_value == "zero") {
        cpgData[is.na(cpgData)] <- 0
        data.m <- cpgData
    } else {
        data.m <- impute::impute.knn(cpgData)$data
    }

    # Normalise unless the caller disabled it with normMethod = NULL.
    myNorm <- data.m
    if (!is.null(normMethod)) {
        myNorm <- ChAMP::champ.norm(beta = data.m, rgSet = NULL,
            mset = NULL, method = normMethod)
    }

    # CpG -> gene annotation: user supplied, or built for `region`.
    if (!is.null(cpg2gene)) {
        cpg_gene <- cpg2gene
    } else {
        cpg_gene <- get_cpg_annotation(region = region)
    }

    if (model == "gene") {
        # One row per CpG, with all its genes joined by ";".
        cpg_gene <- split(cpg_gene[, 2], cpg_gene[, 1])
        genes <- unlist(lapply(cpg_gene, function(x) {
            paste(x, collapse = ";")
        }))
        cpg_gene <- data.frame(cpg = names(cpg_gene), gene = genes)
        rownames(cpg_gene) <- cpg_gene[, 1]
        myNorm <- as.data.frame(myNorm)
        myNorm$gene <- cpg_gene[rownames(myNorm), 2]
        # Move the gene column to the front and drop unannotated CpGs.
        myNorm <- myNorm[, c(ncol(myNorm), seq_len(ncol(myNorm) - 1))]
        myNorm <- myNorm[!is.na(myNorm$gene), ]
        myNorm$gene <- as.character(myNorm$gene)

        # repAssign() and gene_ave() are package helpers defined in
        # other files.
        myNorm2 <- repAssign(myNorm, ";")
        myNorm3 <- gene_ave(myNorm2)

        ## use limma to do differential expression analysis
        gene_pvalue <- differential_limma(myNorm3, group = sampleGroup,
            adjust.method = adjust.method)
        gene_pvalue$gene <- rownames(gene_pvalue)
    } else {
        # Identify Differential Methylation Positions (DMP)
        myDMP <- ChAMP::champ.DMP(beta = myNorm,
            pheno = sampleGroup, adjPVal = 1)
        myDMP <- as.data.frame(myDMP)

        # Annotate CpGs with their p-value (4th column of the DMP table).
        pvalues <- cpg_gene
        pvalues$pvalue <- myDMP[cpg_gene[, 1], 4]
        pvalues <- pvalues[!is.na(pvalues$pvalue), ]
        # Address CpG and gene by position so a user `cpg2gene` need not
        # use the column names "cpg" and "gene".
        cpgIds <- pvalues[, 1]
        geneIds <- pvalues[, 2]

        if (is.function(combineMethod)) {
            # Select the p-value column by name: its position is 3 for a
            # two-column annotation and 4 for the built-in one.
            gene_pvalue <- stats::aggregate(pvalues$pvalue,
                by = list(geneIds),
                FUN = combineMethod
            )
            colnames(gene_pvalue) <- c("gene", "pvalue")
        } else {
            bb <- split(pvalues$pvalue, geneIds)
            # Default: the first CpG p-value; kept for single-CpG genes.
            gene_pvalue <- data.frame(gene = names(bb),
                pvalue = unlist(lapply(bb, function(x) x[1])))
            if (combineMethod == "stouffer") {
                # Weight each CpG by 1 / sum of its squared correlations
                # with the other CpGs of the same gene.
                myBetas <- myNorm[cpgIds, ]
                myBetas <- split(as.data.frame(myBetas), geneIds)
                correl <- lapply(myBetas, function(x) stats::cor(t(x)))
                weights <- lapply(correl, function(x) 1 / rowSums(x^2))

                for (i in seq_len(nrow(gene_pvalue))) {
                    if (length(bb[[i]]) > 1) {
                        gene_pvalue[i, 2] <- sumz(bb[[i]], weights[[i]])$p
                    }
                }
            }

            if (combineMethod == "fisher") {
                for (i in seq_len(nrow(gene_pvalue))) {
                    if (length(bb[[i]]) > 1) {
                        gene_pvalue[i, 2] <- sumlog(bb[[i]])$p
                    }
                }
            }
        }

        # logFC of a gene: difference of the group means of its CpG average.
        myNorm2 <- myNorm[cpgIds, ]
        myNorm2 <- stats::aggregate(myNorm2,
            by = list(geneIds), FUN = mean)

        myNorm2 <- myNorm2[myNorm2[, 1] != "", ]
        rownames(myNorm2) <- myNorm2[, 1]
        myNorm2 <- myNorm2[, -1]
        groups <- sort(unique(sampleGroup))
        mean1 <- rowMeans(myNorm2[, sampleGroup == groups[1]], na.rm = TRUE)
        mean2 <- rowMeans(myNorm2[, sampleGroup == groups[2]], na.rm = TRUE)
        logFC <- mean1 - mean2

        gene_pvalue$logFC <- logFC[gene_pvalue[, 1]]
        colnames(gene_pvalue) <- c("gene", "P.Value", "logFC")
        gene_pvalue$gene <- as.character(gene_pvalue$gene)
        gene_pvalue$adj.P.Val <- p.adjust(gene_pvalue$P.Value,
            method = adjust.method)
        rownames(gene_pvalue) <- gene_pvalue$gene
    }
    gene_pvalue <- gene_pvalue[gene_pvalue$adj.P.Val < adjPvalCutoff, ]
    return(gene_pvalue)
}
233 |
234 |
235 |
236 | #' differential_limma
237 | #'
238 | #' @param df data.frame of the omic data
239 | #' @param group a vector, group of samples.
240 | #' @param adjust.method adjust.method.
241 | #' @return data.frame
242 | #' @export
243 | #' @examples
244 | #' df <- matrix(runif(200), 25, 8)
245 | #' df <- as.data.frame(df)
246 | #' rownames(df) <- paste0("gene", 1:25)
247 | #' colnames(df) <- paste0("sample", 1:8)
248 | #' group <- sample(c("group1", "group2"), 8, replace = TRUE)
249 | #' result <- differential_limma(df = df, group = group)
differential_limma <- function(df, group, adjust.method = "BH") {
    # Two-group limma analysis of `df` (rows = features, columns = samples).
    # The contrast is the first distinct value of `group` minus the second.
    # Returns limma::topTable() output for all features.
    group <- as.character(group)
    groups <- unique(group)
    if (length(groups) != 2) {
        stop("`group` must contain exactly two distinct values.",
            call. = FALSE)
    }

    # Relabel with syntactically valid names, because raw labels such as
    # "0"/"1" are rejected when used as contrast level names. Exact matching
    # is used instead of gsub(): gsub() treated the labels as regular
    # expressions and rewrote substrings (e.g. "T" inside "NT").
    label <- ifelse(group == groups[1], "normal", "disease")
    label <- factor(label, levels = c("disease", "normal"))
    design <- stats::model.matrix(~ 0 + label)
    colnames(design) <- levels(label)
    contrast.matrix <- limma::makeContrasts(
        contrasts = "normal-disease",
        levels = colnames(design)
    )

    fit <- limma::lmFit(df, design)
    fit <- limma::contrasts.fit(fit, contrast.matrix)
    fit <- limma::eBayes(fit)
    limma::topTable(fit, adjust.method = adjust.method, number = Inf)
}
280 |
281 | #' Merge methylation data downloaded from TCGA
282 | #'
283 | #' When the methylation data is downloaded from TCGA,
284 | #' each sample is saved in a folder, which contains the methylation value file
285 | #' and the descriptive file. This function can directly
286 | #' extract and consolidate all folders.
287 | #' @param dirr a string for the directory of methylation data downloaded
288 | #' from TCGA using the gdc tool
289 | #' @return a matrix, a combined methylation expression spectrum matrix
290 | #' @export
291 | #'
292 | #' @examples
293 | #' merge_result <- Merge_methy_tcga(system.file(file.path("extdata", "methy"),
294 | #' package = "GeoTcgaData"))
Merge_methy_tcga <- function(dirr = NULL) {
    # Combine the per-sample folders under `dirr` into one matrix.
    # Returns list(methyResult = <CpG x sample matrix>,
    #              cpg_info    = <remaining columns of the last file read>).
    if (is.null(dirr)) stop("please give your directory of methylation data!")

    # Warnings are silenced only for the duration of this call; the
    # caller's setting is restored on exit, including on error.
    old_opts <- options(warn = -1)
    on.exit(options(old_opts), add = TRUE)

    tcga_dir <- dir(dirr)
    if (length(tcga_dir) == 0) {
        stop("no sample folders found in ", dirr, call. = FALSE)
    }

    # The first sample fixes the number of rows and the row names.
    methyFile <- get_methy_df(file.path(dirr, tcga_dir[1]))
    methyResult <- matrix(0, nrow = nrow(methyFile), ncol = length(tcga_dir))
    rownames(methyResult) <- methyFile[, "Composite Element REF"]
    samples <- character(length(tcga_dir))
    methyResult[, 1] <- methyFile[, 2]
    samples[1] <- colnames(methyFile)[2]
    message("file", 1, " is over")

    # seq_along()[-1] is empty for a single folder, unlike 2:length().
    for (i in seq_along(tcga_dir)[-1]) {
        methyFile <- get_methy_df(file.path(dirr, tcga_dir[i]))
        # Values are copied by position, not matched by CpG name.
        methyResult[, i] <- methyFile[, 2]
        samples[i] <- colnames(methyFile)[2]
        message("file", i, " is over")
    }
    colnames(methyResult) <- samples
    cpg_info <- methyFile[, -2]
    list(methyResult = methyResult, cpg_info = cpg_info)
}
320 |
321 | #' Read methylated data file and turn it into data frame
322 | #'
323 | #' @param filePath Path of files
324 | #' @return data.frame
325 | #' @noRd
get_methy_df <- function(filePath) {
    # Locate the methylation value file (name contains "jhu-usc") inside
    # one sample folder and read it as a data.frame.
    methyDir <- dir(filePath)
    hits <- grep("jhu-usc", methyDir, value = TRUE)
    if (length(hits) == 0) {
        stop("no 'jhu-usc' methylation file found in ", filePath,
            call. = FALSE)
    }
    # If several files match, the last one is used.
    target <- hits[length(hits)]
    file_name <- file.path(filePath, target)
    # The sample name is the 6th dot-separated field of the file name.
    sample <- unlist(strsplit(target, "\\."))[6]

    methyFile <- data.table::fread(file_name, header = TRUE)
    class(methyFile) <- "data.frame"
    # The second column is renamed to the sample name.
    colnames(methyFile)[2] <- sample
    methyFile
}
339 |
340 |
get_cpg_annotation <- function(region = "TSS1500") {
    # Build a CpG -> gene table from the
    # IlluminaHumanMethylation450kanno.ilmn12.hg19 annotation, keeping the
    # rows whose region matches `region` (used as a grep pattern).
    # Returns a de-duplicated data.frame with columns cpg, gene and region.
    annotation <- as.data.frame(minfi::getAnnotation(
        IlluminaHumanMethylation450kanno.ilmn12.hg19::IlluminaHumanMethylation450kanno.ilmn12.hg19
    ))
    mapped <- annotation[,
        c("Name", "UCSC_RefGene_Name", "UCSC_RefGene_Group")]
    # Drop probes without any annotated gene.
    mapped <- mapped[mapped[, 2] != "", ]

    # Gene names and regions are ";"-separated lists; expand them to one
    # row per entry, repeating the probe name once per gene.
    genesPerCpg <- strsplit(mapped[, 2], ";")
    regionsPerCpg <- strsplit(mapped[, 3], ";")
    expanded <- data.frame(
        cpg = rep(mapped[, 1], times = lengths(genesPerCpg)),
        gene = unlist(genesPerCpg),
        region = unlist(regionsPerCpg)
    )

    hits <- grep(region, expanded$region)
    return(unique(expanded[hits, ]))
}
359 |
--------------------------------------------------------------------------------
/R/RNA_seq.R:
--------------------------------------------------------------------------------
1 | #' Do difference analysis of RNA-seq data
2 | #'
3 | #' @title differential_RNA
4 | #' @rdname differential_RNA
5 | #' @param counts a dataframe or numeric matrix of raw counts data,
6 | #' or SummarizedExperiment object
7 | #' @param group sample groups
8 | #' @param groupCol group column
9 | #' @param method one of "DESeq2", "edgeR" , "limma", "dearseq",
10 | #' "NOISeq", "Wilcoxon", and "auto".
11 | #' @param geneLength a vector of gene length.
12 | #' @param gccontent a vector of gene GC content.
13 | #' @param filter if TRUE, use filterByExpr to filter genes.
14 | #' @param edgeRNorm if TRUE, use edgeR to do normalization for dearseq method.
15 | #' @param adjust.method character string specifying the method used to
16 | #' adjust p-values for multiple testing.
17 | #' See \link{p.adjust} for possible values.
18 | #' @param useTopconfects if TRUE, use topconfects to provide a
19 | #' more biologically useful ranked gene list.
20 | #' @param ucscData Logical, whether the data comes from UCSC Xena.
21 | #' @importFrom plyr rename
22 | #' @importFrom SummarizedExperiment assays
23 | #' @importFrom SummarizedExperiment colData
24 | #' @importFrom utils methods
25 | #' @import methods
26 | #' @import cqn
27 | #' @return data.frame
28 | #' @export
29 | #'
30 | #' @examples
31 | #' \donttest{
32 | #' # use `TCGAbiolinks` to download TCGA data
33 | #' library(TCGAbiolinks)
34 | #'
35 | #' query <- GDCquery(
36 | #' project = "TCGA-ACC",
37 | #' data.category = "Transcriptome Profiling",
38 | #' data.type = "Gene Expression Quantification",
39 | #' workflow.type = "STAR - Counts"
40 | #' )
41 | #'
42 | #' GDCdownload(query,
43 | #' method = "api", files.per.chunk = 3,
44 | #' directory = Your_Path
45 | #' )
46 | #'
47 | #' dataRNA <- GDCprepare(
48 | #' query = query, directory = Your_Path,
49 | #' save = TRUE, save.filename = "dataRNA.RData"
50 | #' )
51 | #' ## get raw count matrix
52 | #' dataPrep <- TCGAanalyze_Preprocessing(
53 | #' object = dataRNA,
54 | #' cor.cut = 0.6,
55 | #' datatype = "STAR - Counts"
56 | #' )
57 | #'
58 | #' # Use `differential_RNA` to do difference analysis.
59 | #' # We provide the data of human gene length and GC content in `gene_cov`.
60 | #' group <- sample(c("grp1", "grp2"), ncol(dataPrep), replace = TRUE)
61 | #' library(cqn) # To avoid reporting errors: there is no function "rq"
62 | #' ## get gene length and GC content
63 | #' library(org.Hs.eg.db)
64 | #' genes_bitr <- bitr(rownames(gene_cov),
65 | #' fromType = "ENTREZID", toType = "ENSEMBL",
66 | #' OrgDb = org.Hs.eg.db, drop = TRUE
67 | #' )
68 | #' genes_bitr <- genes_bitr[!duplicated(genes_bitr[, 2]), ]
69 | #' gene_cov2 <- gene_cov[genes_bitr$ENTREZID, ]
70 | #' rownames(gene_cov2) <- genes_bitr$ENSEMBL
71 | #' genes <- intersect(rownames(dataPrep), rownames(gene_cov2))
72 | #' dataPrep <- dataPrep[genes, ]
73 | #' geneLength <- gene_cov2[genes, "length"]
74 | #' gccontent <- gene_cov2[genes, "GC"]
75 | #' names(geneLength) <- names(gccontent) <- genes
76 | #' ## Difference analysis
77 | #' DEGAll <- differential_RNA(
78 | #' counts = dataPrep, group = group,
79 | #' geneLength = geneLength, gccontent = gccontent
80 | #' )
81 | #' # Use `clusterProfiler` to do enrichment analytics:
82 | #' diffGenes <- DEGAll$logFC
83 | #' names(diffGenes) <- rownames(DEGAll)
84 | #' diffGenes <- sort(diffGenes, decreasing = TRUE)
85 | #' library(clusterProfiler)
86 | #' library(enrichplot)
87 | #' library(org.Hs.eg.db)
88 | #' gsego <- gseGO(gene = diffGenes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL")
89 | #' dotplot(gsego)
90 | #' }
91 | #' # use user-defined data
92 | #' df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16)
93 | #' df <- as.data.frame(df)
94 | #' rownames(df) <- paste0("gene", 1:25)
95 | #' colnames(df) <- paste0("sample", 1:16)
96 | #' group <- sample(c("group1", "group2"), 16, replace = TRUE)
97 | #' result <- differential_RNA(counts = df, group = group,
98 | #' filter = FALSE, method = "Wilcoxon")
99 | #' # use SummarizedExperiment object input
100 | #' df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16)
101 | #' rownames(df) <- paste0("gene", 1:25)
102 | #' colnames(df) <- paste0("sample", 1:16)
103 | #' group <- sample(c("group1", "group2"), 16, replace = TRUE)
104 | #'
105 | #' nrows <- 200; ncols <- 20
106 | #' counts <- matrix(
107 | #' runif(nrows * ncols, 1, 1e4), nrows,
108 | #' dimnames = list(paste0("cg",1:200),paste0("S",1:20))
109 | #' )
110 | #'
111 | #' colData <- S4Vectors::DataFrame(
112 | #' row.names = paste0("sample", 1:16),
113 | #' group = group
114 | #' )
115 | #' data <- SummarizedExperiment::SummarizedExperiment(
116 | #' assays=S4Vectors::SimpleList(counts=df),
117 | #' colData = colData)
118 | #'
119 | #' result <- differential_RNA(counts = data, groupCol = "group",
120 | #' filter = FALSE, method = "Wilcoxon")
121 | #' @importFrom plyr rename
122 | #' @import cqn
differential_RNA <- function(counts, group, groupCol, method = "limma",
                             geneLength = NULL,
                             gccontent = NULL, filter = TRUE, edgeRNorm = TRUE,
                             adjust.method = "BH", useTopconfects = TRUE,
                             ucscData = FALSE) {
    # Differential expression analysis of a count matrix between the sample
    # groups in `group`, using the back end selected by `method`.
    # Returns a data.frame with (at least) the columns P.Value and adj.P.Val;
    # rows whose P.Value is NA are removed.
    method <- match.arg(method, c(
        "DESeq2", "edgeR", "limma",
        "dearseq", "Wilcoxon", "NOISeq", "auto"
    ))

    if (ucscData) {
        # First column holds versioned gene ids (everything after the first
        # "." is stripped); values are back-transformed with 2^x - 1.
        counts <- as.data.frame(counts)
        counts[, 1] <- gsub("\\..*", "", counts[, 1])
        rownames(counts) <- counts[, 1]
        counts <- counts[, -1]
        counts <- round(2^counts) - 1
    }

    if (inherits(counts, "SummarizedExperiment")) {
        se <- counts
        counts <- SummarizedExperiment::assays(se)$counts
        sampleInfo <- SummarizedExperiment::colData(se)
        group <- sampleInfo[, groupCol]
        names(group) <- rownames(sampleInfo)
    }

    # Keep the first occurrence of every sample. Skipped when there are no
    # column names, otherwise duplicated(NULL) would remove every column.
    if (!is.null(colnames(counts))) {
        cols <- !duplicated(colnames(counts))
        counts <- counts[, cols, drop = FALSE]
        group <- group[cols]
    }

    # Only "auto" picks a method from the group sizes; an explicitly
    # requested method must never be overridden.
    if (method == "auto") {
        method <- if (min(table(group)) > 4) "Wilcoxon" else "limma"
    }

    ## use cqn to correct the bias
    correct <- !(is.null(geneLength) || is.null(gccontent))
    uCovar <- NULL
    if (correct) {
        # restrict to genes that have counts, a length and a GC content
        genes_gc <- Reduce(intersect, list(
            rownames(counts), names(geneLength), names(gccontent)
        ))
        uCovar <- data.frame(
            length = geneLength[genes_gc],
            gccontent = gccontent[genes_gc]
        )
        rownames(uCovar) <- genes_gc
        counts <- counts[genes_gc, , drop = FALSE]
    }
    d.mont <- edgeR::DGEList(counts = counts, group = group, genes = uCovar)
    if (filter) {
        keep <- edgeR::filterByExpr(d.mont)
        d.mont <- d.mont[keep, keep.lib.sizes = FALSE]
        counts <- counts[keep, , drop = FALSE]
    }
    if (correct) {
        geneLength <- geneLength[rownames(counts)]
        gccontent <- gccontent[rownames(counts)]
        cqn.subset <- cqn::cqn(counts, lengths = geneLength, x = gccontent)
    }

    if (method == "DESeq2") {
        coldata <- data.frame(group)

        dds <- DESeq2::DESeqDataSetFromMatrix(
            countData = counts,
            colData = coldata, design = ~group
        )

        if (correct) {
            cqnOffset <- cqn.subset$glm.offset
            cqnNormFactors <- exp(cqnOffset)
            ## divide out the geometric mean
            ## https://support.bioconductor.org/p/89239/
            ## https://support.bioconductor.org/p/95683/
            normFactors <- cqnNormFactors / exp(rowMeans(log(cqnNormFactors)))
            DESeq2::normalizationFactors(dds) <- normFactors
        }
        DEGAll <- DESeq2::DESeq(dds)
        DEGAll_table <- NULL
        if (useTopconfects) {
            DEGAll_table <-
                topconfects::deseq2_confects(DEGAll, step = 0.05)$table
            rownames(DEGAll_table) <- DEGAll_table$name
        }
        DEGAll <- DEGAll |>
            DESeq2::results(pAdjustMethod = adjust.method) |>
            as.data.frame() |>
            rename(c("log2FoldChange" = "logFC")) |>
            rename(c("pvalue" = "P.Value")) |>
            rename(c("padj" = "adj.P.Val"))
        DEGAll$length <- geneLength[rownames(DEGAll)]
        DEGAll$gccontent <- gccontent[rownames(DEGAll)]
        # move the last two columns to the front
        DEGAll <- DEGAll[, c(
            ncol(DEGAll) - 1, ncol(DEGAll),
            seq_len(ncol(DEGAll) - 2)
        )]
        if (!is.null(DEGAll_table)) {
            genes <- intersect(rownames(DEGAll), rownames(DEGAll_table))
            DEGAll <- cbind(DEGAll[genes, ], DEGAll_table[genes, ])
            DEGAll <- DEGAll[order(DEGAll$P.Value), ]
        }
    } else {
        if (correct) {
            ## with cqn, there is no need to normalize
            ## using the normalization tools
            ## from edgeR, such as calcNormFactors.
            d.mont$offset <- cqn.subset$glm.offset
        } else {
            ## TMM Normalization
            d.mont <- edgeR::calcNormFactors(d.mont, method = "TMM")
        }

        if (method == "edgeR") {
            design <- stats::model.matrix(~ d.mont$samples$group)
            if (min(table(d.mont$samples$group)) > 1) {
                d.mont <- edgeR::estimateDisp(d.mont, design) |>
                    edgeR::estimateGLMCommonDisp(design = design)
                DEGAll <- edgeR::glmQLFit(d.mont, design = design)
                DEGAll_table <- NULL
                if (useTopconfects) {
                    DEGAll_table <- topconfects::edger_confects(DEGAll,
                        fdr = 0.05,
                        coef = ncol(DEGAll$design),
                        step = 0.05
                    )$table
                    rownames(DEGAll_table) <- DEGAll_table$name
                }
                DEGAll <- DEGAll |>
                    edgeR::glmQLFTest(coef = ncol(DEGAll$design)) |>
                    edgeR::topTags(n = Inf, adjust.method = adjust.method) |>
                    as.data.frame() |>
                    rename(c("FDR" = "adj.P.Val")) |>
                    rename(c("PValue" = "P.Value"))
                if (!is.null(DEGAll_table)) {
                    genes <- intersect(
                        rownames(DEGAll),
                        rownames(DEGAll_table)
                    )
                    DEGAll <- cbind(DEGAll[genes, ], DEGAll_table[genes, ])
                    DEGAll <- DEGAll[order(DEGAll$P.Value), ]
                }
            } else {
                # a group with a single sample: no dispersion can be
                # estimated, so the model is fitted with dispersion = 0
                DEGAll <- edgeR::glmFit(d.mont, dispersion = 0)
                DEGAll <- DEGAll |>
                    edgeR::glmLRT(coef = ncol(DEGAll$design)) |>
                    edgeR::topTags(n = Inf, adjust.method = adjust.method) |>
                    as.data.frame() |>
                    rename(c("FDR" = "adj.P.Val")) |>
                    rename(c("PValue" = "P.Value"))
            }
        }

        if (method == "limma") {
            comparison <- paste(unique(group), collapse = "-")
            group <- factor(group)
            design <- stats::model.matrix(~ 0 + group)
            colnames(design) <- levels(group)
            contrast.matrix <- limma::makeContrasts(
                contrasts = comparison,
                levels = design
            )
            # Fit the group contrast first, so that both topconfects and
            # topTable rank the between-group difference. Coefficient 1 of
            # the no-intercept design would be the mean of one group only.
            fit <- limma::voom(d.mont, design = design, plot = FALSE) |>
                limma::lmFit(design) |>
                limma::contrasts.fit(contrast.matrix)
            DEGAll_table <- NULL
            if (useTopconfects) {
                DEGAll_table <- topconfects::limma_confects(fit,
                    coef = 1,
                    fdr = 0.05
                )$table
                rownames(DEGAll_table) <- DEGAll_table$name
            }
            DEGAll <- fit |>
                limma::eBayes() |>
                limma::topTable(number = Inf, adjust.method = adjust.method)
            if (!is.null(DEGAll_table)) {
                genes <- intersect(rownames(DEGAll), rownames(DEGAll_table))
                DEGAll <- cbind(DEGAll[genes, ], DEGAll_table[genes, ])
                DEGAll <- DEGAll[order(DEGAll$P.Value), ]
            }
        }

        if (method == "dearseq") {
            # Code the groups 1, 2, ... in order of first appearance.
            # match() also works for factors and for labels that look
            # like numbers.
            groupLabels <- as.character(group)
            conditions <- matrix(
                as.numeric(match(groupLabels, unique(groupLabels))),
                ncol = 1
            )
            dearseqTest <- "asymptotic"
            if (edgeRNorm) {
                count_norm <- edgeR::cpm(d.mont, log = TRUE)
                DEGAll <- dearseq::dear_seq(
                    exprmat = count_norm, variables2test = conditions,
                    which_test = dearseqTest, parallel_comp = FALSE,
                    preprocessed = TRUE,
                    padjust_methods = adjust.method
                )
            } else {
                DEGAll <- dearseq::dear_seq(
                    exprmat = as.matrix(counts), variables2test = conditions,
                    which_test = dearseqTest, parallel_comp = FALSE,
                    preprocessed = FALSE,
                    padjust_methods = adjust.method
                )
            }
            DEGAll <- DEGAll$pvals |>
                rename(c("adjPval" = "adj.P.Val")) |>
                rename(c("rawPval" = "P.Value"))
        }

        if (method == "Wilcoxon") {
            count_norm <- edgeR::cpm(d.mont, log = TRUE)
            groupLabels <- unique(group)
            first <- count_norm[, which(group == groupLabels[1]),
                drop = FALSE
            ]
            second <- count_norm[, which(group == groupLabels[2]),
                drop = FALSE
            ]
            pvalues <- vapply(seq_len(nrow(count_norm)), function(i) {
                stats::wilcox.test(first[i, ], second[i, ])$p.value
            }, numeric(1))
            fdr <- stats::p.adjust(pvalues, method = adjust.method)
            DEGAll <- data.frame(P.Value = pvalues, adj.P.Val = fdr)
            rownames(DEGAll) <- rownames(count_norm)
        }

        if (method == "NOISeq") {
            conditions <- factor(group)
            data <- NOISeq::readData(
                data = counts,
                factors = as.data.frame(conditions)
            )
            res <- NOISeq::noiseqbio(data,
                k = 0.5, norm = "tmm", factor = "conditions",
                random.seed = 12345, filter = 1, cv.cutoff = 100, cpm = 1
            )
            # NOTE(review): the NOISeq "prob" column is copied into both
            # P.Value and adj.P.Val; confirm that downstream code does not
            # treat it as an ordinary p-value.
            DEGAll <- NOISeq::degenes(res, q = 0, M = NULL) |>
                rename(c("prob" = "P.Value"))
            DEGAll$adj.P.Val <- DEGAll$P.Value
        }
    }
    if ("P.Value" %in% colnames(DEGAll)) {
        DEGAll <- DEGAll[!is.na(DEGAll[, "P.Value"]), , drop = FALSE]
    }

    return(DEGAll)
}
362 |
--------------------------------------------------------------------------------
/R/SNP.R:
--------------------------------------------------------------------------------
#' Do difference analysis of SNP data
#'
#' Each SNP (row) is tested with Fisher's exact test on the 2 x 2 table of
#' wild-type / mutated sample counts in the first two groups of
#' \code{sampleGroup}.
#'
#' @param snpDf data.frame of SNP data, each column is a sample,
#' and each row is a SNP. \code{NA} is treated as wild type and any other
#' value as a mutation. The part of the row name before the first "_"
#' is used as the gene name.
#' @param sampleGroup vector of sample group, one element per column of
#' \code{snpDf}. Samples whose group is \code{NA} are ignored.
#' @param combineMethod Method of combining the
#' pvalue of multiple snp in a gene. Use \code{NULL} to keep one
#' row per SNP.
#' @return data.frame with the columns \code{pvalue} and \code{estimate},
#' plus \code{gene} when \code{combineMethod} is not \code{NULL}.
#' @export
#'
#' @examples
#' \donttest{
#' library(TCGAbiolinks)
#' query <- GDCquery(
#'     project = "TCGA-CHOL",
#'     data.category = "Simple Nucleotide Variation",
#'     access = "open",
#'     legacy = FALSE,
#'     data.type = "Masked Somatic Mutation",
#'     workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
#' )
#' GDCdownload(query)
#' data_snp <- GDCprepare(query)
#' samples <- unique(data_snp$Tumor_Sample_Barcode)
#' sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE)
#' names(sampleGroup) <- samples
#' pvalue <- differential_SNP_tcga(snpData = data_snp,
#'     sampleGroup = sampleGroup)
#' }
#' # use demo data
#' snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10)
#' snpDf <- as.data.frame(snpDf)
#' sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
#' result <- differential_SNP(snpDf, sampleGroup)
differential_SNP <- function(snpDf, sampleGroup, combineMethod = min) {
    snpNames <- rownames(snpDf)
    if (is.null(snpNames)) {
        snpNames <- as.character(seq_len(nrow(snpDf)))
    }
    # TRUE where a sample carries a mutation (any non-NA value)
    isMutated <- !is.na(snpDf)

    if (length(sampleGroup) == ncol(isMutated)) {
        # Drop samples without a group from BOTH objects, so that the
        # positions in sampleGroup keep matching the columns.
        hasGroup <- !is.na(sampleGroup)
        isMutated <- isMutated[, hasGroup, drop = FALSE]
        sampleGroup <- sampleGroup[hasGroup]
    } else {
        # Legacy behaviour for callers whose sampleGroup carries extra NA
        # entries: only the non-NA groups are expected to line up with
        # the columns.
        sampleGroup <- sampleGroup[!is.na(sampleGroup)]
    }

    groups <- names(table(sampleGroup))
    type1 <- which(sampleGroup == groups[1])
    type2 <- which(sampleGroup == groups[2])

    # mutated / wild-type sample counts per SNP and group
    mutated1 <- rowSums(isMutated[, type1, drop = FALSE])
    mutated2 <- rowSums(isMutated[, type2, drop = FALSE])
    wild1 <- length(type1) - mutated1
    wild2 <- length(type2) - mutated2

    nSnp <- nrow(isMutated)
    pvalue <- rep(0, nSnp)
    estimate <- rep(0, nSnp)
    for (i in seq_len(nSnp)) {
        # rows: wild, mutation; columns: group 1, group 2
        counts <- matrix(
            c(wild1[i], mutated1[i], wild2[i], mutated2[i]),
            nrow = 2
        )
        fish <- stats::fisher.test(counts)
        pvalue[i] <- fish$p.value
        estimate[i] <- fish$estimate
    }
    names(pvalue) <- names(estimate) <- sub("_.*", "", snpNames)
    if (!is.null(combineMethod)) {
        pvalue <- stats::aggregate(pvalue,
            by = list(names(pvalue)),
            FUN = combineMethod
        )
        estimate <- stats::aggregate(estimate,
            by = list(names(estimate)), FUN = mean
        )
        return(data.frame(
            gene = pvalue[, 1], pvalue = pvalue[, 2],
            estimate = estimate[, 2]
        ))
    } else {
        return(data.frame(pvalue = pvalue, estimate = estimate))
    }
}
67 |
#' Combine the p-values of a SNP difference analysis per gene
#'
#' @param snpResult data.frame of SNP difference analysis result with the
#' columns \code{pvalue} and \code{estimate}; the row names are SNP ids.
#' @param snp2gene data frame of two columns: snp and gene.
#' @param combineMethod Method of combining the
#' pvalue of multiple snp in a gene. The estimates are always averaged.
#' @return data.frame with the columns \code{gene}, \code{pvalue}
#' and \code{estimate}
#' @export
#' @examples
#' snpResult <- data.frame(pvalue = runif(100), estimate = runif(100))
#' rownames(snpResult) <- paste0("snp", seq_len(100))
#' snp2gene <- data.frame(snp = rownames(snpResult),
#'     gene = rep(paste0("gene", seq_len(20)), 5))
#' result <- combine_pvalue(snpResult, snp2gene)
combine_pvalue <- function(snpResult, snp2gene, combineMethod = min) {
    # lookup table: SNP id -> gene
    snpToGene <- snp2gene[, 2]
    names(snpToGene) <- snp2gene[, 1]
    geneOfSnp <- snpToGene[rownames(snpResult)]

    # label every SNP-level value with its gene
    snpPvalue <- snpResult$pvalue
    snpEstimate <- snpResult$estimate
    names(snpEstimate) <- geneOfSnp
    names(snpPvalue) <- geneOfSnp

    genePvalue <- stats::aggregate(snpPvalue,
        by = list(names(snpPvalue)),
        FUN = combineMethod
    )
    geneEstimate <- stats::aggregate(snpEstimate,
        by = list(names(snpEstimate)),
        FUN = mean
    )
    data.frame(
        gene = genePvalue[, 1],
        pvalue = genePvalue[, 2],
        estimate = geneEstimate[, 2]
    )
}
97 |
#' Do difference analysis of SNP data downloaded from TCGAbiolinks
#'
#' The mutation table is reshaped into a SNP x sample matrix (SNPs are named
#' \code{Hugo_Symbol_Start_Position}), the samples are matched to
#' \code{sampleGroup} by name and the result is passed to
#' \code{differential_SNP()}.
#'
#' @param snpData data.frame of SNP data downloaded from TCGAbiolinks. The
#' columns \code{Hugo_Symbol}, \code{Start_Position},
#' \code{Variant_Classification} and \code{Tumor_Sample_Barcode} are used.
#' @param sampleGroup vector of sample group, named by sample barcode.
#' Samples without a group are ignored.
#' @param combineMethod Method of combining the pvalue of
#' multiple snp in a gene.
#' @return data.frame
#' @export
#'
#' @examples
#' \donttest{
#' library(TCGAbiolinks)
#' query <- GDCquery(
#'     project = "TCGA-CHOL",
#'     data.category = "Simple Nucleotide Variation",
#'     access = "open",
#'     legacy = FALSE,
#'     data.type = "Masked Somatic Mutation",
#'     workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
#' )
#' GDCdownload(query)
#' data_snp <- GDCprepare(query)
#' samples <- unique(data_snp$Tumor_Sample_Barcode)
#' sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE)
#' names(sampleGroup) <- samples
#' pvalue <- differential_SNP_tcga(snpData = data_snp,
#'     sampleGroup = sampleGroup)
#' }
#' # use demo data
#' snpData <- data.frame(
#'     Hugo_Symbol = c("TP53", "TP53", "KRAS", "KRAS"),
#'     Start_Position = c(100, 100, 200, 200),
#'     Variant_Classification = "Missense_Mutation",
#'     Tumor_Sample_Barcode = c("s1", "s2", "s3", "s4")
#' )
#' sampleGroup <- c(s1 = "A", s2 = "A", s3 = "B", s4 = "B")
#' result <- differential_SNP_tcga(snpData, sampleGroup)
differential_SNP_tcga <- function(snpData, sampleGroup, combineMethod = NULL) {
    snpName <- paste(snpData$Hugo_Symbol, snpData$Start_Position, sep = "_")
    barcode <- as.character(snpData$Tumor_Sample_Barcode)
    variant <- as.character(snpData$Variant_Classification)

    # SNP x sample matrix; NA means that no variant was reported
    snps <- sort(unique(snpName))
    samples <- sort(unique(barcode))
    snpMat <- matrix(NA_character_,
        nrow = length(snps), ncol = length(samples),
        dimnames = list(snps, samples)
    )
    # Matrix indexing tolerates repeated (snp, sample) pairs; entries
    # without a barcode or variant are skipped so they cannot erase a call.
    reported <- !is.na(barcode) & !is.na(variant)
    snpMat[cbind(
        match(snpName[reported], snps),
        match(barcode[reported], samples)
    )] <- variant[reported]

    # Align the groups with the sample columns and drop samples that have
    # no group, so positions in sampleGroup match the columns exactly.
    groupOfSample <- sampleGroup[match(samples, names(sampleGroup))]
    hasGroup <- !is.na(groupOfSample)
    snpMat <- snpMat[, hasGroup, drop = FALSE]
    groupOfSample <- groupOfSample[hasGroup]

    differential_SNP(
        snpDf = as.data.frame(snpMat, stringsAsFactors = FALSE),
        sampleGroup = groupOfSample,
        combineMethod = combineMethod
    )
}
153 |
#' Do difference analysis of SNP data downloaded from GEO
#'
#' For every SNP (row) the genotype counts of the first two groups of
#' \code{sampleGroup} are compared. SNPs with fewer than two observed
#' genotypes keep the default p-value of 1.
#'
#' @param snpData data.frame of SNP data downloaded from GEO, each column
#' is a sample and each row is a SNP.
#' @param sampleGroup vector of sample group, one element per column of
#' \code{snpData}. Samples whose group is \code{NA} are ignored.
#' @param method one of "Chisquare", "fisher",
#' and "CATT"(Cochran-Armitage trend test)
#' @return data.frame with the columns \code{pvalue} and \code{estimate}.
#' \code{estimate} is only filled by the "fisher" method for SNPs with
#' exactly two genotypes and is 0 otherwise.
#' @export
#' @examples
#' \donttest{
#' file1 <- read.table("GSE66903_series_matrix.txt.gz",
#'     fill=TRUE, comment.char="!", header = TRUE)
#' rownames(file1) <- file1[, 1]
#' snpData <- file1[, -1]
#' sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE)
#' names(sampleGroup) <- colnames(snpData)
#' snpData <- SNP_QC(snpData)
#' sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE)
#' result1 <- differential_SNP_GEO(snpData = snpData,
#'     sampleGroup = sampleGroup, method = "Chisquare")
#' }
#' # use demo data
#' snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10)
#' snpDf <- as.data.frame(snpDf)
#' sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
#' result <- differential_SNP_GEO(snpDf, sampleGroup, method = "fisher")
differential_SNP_GEO <- function(snpData, sampleGroup, method = "Chisquare") {
    method <- match.arg(method, c("Chisquare", "fisher", "CATT"))
    snpDf <- as.matrix(snpData)

    if (length(sampleGroup) == ncol(snpDf)) {
        # Drop samples without a group from BOTH objects, so that the
        # positions in sampleGroup keep matching the columns.
        hasGroup <- !is.na(sampleGroup)
        snpDf <- snpDf[, hasGroup, drop = FALSE]
        sampleGroup <- sampleGroup[hasGroup]
    } else {
        sampleGroup <- sampleGroup[!is.na(sampleGroup)]
    }

    groups <- names(table(sampleGroup))
    type1 <- which(sampleGroup == groups[1])
    type2 <- which(sampleGroup == groups[2])
    pvalue <- rep(1, nrow(snpDf))
    estimate <- rep(0, nrow(snpDf))
    for (i in seq_len(nrow(snpDf))) {
        geno1 <- snpDf[i, type1]
        geno2 <- snpDf[i, type2]
        # Genotypes seen in the two compared groups. sort() drops NA and
        # gives a reproducible row order, which matters for the trend test.
        types <- sort(unique(c(geno1, geno2)))
        if (length(types) < 2) {
            next
        }
        df <- data.frame(
            type1_freq = as.numeric(table(factor(geno1, levels = types))),
            type2_freq = as.numeric(table(factor(geno2, levels = types)))
        )

        if (method == "fisher") {
            fish <- stats::fisher.test(df)
            pvalue[i] <- fish$p.value
            # an odds ratio is only reported for 2 x 2 tables
            if (nrow(df) == 2) {
                estimate[i] <- fish$estimate
            }
        } else if (method == "Chisquare") {
            pvalue[i] <- stats::chisq.test(df)$p.value
        } else if (nrow(df) > 2) {
            # NOTE(review): the trend test is still limited to SNPs with
            # more than two genotypes, as before; confirm whether
            # CATT::CATT also accepts 2 x 2 tables.
            pvalue[i] <- CATT::CATT(table = t(df))$p.value
        }
    }
    names(pvalue) <- names(estimate) <- rownames(snpDf)

    return(data.frame(pvalue = pvalue, estimate = estimate))
}
219 |
220 |
# Minor allele frequency of one SNP.
# x:    character vector of genotype calls, one per sample; every character
#       of a call is counted as one allele.
# miss: code of a missing call; such calls and NA are ignored.
# Returns the frequency of the rarest observed allele, or 0 when fewer than
# two different alleles are observed (monomorphic SNP or no calls at all).
get_maf <- function(x, miss = "NoCall") {
    x <- as.character(x)
    x <- x[!is.na(x) & x != miss]
    freq <- table(unlist(strsplit(x, split = "")))
    if (length(freq) < 2L) {
        # with a single allele, min(freq) / sum(freq) would be 1
        return(0)
    }
    min(freq) / sum(freq)
}
226 |
# Hardy-Weinberg equilibrium test of one SNP (Pearson chi-square
# goodness-of-fit test of the observed genotype counts against the counts
# expected from the observed allele frequencies).
# x:    character vector of two-character genotype calls ("AA", "AB", ...);
#       "AB" and "BA" are counted as the same genotype.
# miss: code of a missing call; such calls, NA and calls that are not
#       exactly two characters long are ignored.
# Returns the p-value, or 1 when fewer than two alleles are observed
# (equilibrium cannot be assessed, so nothing speaks against it).
get_hwe <- function(x, miss = "NoCall") {
    x <- as.character(x)
    x <- x[!is.na(x) & x != miss & nchar(x) == 2L]
    if (length(x) == 0L) {
        return(1)
    }
    allele1 <- substr(x, 1L, 1L)
    allele2 <- substr(x, 2L, 2L)
    alleleCount <- table(c(allele1, allele2))
    alleles <- names(alleleCount)
    nAllele <- length(alleles)
    if (nAllele < 2L) {
        return(1)
    }
    alleleFreq <- as.numeric(alleleCount) / sum(alleleCount)

    # every unordered allele pair is one genotype class, including the
    # classes that were not observed
    pairs <- which(upper.tri(diag(nAllele), diag = TRUE), arr.ind = TRUE)
    classLabels <- paste0(alleles[pairs[, 1]], alleles[pairs[, 2]])
    heterozygous <- pairs[, 1] != pairs[, 2]
    expected <- length(x) * alleleFreq[pairs[, 1]] * alleleFreq[pairs[, 2]] *
        ifelse(heterozygous, 2, 1)

    # canonical genotype label: alleles in the order of `alleles`
    index1 <- match(allele1, alleles)
    index2 <- match(allele2, alleles)
    genotype <- paste0(
        alleles[pmin(index1, index2)],
        alleles[pmax(index1, index2)]
    )
    observed <- as.numeric(table(factor(genotype, levels = classLabels)))

    statistic <- sum((observed - expected)^2 / expected)
    # classes - 1 - (free allele frequencies) degrees of freedom
    stats::pchisq(statistic,
        df = nAllele * (nAllele - 1) / 2,
        lower.tail = FALSE
    )
}
256 |
#' Do quality control of SNP data
#'
#' Samples and variants are first filtered at a fixed missing call rate of
#' 0.2, then at \code{mind} and \code{geon}, and the remaining variants are
#' filtered by minor allele frequency and Hardy-Weinberg equilibrium.
#'
#' @param snpData data.frame of SNP data, each column is a sample,
#' and each row is a SNP.
#' @param geon filters out all variants with missing call rates
#' exceeding the provided value (default 0.02) to be removed
#' @param mind filters out all samples with missing call rates exceeding
#' the provided value (default 0.02) to be removed
#' @param maf filters out all variants with minor allele frequency below
#' the provided threshold
#' @param hwe filters out all variants which have Hardy-Weinberg
#' equilibrium test p-value below the provided threshold
#' @param miss character of miss value
#' @return data.frame
#' @export
#' @examples
#' # use demo data
#' snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10)
#' snpDf <- as.data.frame(snpDf)
#' sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
#' result <- SNP_QC(snpDf)
SNP_QC <- function(snpData, geon = 0.02, mind = 0.02, maf = 0.05,
                   hwe = 1e-6, miss = "NoCall") {
    snpData_mat <- as.matrix(snpData)

    # TRUE for the rows / columns whose missing call rate is below cutoff
    belowCutoff <- function(mat, byRow, cutoff) {
        isMiss <- !is.na(mat) & mat == miss
        rate <- if (byRow) rowMeans(isMiss) else colMeans(isMiss)
        !is.na(rate) & rate < cutoff
    }

    ## filter by 0.2
    snpData_mat <- snpData_mat[, belowCutoff(snpData_mat, FALSE, 0.2),
        drop = FALSE
    ]
    snpData_mat <- snpData_mat[belowCutoff(snpData_mat, TRUE, 0.2), ,
        drop = FALSE
    ]

    ## filter by cutoff
    snpData_mat <- snpData_mat[, belowCutoff(snpData_mat, FALSE, mind),
        drop = FALSE
    ]
    snpData_mat <- snpData_mat[belowCutoff(snpData_mat, TRUE, geon), ,
        drop = FALSE
    ]

    # Genotype calls of SNP i without missing values. They are removed here
    # so that a custom `miss` code never reaches the allele counting.
    callsOf <- function(i) {
        calls <- snpData_mat[i, ]
        calls[!is.na(calls) & calls != miss]
    }

    ## maf
    MAF <- vapply(seq_len(nrow(snpData_mat)), function(i) {
        get_maf(callsOf(i))
    }, numeric(1))
    snpData_mat <- snpData_mat[which(MAF > maf), , drop = FALSE]

    ## hwe
    HWE <- vapply(seq_len(nrow(snpData_mat)), function(i) {
        get_hwe(callsOf(i))
    }, numeric(1))
    snpData_mat <- snpData_mat[which(HWE > hwe), , drop = FALSE]

    as.data.frame(snpData_mat)
}
317 |
318 |
319 |
--------------------------------------------------------------------------------
/R/TCGA_id_conversion.R:
--------------------------------------------------------------------------------
#' Convert ENSEMBL gene id to gene Symbol in TCGA
#'
#' The version suffix of the ENSEMBL ids (everything after the first ".")
#' is removed before the conversion.
#'
#' @param profiles a data.frame of gene expression data,
#' each column is a sample,
#' and each row is a gene.
#' @param toType one of 'keytypes(org.Hs.eg.db)'
#'
#' @return a matrix of the expression values whose row names are the
#' converted gene ids
#' @export
#'
#' @examples
#' library(org.Hs.eg.db)
#' data(profile)
#' result <- id_conversion_TCGA(profile)
id_conversion_TCGA <- function(profiles, toType = "SYMBOL") {
    # strip the version suffix from the ENSEMBL ids
    unversioned <- gsub("\\..*", "", rownames(profiles))
    rownames(profiles) <- unversioned

    idMap <- clusterProfiler::bitr(
        rownames(profiles),
        fromType = "ENSEMBL",
        toType = toType,
        OrgDb = org.Hs.eg.db::org.Hs.eg.db,
        drop = FALSE
    )

    # keep the first mapping of every ENSEMBL id and index the table by it
    firstHit <- !duplicated(idMap[, 1])
    idMap <- idMap[firstHit, ]
    rownames(idMap) <- idMap[, 1]

    converted <- as.matrix(profiles)
    rownames(converted) <- idMap[rownames(profiles), 2]
    converted
}
28 |
--------------------------------------------------------------------------------
/R/arrayDiff.R:
--------------------------------------------------------------------------------
#' Differential analysis of Microarray data
#'
#' @param df data.frame of the omic data, each column is a sample,
#' and each row is a gene.
#' @param group a vector, group of samples. The first two distinct values
#' define the two groups compared by "ttest" and "wilcox".
#' @param method method to do differential analysis,
#' one of "limma", "ttest", "wilcox".
#' @param adjust.method adjust.method, one of "holm", "hochberg", "hommel",
#' "bonferroni", "BH", "BY", "fdr", and "none".
#' @return data.frame
#' @export
#'
#' @examples
#' \donttest{
#' library(GeoTcgaData)
#' library(data.table)
#' # Use real GEO data as example
#' arrayData <- read.table("GSE54807_series_matrix.txt.gz",
#'     sep = "\t", header = TRUE,
#'     fill=TRUE, comment.char = "!", check.names=FALSE)
#' gpl <- fread("GPL6244-17930.txt", sep = "\t", header = TRUE)
#' gpl <- gpl[, c("ID", "gene_assignment")]
#' class(gpl) <- "data.frame"
#'
#' for (i in seq_len(nrow(gpl))) {
#'     aa <- strsplit(gpl[i, 2], " // ")[[1]][5]
#'     gpl[i, 2] <- as.character(strsplit(aa, " /// ")[[1]][1])
#' }
#' gpl[,1] <- as.character(gpl[,1])
#' arrayData[, 1] <- as.character(arrayData[, 1])
#' rownames(gpl) <- gpl[, 1]
#' arrayData[, 1] <- gpl[arrayData[, 1], 2]
#'
#'
#' arrayData <- repRemove(arrayData," /// ")
#'
#' # Remove rows that do not correspond to genes
#' arrayData <- arrayData[!is.na(arrayData[, 1]), ]
#' arrayData <- arrayData[!arrayData[, 1] == "", ]
#' arrayData <- arrayData[!arrayData[, 1] == "---", ]
#'
#'
#' arrayData <- arrayData[order(arrayData[, 1]), ]
#' arrayData <- gene_ave(arrayData, 1)
#'
#' keep <- apply(arrayData, 1, function(x) sum(x < 1) < (length(x)/2))
#' arrayData <- arrayData[keep, ]
#'
#' group <- c(rep("group1", 12), rep("group2", 12))
#' result <- differential_array(df = arrayData, group = group)
#' }
#' # Use random data as example
#' arrayData <- matrix(runif(200), 25, 8)
#' rownames(arrayData) <- paste0("gene", 1:25)
#' colnames(arrayData) <- paste0("sample", 1:8)
#' group <- c(rep("group1", 4), rep("group2", 4))
#' result <- differential_array(df = arrayData, group = group)
differential_array <- function(df, group, method = "limma",
                               adjust.method = "BH") {
    method <- match.arg(method, c("limma", "ttest", "wilcox"))

    # limma is delegated completely
    if (method == "limma") {
        result <- differential_limma(df, group, adjust.method = adjust.method)
        return(result)
    }

    # sample positions of the first two groups
    labels <- unique(group)
    first <- which(group == labels[1])
    second <- which(group == labels[2])

    # p-value of one gene (row) for the chosen test
    rowPvalue <- if (method == "ttest") {
        function(i) stats::t.test(df[i, first], df[i, second])$p.value
    } else {
        function(i) {
            stats::wilcox.test(
                as.numeric(df[i, first]),
                as.numeric(df[i, second])
            )$p.value
        }
    }

    P.Value <- vapply(seq_len(nrow(df)), rowPvalue, numeric(1))
    adj.P.Val <- stats::p.adjust(P.Value, method = adjust.method)
    result <- data.frame(
        gene = rownames(df),
        P.Value = P.Value, adj.P.Val = adj.P.Val
    )
    return(result)
}
85 |
86 |
#' Get Microarray matrix data from GEO
#'
#' Downloads a GEO series and builds one expression matrix per platform
#' (GPL) of the series. The rows follow the probes of the platform table and
#' are named by gene symbol, the columns are the samples (GSM) measured on
#' that platform.
#'
#' @param gse GSE number, such as GSE781.
#'
#' @return a list of matrix, one element per platform and named by
#' platform id. The element of a platform without samples is \code{NULL}.
#' @export
#'
#' @examples
#' \donttest{
#' arraylist <- get_geo_array("GSE781")
#' }
get_geo_array <- function(gse) {
    gse <- GEOquery::getGEO(gse, GSEMatrix = FALSE, AnnotGPL = TRUE)
    gplList <- GEOquery::GPLList(gse)
    gsmList <- GEOquery::GSMList(gse)

    gselist <- vector("list", length(gplList))
    names(gselist) <- names(gplList)
    for (i in seq_along(gselist)) {
        platform <- names(gselist)[i]
        # samples measured on this platform
        gsmlist <- Filter(function(gsm) {
            platform %in% GEOquery::Meta(gsm)$platform_id
        }, gsmList)
        if (length(gsmlist) == 0) {
            next
        }

        # probe annotation of THIS platform (not always the first one)
        gplTable <- GEOquery::Table(gplList[[i]])
        probesets <- gplTable$ID

        # One numeric column per sample, ordered like the platform probes.
        # Values are converted before cbind() so that factor columns do not
        # end up as integer codes.
        exprMatrix <- do.call("cbind", lapply(gsmlist, function(gsm) {
            tab <- GEOquery::Table(gsm)
            values <- tab$VALUE[match(probesets, tab$ID_REF)]
            as.numeric(as.character(values))
        }))

        # NOTE(review): the symbol column is looked up case-insensitively
        # because its spelling may differ between annotation sources;
        # confirm against the platforms in use.
        symbolCol <- grep("^gene symbol$", colnames(gplTable),
            ignore.case = TRUE, value = TRUE
        )
        if (length(symbolCol) == 0) {
            stop("platform ", platform,
                " has no gene symbol column in its annotation table",
                call. = FALSE
            )
        }
        genes <- gplTable[match(probesets, gplTable[, "ID"]), symbolCol[1]]
        rownames(exprMatrix) <- genes
        colnames(exprMatrix) <- names(gsmlist)
        gselist[[i]] <- exprMatrix
    }
    gselist
}
126 |
#' Preprocess of Microarray data
#'
#' Removes rows without a gene name and rows in which at least half of the
#' values are missing, imputes the remaining missing values, log2-transforms
#' the data when the value distribution looks untransformed, and finally
#' passes the matrix, with the gene names as first column, to
#' \code{repAssign()}.
#'
#' @param x matrix of Microarray data, each column is a sample,
#' and each row is a gene.
#' @param missing_value Method to impute missing expression data,
#' one of "zero" and "knn".
#' @param string a string, sep of the gene
#'
#' @return the result of \code{repAssign()} on the preprocessed data, whose
#' first column holds the gene names
#' @export
#'
#' @examples
#' \donttest{
#' arraylist <- get_geo_array("GSE781")
#' arraylist <- lapply(arraylist, array_preprocess)
#' }
array_preprocess <- function(x, missing_value = "knn", string = " /// ") {
    ## filter (drop = FALSE keeps a matrix when one row or column is left)
    x <- x[!is.na(rownames(x)), , drop = FALSE]
    x <- x[rownames(x) != "", , drop = FALSE]
    naPerRow <- rowSums(is.na(x))
    x <- x[naPerRow < ncol(x) / 2, , drop = FALSE]

    ## impute missing
    if (missing_value == "zero") {
        x[is.na(x)] <- 0
    } else {
        x <- impute::impute.knn(x)$data
    }

    ## log
    qx <- as.numeric(stats::quantile(x,
        c(0., 0.25, 0.5, 0.75, 0.99, 1.0),
        na.rm = TRUE
    ))
    # large values or a wide range are taken as a sign that the data are
    # not yet on the log scale
    LogC <- (qx[5] > 100) || (qx[6] - qx[1] > 50 && qx[2] > 0)
    # isTRUE(): an empty matrix yields NA quantiles and must not crash here
    if (isTRUE(LogC)) {
        x[which(x <= 0)] <- 0.0001
        x <- log2(x)
    }

    ## gene id conversion
    x <- cbind(rownames(x), x)
    x <- repAssign(x, string)
    x
}
171 |
#' cluster probes of Microarray data
#'
#' The probes of every gene are clustered hierarchically (complete linkage)
#' on the distance \code{1 - Pearson correlation}; the probes of one cluster
#' are averaged. A gene with several clusters gets the row names
#' \code{gene_1}, \code{gene_2}, ...
#'
#' @param x matrix or data.frame of Microarray data, the first column is
#' the name of the gene, and the others are the expression value.
#' @param clusterCutoff height at which the hierarchical tree is cut,
#' on the \code{1 - Pearson correlation} scale.
#' @importFrom stats as.dist
#' @importFrom stats cor
#' @importFrom stats cutree
#' @importFrom stats hclust
#' @return data.frame
#' @export
#'
#' @examples
#' \donttest{
#' arraylist <- get_geo_array("GSE781")
#' arraylist <- lapply(arraylist, array_preprocess)
#' arraylist_cluster <- lapply(arraylist, cluster_array)
#' }
cluster_array <- function(x, clusterCutoff = 0.7) {
    genes <- as.character(x[, 1])
    # Numeric expression matrix. A character matrix (gene names bound to
    # the values with cbind) is converted back to numbers.
    expr <- as.matrix(x[, -1, drop = FALSE])
    storage.mode(expr) <- "double"

    uniqueGenes <- unique(genes[!is.na(genes)])
    matlist <- vector("list", length(uniqueGenes))
    for (i in seq_along(uniqueGenes)) {
        gene <- uniqueGenes[i]
        mat <- expr[which(genes == gene), , drop = FALSE]
        if (nrow(mat) == 1) {
            clusterMat <- mat
            rownames(clusterMat) <- gene
        } else {
            probeCorrelation <- cor(t(mat), method = "pearson")
            ClusterResults <- hclust(as.dist(1 - probeCorrelation),
                method = "complete"
            )
            # NOTE(review): the tree is cut at distance clusterCutoff, so
            # probes are merged when their correlation is at least
            # 1 - clusterCutoff; confirm that this is the intended meaning
            # of the threshold.
            Clusters <- cutree(ClusterResults, h = clusterCutoff)
            clusterMat <- do.call("rbind", lapply(
                sort(unique(Clusters)),
                function(id) colMeans(mat[Clusters == id, , drop = FALSE])
            ))
            if (nrow(clusterMat) > 1) {
                rownames(clusterMat) <-
                    paste(gene, seq_len(nrow(clusterMat)), sep = "_")
            } else {
                rownames(clusterMat) <- gene
            }
            colnames(clusterMat) <- colnames(mat)
        }
        matlist[[i]] <- as.data.frame(clusterMat)
    }
    do.call("rbind", matlist)
}
--------------------------------------------------------------------------------
/R/calculate_mean_module.R:
--------------------------------------------------------------------------------
#' Find the mean value of the gene in each module
#'
#' @param geneExpress a data.frame of gene expression data.
#' Each column is a sample, and each row is a gene.
#' @param module a data.frame of two column. The first column is module name,
#' the second column are genes in this module, separated by ",".
#' Genes that are not in \code{geneExpress} are ignored.
#'
#' @return a data.frame, means the mean of gene expression value in
#' the same module. One row per module and one column per sample.
#' @export
#'
#' @examples
#' data(geneExpress)
#' data(module)
#' result <- cal_mean_module(geneExpress, module)
cal_mean_module <- function(geneExpress, module) {
    genes <- rownames(geneExpress)
    # one column per sample, whatever the number of samples is
    output_module <- matrix(0, nrow(module), ncol(geneExpress))
    rownames(output_module) <- as.character(module[, 1])
    for (i in seq_len(nrow(module))) {
        moduleGenes <- unlist(strsplit(as.character(module[i, 2]), ","))
        moduleGenes <- intersect(moduleGenes, genes)
        # drop = FALSE: a single sample or a single gene must stay 2-d
        output_module[i, ] <- colMeans(
            geneExpress[moduleGenes, , drop = FALSE]
        )
    }
    as.data.frame(output_module)
}
28 |
--------------------------------------------------------------------------------
/R/calculate_mean_profile.R:
--------------------------------------------------------------------------------
#' Average the values of same genes in gene expression profile
#'
#' Rows that share a gene name are replaced by one row holding the mean of
#' their non-missing values. The genes keep the order of their first
#' occurrence.
#'
#' @param file_gene_ave a data.frame of gene expression data,
#' each column is a sample, and each row is a gene.
#' @param k a number, indicates which is the gene column.
#'
#' @return a data.frame, the values of same genes in gene expression profile
#' @export
#'
#' @examples
#' aa <- c("MARCH1", "MARC1", "MARCH1", "MARCH1", "MARCH1")
#' bb <- c(2.969058399, 4.722410064, 8.165514853, 8.24243893, 8.60815086)
#' cc <- c(3.969058399, 5.722410064, 7.165514853, 6.24243893, 7.60815086)
#' file_gene_ave <- data.frame(aa = aa, bb = bb, cc = cc)
#' colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983")
#'
#' result <- gene_ave(file_gene_ave, 1)
gene_ave <- function(file_gene_ave, k = 1) {
    # expression values without the gene column
    values <- file_gene_ave[, -k]

    # gene names, taken from the matrix form of the whole table
    geneNames <- as.character(as.matrix(file_gene_ave)[, k])
    geneFactor <- factor(geneNames, levels = unique(geneNames))

    # per gene: sum of the non-missing values / number of non-missing values
    totals <- rowsum(values, geneFactor, reorder = FALSE, na.rm = TRUE)
    observed <- rowsum(1L - is.na(values), geneFactor, reorder = FALSE)
    totals / observed
}
30 |
--------------------------------------------------------------------------------
/R/data.R:
--------------------------------------------------------------------------------
1 |
#' a data.frame of gene expression data
#'
#' Randomly generated expression data
#' used as an example for the functions in this package.
#' The row names are gene symbols and
#' the columns are gene expression values.
#'
#' @format A data.frame with 10779 rows and 2 columns
#'
11 | "geneExpress"
12 |
13 | # ' a matrix for Converting gene symbol to entrez_id or ensembl_gene_id
14 | # '
15 | # ' the hgnc data comes from HGNC website
16 | # ' the columns represent "symbol", "locus_group", "locus_type",
17 | # ' "entrez_id" and "ensembl_gene_id"
18 | # '
19 | # ' @format A matrix with 37647 rows and 5 column
20 | # '
21 | "hgnc"
22 |
23 | #' a matrix of gene expression data in TCGA
24 | #'
25 | #' It is a randomly generated expression data
26 | #' used as an example of functions in this package.
27 | #' the first column represents the gene symbol
28 | #'
29 | #' the other columns represent the expression(FPKM) of genes
30 | #'
31 | #' @format A matrix with 10 rows and 10 columns
32 | #'
33 | "profile"
34 |
35 | #' a matrix of gene expression data in GEO
36 | #'
37 | #' It is a randomly generated expression data
38 | #' used as an example of functions in this package.
39 | #' the first column represents the gene symbol
40 | #'
41 | #' the other columns represent the expression of genes
42 | #'
43 | #' @format A matrix with 32 rows and 20 columns
44 | #'
45 | "ventricle"
46 |
47 | #' a matrix of gene expression data in TCGA
48 | #'
49 | #' It is a randomly generated expression data
50 | #' used as an example of functions in this package.
51 | #' the first column represents the gene symbol
52 | #'
53 | #' the other columns represent the expression(count) of genes
54 | #'
55 | #' @format A matrix with 100 rows and 150 columns
56 | #'
57 | "kegg_liver"
58 |
59 | #' a matrix of gene expression data in GEO
60 | #'
61 | #' the first column represents the gene symbol
62 | #'
63 | #' the other columns represent the expression of genes
64 | #'
65 | #' @format A matrix with 999 rows and 3 columns
66 | #'
67 | "GSE66705_sample2"
68 |
69 | #' a matrix of module name, gene symbols, and the number of gene symbols
70 | #'
71 | #' It is a randomly generated expression data
72 | #' used as an example of functions in this package.
73 | #' @format A matrix with 176 rows and 3 columns
74 | #'
75 | "module"
76 |
77 |
78 | # ' a matrix for Converting gene symbol.
79 | # '
80 | # ' the hgnc data comes from HGNC website
81 | # '
82 | # ' @format A matrix with 43547 rows and 52 column
83 | # '
84 | "hgnc_file"
85 |
86 |
87 | #' a data.frame of gene length and GC content
88 | #'
89 | #' the gene length and GC content data comes from
90 | #' TxDb.Hsapiens.UCSC.hg38.knownGene and
91 | #' BSgenome.Hsapiens.UCSC.hg38
92 | #'
93 | #' @format A data.frame with 27341 rows and 2 columns
94 | #'
95 | "gene_cov"
96 |
--------------------------------------------------------------------------------
/R/fpkm_count_conversion.r:
--------------------------------------------------------------------------------
# Convert the read counts of one sample to TPM.
#   counts: numeric vector of read counts, one entry per gene.
#   effLen: numeric vector of gene lengths aligned with `counts`.
# Each gene's length-normalised rate is divided by the sum of all rates and
# scaled to one million; the arithmetic is carried out in log space.
countToTpm_internal <- function(counts, effLen) {
  log_rate <- log(counts) - log(effLen)
  log_rate_total <- log(sum(exp(log_rate)))
  exp(log_rate - log_rate_total + log(1e6))
}
# Convert the read counts of one sample to FPKM.
#   counts: numeric vector of read counts, one entry per gene.
#   effLen: numeric vector of gene lengths aligned with `counts`.
# Computes counts * 1e9 / (effLen * sum(counts)) in log space.
countToFpkm_internal <- function(counts, effLen) {
  log_library_size <- log(sum(counts))
  exp(log(counts) + log(1e9) - log(effLen) - log_library_size)
}
10 |
# Convert the FPKM values of one sample to TPM.
#   fpkm: numeric vector of FPKM values, one entry per gene.
# Rescales the vector so that it sums to one million (done in log space).
fpkmToTpm_internal <- function(fpkm) {
  log_fpkm_total <- log(sum(fpkm))
  exp(log(fpkm) - log_fpkm_total + log(1e6))
}
14 |
# Scale read counts by the ratio of gene length to effective length.
#   counts: numeric vector of read counts.
#   len:    numeric vector of gene lengths.
#   effLen: numeric vector of effective gene lengths.
countToEffCounts_internal <- function(counts, len, effLen) {
  length_ratio <- len / effLen
  counts * length_ratio
}
18 |
# Estimate read counts from FPKM values.
#   fpkm:   numeric vector of FPKM values, one entry per gene.
#   effLen: numeric vector of gene lengths aligned with `fpkm`.
#   N:      library size (total read count) assumed for the sample.
# FPKM only determines the rate counts / sum(counts); the real count values
# cannot be recovered, so the result depends on the assumed library size N.
fpkmToCount_internal <- function(fpkm, effLen, N = 1e9) {
  # rate = (fpkm * effLen) / 10^9, computed in log space
  rate <- exp(log(fpkm) + log(effLen) - log(1e9))
  # Return the value visibly; the previous trailing assignment made the
  # result invisible.
  rate * N
}
26 |
27 |
28 |
29 |
#' Convert count to FPKM
#'
#' Only genes present both in \code{counts_matrix} and in \code{gene_cov}
#' are kept; every sample (column) is converted independently.
#'
#' @param counts_matrix a matrix, colnames of counts_matrix are sample name,
#' rownames of counts_matrix are gene symbols
#' @param keyType keyType, one of keytypes(org.Hs.eg.db).
#' @param gene_cov data.frame of two column, the first column is gene length,
#' the second column is gene GC content
#'
#' @return a matrix
#' @export
#'
#' @examples
#' data(gene_cov)
#' lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
#' rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
#' colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
#' result <- countToFpkm(lung_squ_count2,
#'     keyType = "SYMBOL",
#'     gene_cov = gene_cov
#' )
countToFpkm <- function(counts_matrix, keyType = "SYMBOL", gene_cov) {
  gene_cov2 <- gene_cov
  if (keyType != "ENTREZID") {
    # The row names of gene_cov are treated as ENTREZID; re-key the table to
    # the identifier type of counts_matrix, keeping the first ENTREZID that
    # maps to each identifier.
    genes_bitr <- clusterProfiler::bitr(rownames(gene_cov),
      fromType = "ENTREZID", toType = keyType,
      OrgDb = org.Hs.eg.db::org.Hs.eg.db, drop = TRUE
    )
    genes_bitr <- genes_bitr[!duplicated(genes_bitr[, 2]), ]
    gene_cov2 <- gene_cov[genes_bitr$ENTREZID, , drop = FALSE]
    rownames(gene_cov2) <- genes_bitr[, 2]
  }

  shared_genes <- intersect(rownames(counts_matrix), rownames(gene_cov2))
  # drop = FALSE keeps the two-dimensional shape when only one sample or one
  # gene is left; otherwise the column loop below has no columns to iterate.
  fpkm_matrix <- counts_matrix[shared_genes, , drop = FALSE]
  genes_length <- as.numeric(gene_cov2[shared_genes, 1])

  for (i in seq_len(ncol(fpkm_matrix))) {
    fpkm_matrix[, i] <- countToFpkm_internal(
      as.numeric(fpkm_matrix[, i]),
      genes_length
    )
  }
  fpkm_matrix
}
73 |
74 |
#' Convert count to Tpm
#'
#' Only genes present both in \code{counts_matrix} and in \code{gene_cov}
#' are kept; every sample (column) is converted independently.
#'
#' @param counts_matrix a matrix, colnames of counts_matrix are sample name,
#' rownames of counts_matrix are gene symbols
#' @param keyType keyType, one of keytypes(org.Hs.eg.db).
#' @param gene_cov data.frame of two column, the first column is gene length,
#' the second column is gene GC content
#'
#' @return a matrix
#' @export
#'
#' @examples
#' data(gene_cov)
#' lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
#' rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
#' colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
#' result <- countToTpm(lung_squ_count2,
#'     keyType = "SYMBOL",
#'     gene_cov = gene_cov
#' )
countToTpm <- function(counts_matrix, keyType = "SYMBOL", gene_cov) {
  gene_cov2 <- gene_cov
  if (keyType != "ENTREZID") {
    # The row names of gene_cov are treated as ENTREZID; re-key the table to
    # the identifier type of counts_matrix, keeping the first ENTREZID that
    # maps to each identifier.
    genes_bitr <- clusterProfiler::bitr(rownames(gene_cov),
      fromType = "ENTREZID", toType = keyType,
      OrgDb = org.Hs.eg.db::org.Hs.eg.db, drop = TRUE
    )
    genes_bitr <- genes_bitr[!duplicated(genes_bitr[, 2]), ]
    gene_cov2 <- gene_cov[genes_bitr$ENTREZID, , drop = FALSE]
    rownames(gene_cov2) <- genes_bitr[, 2]
  }

  shared_genes <- intersect(rownames(counts_matrix), rownames(gene_cov2))
  # drop = FALSE keeps the two-dimensional shape when only one sample or one
  # gene is left; otherwise the column loop below has no columns to iterate.
  tpm_matrix <- counts_matrix[shared_genes, , drop = FALSE]
  genes_length <- as.numeric(gene_cov2[shared_genes, 1])

  for (i in seq_len(ncol(tpm_matrix))) {
    tpm_matrix[, i] <- countToTpm_internal(
      as.numeric(tpm_matrix[, i]),
      genes_length
    )
  }
  tpm_matrix
}
118 |
119 |
#' Convert fpkm to Tpm
#'
#' Every sample (column) is rescaled independently so that its values sum to
#' one million.
#'
#' @param fpkm_matrix a matrix, colnames of fpkm_matrix are sample name,
#' rownames of fpkm_matrix are genes
#'
#' @return a matrix
#' @export
#'
#' @examples
#' lung_squ_count2 <- matrix(c(0.11, 0.22, 0.43, 0.14, 0.875,
#'     0.66, 0.77, 0.18, 0.29), ncol = 3)
#' rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
#' colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
#' result <- fpkmToTpm(lung_squ_count2)
fpkmToTpm <- function(fpkm_matrix) {
  # Fill a copy column by column instead of using apply(): apply() collapses
  # a one-gene matrix to a plain vector, whereas this keeps the dimensions
  # and dimnames of the input.
  tpm_matrix <- as.matrix(fpkm_matrix)
  for (i in seq_len(ncol(tpm_matrix))) {
    tpm_matrix[, i] <- fpkmToTpm_internal(tpm_matrix[, i])
  }
  # Returned visibly; the previous trailing assignment hid the result.
  tpm_matrix
}
137 |
--------------------------------------------------------------------------------
/R/metap.R:
--------------------------------------------------------------------------------
# copy from metap package: https://CRAN.R-project.org/package=metap
#
# Combine p-values with the weighted sum-of-z method.
#   p:         p-values (or their logs when log.input = TRUE).
#   weights:   optional weights, one per p-value; all 1 when omitted.
#   data:      optional environment/data.frame in which p and weights are
#              evaluated; defaults to the caller's frame.
#   subset:    optional row index applied to the assembled (p, weights) table.
#   na.action: function applied to that table to handle missing values.
#   log.p:     passed to pnorm(), i.e. return the combined p on the log scale.
#   log.input: treat `p` as log p-values.
# Returns a list of class c("sumz", "metap") with elements z, p, validp and
# weights. Fewer than two valid p-values give NA for z and p plus a warning.
#' @importFrom stats na.fail
#' @importFrom stats pchisq
#' @importFrom stats pnorm
#' @importFrom stats qnorm
sumz <- function(p, weights = NULL, data = NULL, subset = NULL,
                 na.action = na.fail, log.p = FALSE, log.input = FALSE) {
  if (is.null(data)) {
    data <- sys.frame(sys.parent())
  }
  # Rewrite the call into data.frame(<remaining arguments>) and evaluate it in
  # `data`, so that p and weights may be given as names found in `data`.
  mf <- match.call()
  mf$data <- NULL
  mf$subset <- NULL
  mf$na.action <- NULL
  mf[[1]] <- as.name("data.frame")
  mf <- eval(mf, data)
  if (!is.null(subset)) {
    # drop = FALSE: without weights the table has a single column, and plain
    # row indexing would turn it into a vector, breaking `mf$p` below.
    mf <- mf[subset, , drop = FALSE]
  }
  mf <- na.action(mf)
  p <- as.numeric(mf$p)
  weights <- mf$weights
  noweights <- is.null(weights)
  if (noweights) {
    weights <- rep(1, length(p))
  }
  if (length(p) != length(weights)) {
    warning("Length of p and weights differ")
  }
  # Valid inputs: strictly inside (0, 1), or negative when given as logs.
  if (log.input) {
    keep <- p < 0
  } else {
    keep <- (p > 0) & (p < 1)
  }
  n_valid <- sum(1L * keep)
  if (n_valid < 2) {
    warning("Must have at least two valid p values")
    res <- list(
      z = NA_real_, p = NA_real_,
      validp = p[keep], weights = weights
    )
  } else {
    if (n_valid != length(p)) {
      warning("Some studies omitted")
      omitw <- weights[!keep]
      if ((sum(1L * omitw) > 0) && !noweights) {
        warning("Weights omitted too")
      }
    }
    # z = sum(w_i * z_i) / sqrt(sum(w_i^2)); %*% yields a 1 x 1 matrix.
    zp <- (qnorm(p[keep], lower.tail = FALSE, log.p = log.input) %*%
      weights[keep]) / sqrt(sum(weights[keep]^2))
    res <- list(
      z = zp,
      p = pnorm(zp, lower.tail = FALSE, log.p = log.p),
      validp = p[keep], weights = weights
    )
  }
  class(res) <- c("sumz", "metap")
  res
}
48 |
# copy from metap package: https://CRAN.R-project.org/package=metap
# Print method for "sumz" objects: writes the combined z statistic and its
# p-value on one line and returns the object invisibly.
print.sumz <- function(x, ...) {
  z_stat <- x$z
  p_value <- x$p
  cat("sumz = ", z_stat, "p = ", p_value, "\n")
  invisible(x)
}
54 |
# copy from metap package: https://CRAN.R-project.org/package=metap
# Combine p-values through the sum of their logarithms.
#   p:     numeric vector of p-values; only values in (0, 1] are used.
#   log.p: passed to pchisq(), i.e. return the combined p on the log scale.
# The statistic is -2 * sum(log(p)), referred to a chi-squared distribution
# with 2 * (number of valid p-values) degrees of freedom.
# Returns a list of class c("sumlog", "metap") with elements chisq, df, p and
# validp. Fewer than two valid p-values give NA results plus a warning.
sumlog <- function(p, log.p = FALSE) {
  keep <- (p > 0) & (p <= 1)
  valid_p <- p[keep]
  if (sum(1L * keep) < 2) {
    warning("Must have at least two valid p values")
    res <- list(
      chisq = NA_real_, df = NA_integer_,
      p = NA_real_, validp = valid_p
    )
  } else {
    log_p <- log(valid_p)
    statistic <- (-2) * sum(log_p)
    deg_free <- 2 * length(log_p)
    if (length(log_p) != length(p)) {
      warning("Some studies omitted")
    }
    combined_p <- pchisq(statistic, deg_free,
      lower.tail = FALSE,
      log.p = log.p
    )
    res <- list(
      chisq = statistic, df = deg_free,
      p = combined_p, validp = valid_p
    )
  }
  class(res) <- c("sumlog", "metap")
  res
}
77 |
# copy from metap package: https://CRAN.R-project.org/package=metap
# Print method for "sumlog" objects: writes the chi-squared statistic, its
# degrees of freedom and the combined p-value on one line, then returns the
# object invisibly.
print.sumlog <- function(x, ...) {
  statistic <- x$chisq
  deg_free <- x$df
  p_value <- x$p
  cat("chisq = ", statistic, " with df = ", deg_free, " p = ", p_value, "\n")
  invisible(x)
}
83 |
84 |
85 |
--------------------------------------------------------------------------------
/R/prepareChi.r:
--------------------------------------------------------------------------------
#' Prepare file for chi-square test
#'
#' For every gene, counts how many normal and tumor samples carry a copy
#' number alteration. A value is counted as altered when it differs from 2
#' by more than 0.5. The sample group is read from the first character of the
#' fourth "-"-separated field of the column name: "1" is counted as normal,
#' "0" as tumor; other samples and missing values are not counted.
#'
#' @param cnv result of ann_merge()
#'
#' @return a matrix with one row per gene and the columns normalCNV,
#' normalWild, tumorCNV and tumorWild
#' @export
#'
#' @examples
#' cnv <- matrix(c(
#'     -1.09150, -1.47120, -0.87050, -0.50880,
#'     -0.50880, 2.0, 2.0, 2.0, 2.0, 2.0, 2.601962, 2.621332, 2.621332,
#'     2.621332, 2.621332, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
#'     2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0
#' ), nrow = 5)
#' cnv <- as.data.frame(cnv)
#' rownames(cnv) <- c("AJAP1", "FHAD1", "CLCNKB", "CROCCP2", "AL137798.3")
#' colnames(cnv) <- c(
#'     "TCGA-DD-A4NS-10A-01D-A30U-01", "TCGA-ED-A82E-01A-11D-A34Y-01",
#'     "TCGA-WQ-A9G7-01A-11D-A36W-01", "TCGA-DD-AADN-01A-11D-A40Q-01",
#'     "TCGA-ZS-A9CD-10A-01D-A36Z-01", "TCGA-DD-A1EB-11A-11D-A12Y-01"
#' )
#' cnv_chi_file <- prepare_chi(cnv)
prepare_chi <- function(cnv) {
  # First character of the 4th barcode field, e.g. "10A" -> "1".
  barcode_fields <- strsplit(colnames(cnv), "-")
  sample_code <- vapply(
    barcode_fields,
    function(fields) substring(fields[4], 1, 1),
    character(1)
  )
  # %in% never yields NA, so columns with malformed names are simply skipped.
  is_normal <- sample_code %in% "1"
  is_tumor <- sample_code %in% "0"

  # Convert column by column so data.frame and matrix input behave alike.
  columns <- lapply(
    seq_len(ncol(cnv)),
    function(j) as.numeric(cnv[, j])
  )
  values <- matrix(as.numeric(unlist(columns)),
    nrow = nrow(cnv), ncol = ncol(cnv)
  )
  altered <- abs(values - 2) > 0.5

  cnv_chi <- matrix(0, nrow(cnv), 4)
  rownames(cnv_chi) <- rownames(cnv)
  colnames(cnv_chi) <- c("normalCNV", "normalWild", "tumorCNV", "tumorWild")
  # na.rm = TRUE: a missing value is neither altered nor wild type, instead of
  # aborting the whole computation.
  normal_altered <- altered[, is_normal, drop = FALSE]
  tumor_altered <- altered[, is_tumor, drop = FALSE]
  cnv_chi[, "normalCNV"] <- rowSums(normal_altered, na.rm = TRUE)
  cnv_chi[, "normalWild"] <- rowSums(!normal_altered, na.rm = TRUE)
  cnv_chi[, "tumorCNV"] <- rowSums(tumor_altered, na.rm = TRUE)
  cnv_chi[, "tumorWild"] <- rowSums(!tumor_altered, na.rm = TRUE)
  cnv_chi
}
64 |
65 |
--------------------------------------------------------------------------------
/R/rep.R:
--------------------------------------------------------------------------------
#' Handle the case where one id corresponds to multiple genes
#'
#' Every row whose first column lists several genes is repeated once per
#' gene, so each gene receives the values of that row. The value columns are
#' converted to numeric.
#'
#' @param input_file input file, a data.frame or a matrix,
#' the first column should be genes.
#' @param string a string, sep of the gene
#'
#' @return a data.frame, when an id corresponds to multiple genes,
#' the expression value is assigned to each gene
#' @export
#'
#' @examples
#' aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3",
#'     "MARCH3 /// MARCH4", "MARCH1")
#' bb <- c("2.969058399", "4.722410064", "8.165514853",
#'     "8.24243893", "8.60815086")
#' cc <- c("3.969058399", "5.722410064", "7.165514853",
#'     "6.24243893", "7.60815086")
#' input_file <- data.frame(aa = aa, bb = bb, cc = cc)
#'
#' repAssign_result <- repAssign(input_file, " /// ")
#'
repAssign <- function(input_file, string) {
  name <- colnames(input_file)[1]
  # as.character() allows a factor gene column to be split as well.
  genelist <- strsplit(as.character(input_file[, 1]), string)
  geneLength <- lengths(genelist)

  # Repeat each input row once per gene it lists. Indexing rows with
  # drop = FALSE keeps a two-dimensional result even for a single value
  # column or a single output row.
  row_index <- rep(seq_len(nrow(input_file)), times = geneLength)
  values <- input_file[row_index, -1, drop = FALSE]
  output <- matrix(as.numeric(as.matrix(values)),
    nrow = length(row_index), ncol = ncol(values),
    dimnames = list(NULL, colnames(values))
  )

  genes <- as.character(unlist(genelist))
  output2 <- data.frame(genes, output, check.names = FALSE)
  colnames(output2)[1] <- name
  output2
}
34 |
#' Handle the case where one id corresponds to multiple genes
#'
#' Rows whose first column contains the separator (i.e. list several genes)
#' are removed; all other rows are returned unchanged.
#'
#' @param input_file input file, a data.frame or a matrix,
#' the first column should be genes.
#' @param string a string,sep of the gene
#'
#' @return a data.frame, when an id corresponds to multiple genes,
#' the expression value is deleted
#' @export
#'
#' @examples
#' aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3",
#'     "MARCH3 /// MARCH4", "MARCH1")
#' bb <- c("2.969058399", "4.722410064", "8.165514853",
#'     "8.24243893", "8.60815086")
#' cc <- c("3.969058399", "5.722410064", "7.165514853",
#'     "6.24243893", "7.60815086")
#' input_file <- data.frame(aa = aa, bb = bb, cc = cc)
#' repRemove_result <- repRemove(input_file, " /// ")
repRemove <- function(input_file, string) {
  multi_gene_rows <- grep(string, input_file[, 1])
  if (length(multi_gene_rows) > 0) {
    # drop = FALSE keeps the table shape when a single row remains (a matrix
    # would otherwise be reduced to a plain vector).
    input_file <- input_file[-multi_gene_rows, , drop = FALSE]
  }
  input_file
}
59 |
--------------------------------------------------------------------------------
/R/sysdata.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/R/sysdata.rda
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 |
6 |
7 | ```{r, include = FALSE}
8 | knitr::opts_chunk$set(
9 | collapse = TRUE,
10 | comment = "#>",
11 | fig.path = "man/figures/",
12 | out.width = "100%"
13 | )
14 | ```
15 |
16 | # GeoTcgaData
17 |
18 | The goal of GeoTcgaData is to deal with RNA-seq, DNA Methylation, single nucleotide Variation and Copy number variation data in GEO and TCGA.
19 |
20 | ## :writing_hand: Authors
21 | Erqiang Hu
22 |
23 | Department of Bioinformatics, School of Basic Medical Sciences, Southern Medical University.
24 |
25 |
26 | ## :arrow\_double\_down: Installation
27 |
28 |
29 | ```{r eval=FALSE}
30 | if(!requireNamespace("devtools", quietly = TRUE))
31 | install.packages("devtools")
32 | devtools::install_github("YuLab-SMU/GeoTcgaData")
33 | ```
34 |
35 | ```{r}
36 | library(GeoTcgaData)
37 | ```
38 | GEO and TCGA provide us with a wealth of data, such as RNA-seq, DNA Methylation, single nucleotide Variation and Copy number variation data. It's easy to download data from TCGA using the gdc tool or `TCGAbiolinks`, and some software provides organized TCGA data, such as [UCSC Xena](http://xena.ucsc.edu/) , [UCSCXenaTools](https://cran.r-project.org/package=UCSCXenaTools),and [sangerbox](http://vip.sangerbox.com/), but processing these data into a format suitable for bioinformatics analysis requires more work. This R package was developed to handle these data.
39 |
40 | ## Example
41 |
42 | This is a basic example which shows you how to solve a common problem:
43 |
44 | ## RNA-seq data differential expression analysis
45 | It is convenient to use [`TCGAbiolinks`](http://www.bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/analysis.html) or [`GDCRNATools`](https://bioconductor.org/packages/GDCRNATools/) to download and analyze gene expression data. `TCGAbiolinks` uses the `edgeR` package to do differential expression analysis, while `GDCRNATools` implements the three most commonly used methods (limma, edgeR, and DESeq2) to identify differentially expressed genes (DEGs).
46 |
47 | Alicia Oshlack et. al. claimed that unlike the chip data, the RNA-seq data had one [bias](https://pubmed.ncbi.nlm.nih.gov/20132535/): the larger the transcript length / mean read count , the more likely it was to be identified as a differential gene, [while there was no such trend in the chip data](https://pubmed.ncbi.nlm.nih.gov/19371405/).
48 |
49 |
50 | However, when we use their chip data for difference analysis( using the limma package), we find that chip data has the same trend as RNA-seq data. And we also found this trend in the difference analysis results given by the data [authors](https://genome.cshlp.org/content/18/9/1509.long).
51 |
52 |
53 |
54 |
55 | It is worth noting that [only technical replicate data, which has small gene dispersions, shows this bias](https://pubmed.ncbi.nlm.nih.gov/28545404/). This is because in technical replicate RNA-seq data a long gene has more reads mapping to it compared to a short gene of similar expression, and most of the statistical methods used to detect differential expression have stronger detection ability for genes with more reads. However, we have not deduced why there is such a bias in the current difference analysis algorithms.
56 |
57 | Some software, such as [CQN](http://www.bioconductor.org/packages/cqn/), presents a [normalization algorithm](https://pubmed.ncbi.nlm.nih.gov/22285995/) to correct systematic biases (gene length bias and [GC-content bias](https://pubmed.ncbi.nlm.nih.gov/22177264/)). But they did not provide sufficient evidence to prove that the correction is effective. We use the [Marioni dataset](https://pubmed.ncbi.nlm.nih.gov/19371405/) to verify the correction effect of CQN and find that there is still a deviation after correction:
58 |
59 |
60 |
61 | [GOseq](http://bioconductor.org/packages/goseq/) based on [Wallenius' noncentral hypergeometric distribution](https://en.wikipedia.org/wiki/Wallenius%27_noncentral_hypergeometric_distribution) can effectively correct the gene length deviation in enrichment analysis. However, the current RNA-seq data often have no gene length bias, but only the expression amount(read count) bias, GOseq may overcorrect these data, correcting originally unbiased data into reverse bias.
62 |
63 |
64 | GOseq also fails to correct for expression bias, therefore, read count bias correction is still a challenge for us.
65 |
66 |
67 | use `TCGAbiolinks` to download TCGA data
68 |
69 | ```{r eval=FALSE}
70 | # download RNA-seq data
71 | library(TCGAbiolinks)
72 |
73 | query <- GDCquery(project = "TCGA-ACC",
74 | data.category = "Transcriptome Profiling",
75 | data.type = "Gene Expression Quantification",
76 | workflow.type = "STAR - Counts")
77 |
78 | GDCdownload(query, method = "api", files.per.chunk = 3,
79 | directory = Your_Path)
80 |
81 | dataRNA <- GDCprepare(query = query, directory = Your_Path,
82 | save = TRUE, save.filename = "dataRNA.RData")
83 | ## get raw count matrix
84 | dataPrep <- TCGAanalyze_Preprocessing(object = dataRNA,
85 | cor.cut = 0.6,
86 | datatype = "STAR - Counts")
87 |
88 | ```
89 |
90 | Use `differential_RNA` to do difference analysis. We provide the data of human gene length and GC content in `gene_cov`.
91 |
92 | ```{r eval=FALSE}
93 | group <- sample(c("grp1", "grp2"), ncol(dataPrep), replace = TRUE)
94 | library(cqn) # To avoid reporting errors: there is no function "rq"
95 | ## get gene length and GC content
96 | library(org.Hs.eg.db)
97 | genes_bitr <- bitr(rownames(gene_cov), fromType = "ENTREZID", toType = "ENSEMBL",
98 | OrgDb = org.Hs.eg.db, drop = TRUE)
99 | genes_bitr <- genes_bitr[!duplicated(genes_bitr[,2]), ]
100 | gene_cov2 <- gene_cov[genes_bitr$ENTREZID, ]
101 | rownames(gene_cov2) <- genes_bitr$ENSEMBL
102 | genes <- intersect(rownames(dataPrep), rownames(gene_cov2))
103 | dataPrep <- dataPrep[genes, ]
104 | geneLength <- gene_cov2[genes, "length"]
105 | gccontent <- gene_cov2[genes, "GC"]
106 | names(geneLength) <- names(gccontent) <- genes
107 | ## Difference analysis
108 | DEGAll <- differential_RNA(counts = dataPrep, group = group,
109 | geneLength = geneLength, gccontent = gccontent)
110 | ```
111 |
112 | Use `clusterProfiler` to do enrichment analytics:
113 |
114 | ```{r eval=FALSE}
115 | diffGenes <- DEGAll$logFC
116 | names(diffGenes) <- rownames(DEGAll)
117 | diffGenes <- sort(diffGenes, decreasing = TRUE)
118 | library(clusterProfiler)
119 | library(enrichplot)
120 | library(org.Hs.eg.db)
121 | gsego <- gseGO(gene = diffGenes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL")
122 | dotplot(gsego)
123 | ```
124 |
125 |
126 |
127 | ## DNA Methylation data integration
128 | use `TCGAbiolinks` to download TCGA data.
129 |
130 | The codes may need to be modified if `TCGAbiolinks` updates. So please read its [documents](https://www.bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html).
131 |
132 | ```{r eval=FALSE}
133 | library(TCGAbiolinks)
134 | query <- GDCquery(project = "TCGA-ACC",
135 | data.category = "DNA Methylation",
136 | data.type = "Methylation Beta Value",
137 | platform = "Illumina Human Methylation 450")
138 | GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
139 | ```
140 |
141 | The function `Merge_methy_tcga` could Merge methylation data downloaded from TCGA official website or TCGAbiolinks. This makes it easier to extract differentially methylated genes in the downstream analysis. For example:
142 |
143 | ```{r eval=FALSE}
144 | merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data)
145 | ```
146 | Then use differential_methy() to do difference analysis.
147 |
148 | ```{r eval=FALSE}
149 | # if (!requireNamespace("ChAMP", quietly = TRUE))
150 | # BiocManager::install("ChAMP")
151 | library(ChAMP) # To avoid reporting errors
152 | differential_gene <- differential_methy(cpgData = merge_result, sampleGroup = sample(c("C","T"),
153 | ncol(merge_result[[1]]), replace = TRUE))
154 | ```
155 |
156 | **Note:** `ChAMP`has a large number of dependent packages. If you cannot install it successfully, you can download each dependent package separately(Source or Binary) and install it locally.
157 |
158 |
159 |
160 | If your methylation data was downloaded from [UCSC Xena](http://xena.ucsc.edu/), you can use `differential_methy` with `ucscData = TRUE` to get differential genes.
161 |
162 | ```{r eval=FALSE}
163 | methy_file <- "TCGA.THCA.sampleMap_HumanMethylation450.gz"
164 | methy <- fread(methy_file, sep = "\t", header = T)
165 | library(ChAMP)
166 | myImport <- champ.import(directory=system.file("extdata",package="ChAMPdata"))
167 | myfilter <- champ.filter(beta=myImport$beta,pd=myImport$pd,detP=myImport$detP,beadcount=myImport$beadcount)
168 | cpg_gene <- hm450.manifest.hg19[, c("probeID", "gene_HGNC")]
169 | ## or use IlluminaHumanMethylation450kanno.ilmn12.hg19 to get annotation data
170 | # library(IlluminaHumanMethylation450kanno.ilmn12.hg19)
171 | # ann <- getAnnotation(IlluminaHumanMethylation450kanno.ilmn12.hg19)
172 | # class(ann) <- "data.frame"
173 | # cpg_gene <- ann[,c("Name", "UCSC_RefGene_Name", "UCSC_RefGene_Group")]
174 |
175 | methy_df <- differential_methy(methy, cpg_gene, ucscData = TRUE)
176 | ```
177 |
178 | We provide two models to get differentially methylated genes:
179 |
180 | if model = "cpg", step1: calculate difference cpgs; step2: calculate difference genes;
181 |
182 | if model = "gene", step1: calculate the methylation level of genes; step2: calculate difference genes.
183 |
184 | We find that only model = "gene" has no deviation of CpG number.
185 |
186 |
187 | Use `clusterProfiler` to do enrichment analytics:
188 |
189 | ```{r eval=FALSE}
190 | differential_gene$p.adj <- p.adjust(differential_gene$pvalue)
191 | genes <- differential_gene[differential_gene$p.adj < 0.05, "gene"]
192 | library(clusterProfiler)
193 | library(enrichplot)
194 | library(org.Hs.eg.db)
195 | ego <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
196 | dotplot(ego)
197 | ```
198 |
199 |
200 |
201 | ## Copy number variation data integration and differential gene extraction
202 |
203 | use TCGAbiolinks to download TCGA data(Gene Level Copy Number Scores)
204 |
205 | ```{r eval=FALSE}
206 | library(TCGAbiolinks)
207 | query <- GDCquery(project = "TCGA-LGG",
208 | data.category = "Copy Number Variation",
209 | data.type = "Gene Level Copy Number Scores")
210 |
211 | GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
212 |
213 | data <- GDCprepare(query = query,
214 | directory = Your_Path)
215 | ```
216 |
217 |
218 |
219 | Do difference analysis of gene level copy number variation data using `differential_CNV`
220 |
221 | ```{r eval=FALSE}
222 | class(data) <- "data.frame"
223 | cnvData <- data[, -c(1,2,3)]
224 | rownames(cnvData) <- data[, 1]
225 | sampleGroup = sample(c("A","B"), ncol(cnvData), replace = TRUE)
226 | diffCnv <- differential_CNV(cnvData, sampleGroup)
227 | ```
228 |
229 | Use `clusterProfiler` to do enrichment analytics:
230 |
231 | ```{r eval=FALSE}
232 | pvalues <- diffCnv$pvalue * sign(diffCnv$odds)
233 | genes <- rownames(diffCnv)[diffCnv$pvalue < 0.05]
234 | library(clusterProfiler)
235 | library(enrichplot)
236 | library(org.Hs.eg.db)
237 | ego <- enrichGO(gene = genes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL")
238 | dotplot(ego)
239 | ```
240 |
241 |
242 |
243 | ## Difference analysis of single nucleotide Variation data
244 |
245 | Use TCGAbiolinks to download TCGA data
246 |
247 | ```{r eval=FALSE}
248 | library(TCGAbiolinks)
249 | query <- GDCquery(project = "TCGA-ACC",
250 | data.category = "Simple Nucleotide Variation",
251 | data.type = "Masked Somatic Mutation",
252 | workflow.type = "MuSE Variant Aggregation and Masking")
253 |
254 | GDCdownload(query, method = "api", files.per.chunk = 5, directory = Your_Path)
255 |
256 | data_snp <- GDCprepare(query = query,
257 | directory = Your_Path)
258 |
259 | ```
260 |
261 |
262 |
263 | Use `differential_SNP_tcga` to do difference analysis
264 |
265 | ```{r eval=FALSE}
266 | samples <- unique(data_snp$Tumor_Sample_Barcode)
267 | sampleType <- sample(c("A","B"), length(samples), replace = TRUE)
268 | names(sampleType) <- samples
269 | pvalue <- differential_SNP_tcga(snpData = data_snp, sampleType = sampleType)
270 | # merge pvalue
271 |
272 |
273 | ```
274 |
275 |
276 |
277 | Use `clusterProfiler` to do enrichment analysis
278 |
279 | ```{r eval=FALSE}
280 | pvalue2 <- sort(pvalue, decreasing = TRUE)
281 | library(clusterProfiler)
282 | library(enrichplot)
283 | library(org.Hs.eg.db)
284 | gsego <- gseGO(pvalue2, OrgDb = org.Hs.eg.db, keyType = "SYMBOL")
285 | dotplot(gsego)
286 | ```
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 | ## GEO chip data processing
295 |
296 | The function `gene_ave` could average the expression data of different ids for the same gene in the GEO chip data. For example:
297 |
298 | ```{r eval=FALSE}
299 | aa <- c("MARCH1","MARC1","MARCH1","MARCH1","MARCH1")
300 | bb <- c(2.969058399,4.722410064,8.165514853,8.24243893,8.60815086)
301 | cc <- c(3.969058399,5.722410064,7.165514853,6.24243893,7.60815086)
302 | file_gene_ave <- data.frame(aa=aa,bb=bb,cc=cc)
303 | colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983")
304 | result <- gene_ave(file_gene_ave, 1)
305 | ```
306 |
307 | Multiple genes symbols may correspond to a same chip id. The result of function `repAssign` is to assign the expression of this id to each gene, and function `repRemove` deletes the expression. For example:
308 |
309 | ```{r}
310 | aa <- c("MARCH1 /// MMA","MARC1","MARCH2 /// MARCH3","MARCH3 /// MARCH4","MARCH1")
311 | bb <- c("2.969058399","4.722410064","8.165514853","8.24243893","8.60815086")
312 | cc <- c("3.969058399","5.722410064","7.165514853","6.24243893","7.60815086")
313 | input_file <- data.frame(aa=aa,bb=bb,cc=cc)
314 |
315 | repAssign_result <- repAssign(input_file," /// ")
316 | repRemove_result <- repRemove(input_file," /// ")
317 | ```
318 |
319 | ## Other downstream analyses
320 | 1. The function `id_conversion_TCGA` converts ENSEMBL gene ids to gene symbols in TCGA data. For example:
321 |
322 | ```{r}
323 | data(profile)
324 | result <- id_conversion_TCGA(profile)
325 | ```
326 |
327 | The parameter profile is a data.frame or matrix of gene expression data in TCGA.
328 |
329 | **Note:** In previous versions(< 1.0.0) the `id_conversion` and `id_conversion` used HGNC data to convert human gene id. In future versions, we will use `clusterProfiler::bitr` for ID conversion.
330 |
331 | ```{r}
332 | library(clusterProfiler)
333 | library(org.Hs.eg.db)
334 | bitr(c("A2ML1", "A2ML1-AS1", "A4GALT", "A12M1", "AAAS"), fromType = "SYMBOL",
335 | toType = "ENSEMBL", OrgDb = org.Hs.eg.db, drop = FALSE)
336 | ```
337 |
338 |
339 |
340 | 2. The function `countToFpkm` and `countToTpm` could convert count data to FPKM or TPM data.
341 |
342 | ```{r}
343 | data(gene_cov)
344 | lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
345 | rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
346 | colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
347 | result <- countToFpkm(lung_squ_count2,
348 | keyType = "SYMBOL",
349 | gene_cov = gene_cov
350 | )
351 | result
352 | ```
353 |
354 | ```{r}
355 | lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
356 | rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
357 | colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
358 | result <- countToTpm(lung_squ_count2,
359 | keyType = "SYMBOL",
360 | gene_cov = gene_cov
361 | )
362 | result
363 | ```
364 |
365 | **Note:** Now the combined clinical data can be downloaded directly from [TCGAbiolinks](http://www.bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/clinical.html).
366 |
367 | ```{r eval=FALSE}
368 | library(TCGAbiolinks)
369 | ## get BCR Biotab data
370 | query <- GDCquery(project = "TCGA-ACC",
371 | data.category = "Clinical",
372 | data.type = "Clinical Supplement",
373 | data.format = "BCR Biotab")
374 | GDCdownload(query)
375 | clinical.BCRtab.all <- GDCprepare(query)
376 | names(clinical.BCRtab.all)
377 |
378 | ## get indexed data
379 | clinical <- GDCquery_clinic(project = "TCGA-ACC", type = "clinical")
380 | ```
381 |
--------------------------------------------------------------------------------
/data/GSE66705_sample2.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/GSE66705_sample2.rda
--------------------------------------------------------------------------------
/data/geneExpress.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/geneExpress.rda
--------------------------------------------------------------------------------
/data/gene_cov.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/gene_cov.rda
--------------------------------------------------------------------------------
/data/kegg_liver.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/kegg_liver.rda
--------------------------------------------------------------------------------
/data/module.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/module.rda
--------------------------------------------------------------------------------
/data/profile.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/profile.rda
--------------------------------------------------------------------------------
/data/ventricle.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuLab-SMU/GeoTcgaData/544df0696c75c5a8bb378a8154d41a5269d7d38e/data/ventricle.rda
--------------------------------------------------------------------------------
/docs/news/index.html:
--------------------------------------------------------------------------------
1 |
2 |
Changelog • GeoTcgaData
6 |
7 |
8 |
9 |
41 |
42 |
46 |
47 |
48 |
49 |
- fix return value of
differential_array
(2022_10_8, Sat)
50 | - fix gene length bug in
countToTpm()
and countToFpkm()
(2022_9_22, Tue)
51 | - fix a bug in
id_conversion
(2022-8-27, Sat)
52 |
53 |
54 |
55 |
- fix a bug in
differential_RNA(useTopconfects = TRUE)
(2022-8-12, Fri)
56 |
57 |
58 |
- add function
methydifferential_ucsc
and methydifferential_limma
(2021-10-24, Sun)
59 |
60 |
61 |
- update hgnc_file data(2021-10-24, Sun)
62 |
63 |
64 |
- add function
differential_RNA
to do difference analysis of RNA-seq data(2021-7-20, Tue)
65 |
66 |
67 |
- add data hgnc_file
68 | - update function: id_ava()
69 |
70 |
71 |
72 |
- add functions: ann_merge(), countToFpkm(), countToTpm()
73 |
74 |
75 |
78 |
79 |
80 |
81 |
82 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/docs/reference/index.html:
--------------------------------------------------------------------------------
1 |
2 | Function reference • GeoTcgaData
6 |
7 |
8 |
9 |
41 |
42 |
45 |
46 |
47 | All functions
48 |
49 | |
50 |
---|
51 | differential_array()
52 | |
53 | differential_array |
54 |
55 | cal_mean_module()
56 | |
57 | Find the mean value of the gene in each module |
58 |
59 | countToFpkm()
60 | |
61 | Convert count to FPKM |
62 |
63 | countToTpm()
64 | |
65 | Convert count to TPM |
66 |
67 | differential_cnv()
68 | |
69 | Do chi-square test to find differential genes |
70 |
71 | differential_CNV()
72 | |
73 | Do difference analysis of gene level copy number variation data |
74 |
75 | differential_limma()
76 | |
77 | differential_limma |
78 |
79 | differential_RNA()
80 | |
81 | Do difference analysis of RNA-seq data |
82 |
83 | differential_RNA_ucsc()
84 | |
85 | Do difference analysis of RNA-seq data downloaded from ucsc |
86 |
87 | differential_SNP()
88 | |
89 | Do difference analysis of SNP data |
90 |
91 | differential_SNP_tcga()
92 | |
93 | Do difference analysis of SNP data downloaded from TCGAbiolinks |
94 |
95 | fpkmToTpm_matrix()
96 | |
97 | Convert FPKM to TPM |
98 |
99 | geneExpress
100 | |
101 | a data.frame of gene expression data |
102 |
103 | gene_ave()
104 | |
105 | Average the values of same genes in gene expression profile |
106 |
107 | gene_cov
108 | |
109 | a data.frame of gene length and GC content |
110 |
111 | GSE66705_sample2
112 | |
113 | a matrix of gene expression data in GEO |
114 |
115 | id_ava()
116 | |
117 | Gene id conversion types |
118 |
119 | id_conversion()
120 | |
121 | Convert ENSEMBL gene id to gene Symbol in TCGA |
122 |
123 | id_conversion()
124 | |
125 | Gene id conversion |
126 |
127 | kegg_liver
128 | |
129 | a matrix of gene expression data in TCGA |
130 |
131 | Merge_methy_tcga()
132 | |
133 | Merge methylation data downloaded from TCGA |
134 |
135 | methyDiff()
136 | |
137 | Get methylation difference gene |
138 |
139 | methydifferential_ucsc()
140 | |
141 | Do difference analysis of methylation data downloaded from ucsc |
142 |
143 | module
144 | |
145 | a matrix of module name, gene symbols, and the number of gene symbols |
146 |
147 | prepare_chi()
148 | |
149 | Prepare file for chi-square test |
150 |
151 | profile
152 | |
153 | a matrix of gene expression data in TCGA |
154 |
155 | repAssign()
156 | |
157 | Handle the case where one id corresponds to multiple genes |
158 |
159 | repRemove()
160 | |
161 | Handle the case where one id corresponds to multiple genes |
162 |
163 | tcga_cli_deal()
164 | |
165 | Combine clinical information obtained from TCGA and extract survival data |
166 |
167 | ventricle
168 | |
169 | a matrix of gene expression data in GEO |
170 |
171 |
172 |
175 |
176 |
177 |
178 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
--------------------------------------------------------------------------------
/docs/sitemap.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | /404.html
5 |
6 |
7 | /articles/GeoTcgaData.html
8 |
9 |
10 | /articles/index.html
11 |
12 |
13 | /authors.html
14 |
15 |
16 | /CONDUCT.html
17 |
18 |
19 | /CONTRIBUTING.html
20 |
21 |
22 | /index.html
23 |
24 |
25 | /LICENSE-text.html
26 |
27 |
28 | /news/index.html
29 |
30 |
31 | /reference/differential_array.html
32 |
33 |
34 | /reference/cal_mean_module.html
35 |
36 |
37 | /reference/classify_sample.html
38 |
39 |
40 | /reference/countToFpkm.html
41 |
42 |
43 | /reference/countToTpm.html
44 |
45 |
46 | /reference/differential_cnv.html
47 |
48 |
49 | /reference/differential_CNV.html
50 |
51 |
52 | /reference/differential_gene.html
53 |
54 |
55 | /reference/differential_limma.html
56 |
57 |
58 | /reference/differential_RNA.html
59 |
60 |
61 | /reference/differential_RNA_ucsc.html
62 |
63 |
64 | /reference/differential_SNP.html
65 |
66 |
67 | /reference/differential_SNP_tcga.html
68 |
69 |
70 | /reference/fpkmToTpm_matrix.html
71 |
72 |
73 | /reference/geneExpress.html
74 |
75 |
76 | /reference/gene_ave.html
77 |
78 |
79 | /reference/gene_cov.html
80 |
81 |
82 | /reference/GSE66705_sample2.html
83 |
84 |
85 | /reference/id_ava.html
86 |
87 |
88 | /reference/id_conversion.html
89 |
90 |
91 | /reference/id_conversion.html
92 |
93 |
94 | /reference/index.html
95 |
96 |
97 | /reference/kegg_liver.html
98 |
99 |
100 | /reference/Merge_methy_tcga.html
101 |
102 |
103 | /reference/methyDiff.html
104 |
105 |
106 | /reference/methydifferential_ucsc.html
107 |
108 |
109 | /reference/module.html
110 |
111 |
112 | /reference/prepare_chi.html
113 |
114 |
115 | /reference/profile.html
116 |
117 |
118 | /reference/repAssign.html
119 |
120 |
121 | /reference/repRemove.html
122 |
123 |
124 | /reference/tcga_cli_deal.html
125 |
126 |
127 | /reference/ventricle.html
128 |
129 |
130 |
--------------------------------------------------------------------------------
/inst/extdata/build_data.R:
--------------------------------------------------------------------------------
## Rebuild the package's internal data (R/sysdata.rda).
##
## `hgnc_file` is regenerated from the HGNC complete-set table; the other
## internal objects are carried over unchanged from the installed package.
## `usethis::use_data(internal = TRUE)` writes a single R/sysdata.rda that
## contains only the objects passed to it, so every internal object that
## must survive has to be listed in the call at the bottom.

# Location of the downloaded HGNC table; adjust for the local machine.
hgnc_path <- "E:\\GeoTcgaData_work\\hgnc_complete_set.txt"

# data.table = FALSE makes fread() return a plain data.frame directly,
# instead of overwriting the class attribute of a data.table afterwards.
hgnc_file <- data.table::fread(
  hgnc_path,
  sep = "\t",
  header = TRUE,
  data.table = FALSE
)

# Drop columns that are not kept in the internal copy.
hgnc_file <- dplyr::select(
  hgnc_file,
  -c("alias_symbol", "alias_name", "prev_symbol", "lsdb", "agr")
)

# Objects that are not rebuilt here are taken from the installed package.
# The freshly built `hgnc_file` is deliberately NOT replaced by
# `GeoTcgaData:::hgnc_file`; doing so would discard the work above.
gene_loc_len <- GeoTcgaData:::gene_loc_len
hgnc <- GeoTcgaData:::hgnc
genePos <- GeoTcgaData:::genePos

# NOTE(review): `genePos` is included so that overwriting sysdata.rda does
# not drop it; confirm it is still meant to be an internal object.
usethis::use_data(
  hgnc_file, hgnc, gene_loc_len, genePos,
  internal = TRUE, compress = "xz", overwrite = TRUE
)
13 |
14 |
15 | ## gene_cov
16 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
17 | library(BSgenome.Hsapiens.UCSC.hg38)
18 | hg38_TxDb <- TxDb.Hsapiens.UCSC.hg38.knownGene
19 | hg38 <- BSgenome.Hsapiens.UCSC.hg38
#' Compute exonic length and GC content for every gene
#'
#' @param TxDB A transcript database passed to `genes()` and `exons()`.
#' @param BSGENOME A genome sequence object passed to `BSgenome::getSeq()`.
#' @return A data.frame with one row per gene (row names are the names of
#'   the `genes()` result) and two columns: `length`, the summed width of
#'   the gene's reduced exons, and `GC`, the G/C count divided by that
#'   length.
calc_gene_cov <- function(TxDB, BSGENOME) {
  gene_ranges <- genes(TxDB, single.strand.genes.only = FALSE)
  exon_ranges <- exons(x = TxDB)

  # Label every exon with the id of the gene it overlaps. An exon that
  # overlaps several genes appears once per overlapping gene.
  hits <- findOverlaps(exon_ranges, gene_ranges)
  exon_ranges <- exon_ranges[queryHits(hits)]
  mcols(exon_ranges)$gene_id <- mcols(gene_ranges[subjectHits(hits)])$gene_id

  # Group exons by gene, then reduce each group so that overlapping exons
  # are merged before widths are summed.
  exons_by_gene <- reduce(split(exon_ranges, mcols(exon_ranges)$gene_id))

  # Returns c(total exonic width, GC fraction) for one gene's exons.
  length_and_gc <- function(gene_exons) {
    total_width <- sum(width(gene_exons))
    exon_seqs <- BSgenome::getSeq(BSGENOME, gene_exons)
    gc_count <- sum(Biostrings::letterFrequency(exon_seqs, "GC"))
    c(total_width, gc_count / total_width)
  }

  per_gene <- lapply(exons_by_gene, length_and_gc)
  # Put the results in the same order as the genes() output.
  per_gene <- per_gene[names(gene_ranges)]

  # Stack the length-2 vectors into a genes x 2 matrix.
  cov_matrix <- do.call(rbind, per_gene)
  dimnames(cov_matrix) <- list(names(gene_ranges), c("length", "GC"))
  as.data.frame(cov_matrix)
}

gene_cov <- calc_gene_cov(TxDB = hg38_TxDb, BSGENOME = hg38)
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/inst/extdata/cnv/00de3/HONGS_p_TCGAb3_75_76_77_NSP_G.txt:
--------------------------------------------------------------------------------
1 | GDC_Aliquot Chromosome Start End Num_Probes Segment_Mean
2 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 1 3301765 55785707 28262 0.0033
3 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 1 55792696 55827469 30 0.5462
4 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 1 55829348 87233893 20119 -0.0012
5 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 1 87237285 87239700 8 -1.1169
6 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 1 87243121 247650984 81057 0.0018
7 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 480597 145900182 76793 0.0034
8 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 145907622 145908689 2 -2.1771
9 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 145908860 193722432 27110 0.0069
10 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 193724639 193733648 7 -1.0391
11 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 193737212 230912395 21841 0.0103
12 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 230912836 230913055 2 -1.5502
13 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 2 230913581 241537572 6229 0.0076
14 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 2170634 170801846 93068 0.0017
15 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 170804047 170833713 18 0.6177
16 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 170834686 181217584 5282 -0.0004
17 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 181220309 181225348 7 -0.8341
18 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 181239123 186849637 2727 0.002
19 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 186850494 186859338 15 0.5654
20 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 3 186860325 197812401 5891 0.0045
21 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 1059384 5815967 2403 0.0006
22 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 5817427 5817557 2 -1.9893
23 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 5817654 29171694 15280 0.001
24 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 29179931 29180476 2 -1.5391
25 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 29180581 52137599 10803 0.0037
26 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 52138506 52139008 2 -1.7386
27 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 52141824 110472535 32956 0.0009
28 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 110475372 110475792 2 -1.7461
29 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 4 110477052 187842528 41865 -0.0004
30 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 5 913983 95230802 50545 0.0016
31 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 5 95236087 95237226 3 -1.9218
32 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 5 95237881 180934240 50395 0.0013
33 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 6 1011760 4574891 2512 -0.0041
34 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 6 4575676 4581489 4 -1.5749
35 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 6 4581494 31204878 16832 0.0056
36 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 6 31208262 31208825 2 1.6481
37 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 6 31210138 170596889 77652 0.0046
38 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 7 664936 40174597 23004 -0.001
39 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 7 40175592 40219954 28 0.5835
40 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 7 40223821 158592540 58784 0.001
41 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 8 667625 144182542 82073 0.0016
42 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 9 789794 138044505 68363 -0.0004
43 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 10 366509 30944103 21620 0.0034
44 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 10 30957229 30978973 27 -0.7166
45 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 10 30979829 133411599 59493 0.0011
46 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 11 456012 26339913 16706 -0.003
47 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 11 26342749 26345172 2 -1.7145
48 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 11 26345232 123812391 53610 0.0006
49 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 11 123812566 123812839 2 -1.7801
50 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 11 123816297 134272740 7131 0.0028
51 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 12 780472 132605822 74211 0.002
52 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 13 18874255 114226675 56874 0.0028
53 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 14 20033191 27398580 3634 0.003
54 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 14 27399274 27507409 51 -0.8739
55 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 14 27507968 38840075 6666 -0.0016
56 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 14 38841818 38863301 12 -1.178
57 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 14 38865374 105533894 39335 0.001
58 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 15 23437561 101344124 44708 0
59 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 16 603333 53175083 18211 0.0032
60 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 16 53177717 53177940 2 -2.6236
61 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 16 53178250 89317317 22941 0.0007
62 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 17 1074619 38769230 14904 -0.0016
63 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 17 38769928 38772885 2 -1.8733
64 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 17 38772947 82959812 22329 0.0021
65 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 18 326691 5947124 2994 -0.0029
66 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 18 5947875 5948061 2 -1.9811
67 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 18 5948488 79349796 39875 0.0014
68 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 19 283868 58370362 24118 0.0013
69 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 20 472817 63588502 37516 -0.0009
70 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 21 13974127 46262057 20561 -0.0025
71 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 22 16934932 31381621 5794 -0.0031
72 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 22 31382595 31384583 2 -1.4397
73 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 22 31388932 48940621 11270 -0.0013
74 | 8dd570dd-2a44-4d3d-bcf2-d9c5c21d00f9 X 3236359 155677414 63397 0.0049
75 |
--------------------------------------------------------------------------------
/inst/extdata/cnv/00e8/MICHE_p_TCGAb_428_429_NS.txt:
--------------------------------------------------------------------------------
1 | GDC_Aliquot Chromosome Start End Num_Probes Segment_Mean
2 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 1 3301765 247650984 129760 0.0004
3 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 480597 9808346 5787 0.0102
4 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 9809447 9810242 3 -1.5053
5 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 9813820 70029123 35550 0.0044
6 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 70031855 70032363 3 -0.9245
7 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 70036199 76542146 3636 0.0025
8 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 76552672 76572000 7 -0.9465
9 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 76580922 221887086 75389 0.0053
10 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 221889381 221894838 4 -1.11
11 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 2 221895559 241537572 11830 0.0033
12 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 2170634 10517554 4836 0.0046
13 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 10519258 10520849 2 -1.5264
14 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 10523441 65061928 31955 0.0024
15 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 65061958 65062362 2 -1.3193
16 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 65066087 65446355 334 -0.0061
17 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 65446789 65446808 2 -1.1371
18 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 65448972 80884445 9563 -0.0025
19 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 80886020 80887429 3 -1.267
20 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 3 80890988 197812401 60481 0.0033
21 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 1059384 36534634 21978 0.0017
22 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 36538381 36538411 2 -1.44
23 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 36544419 39205698 1886 -0.0025
24 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 39208967 39210064 2 -1.2461
25 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 39210390 57634986 8283 0.0016
26 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 57635142 57635246 3 -0.9563
27 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 57639758 60483928 1644 0.0017
28 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 60485425 60486464 2 -1.3006
29 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 60486811 111043051 28100 0.001
30 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 111043145 111043496 4 -0.8891
31 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 4 111044539 187842528 41595 0.0019
32 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 913983 17462220 10901 -0.002
33 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 17747964 17805789 62 -1.0687
34 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 17810843 135486593 62140 0.0009
35 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 135489795 135489993 2 -2.9358
36 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 135492523 173001547 23795 0.0018
37 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 173003677 173010638 5 -1.0247
38 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 5 173011703 180934240 4204 0.0039
39 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 6 1011760 145256409 82006 0.0011
40 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 6 145256510 145257320 2 -1.2561
41 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 6 145259422 148392567 2017 0.0148
42 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 6 148393488 148401973 2 -1.4044
43 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 6 148402609 170596889 13141 0.0051
44 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 664936 3762998 1146 -0.0044
45 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 3766579 3766898 2 -1.2036
46 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 3767918 7516194 1521 0.0115
47 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 7518409 7518634 2 -1.2823
48 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 7522447 75626698 35601 0.0044
49 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 75630440 75631353 3 -1.1212
50 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 75631392 102171122 13386 0.0017
51 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 102175051 102175356 2 -1.405
52 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 102176035 126079078 12799 0.0035
53 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 126079396 126079402 2 -1.2042
54 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 7 126081764 158592540 17468 0.0054
55 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 8 667625 144182542 82217 0.0013
56 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 9 789794 138044505 68474 0.0012
57 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 10 366509 17480443 12572 -0.0088
58 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 10 17480684 17480700 2 -1.3838
59 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 10 17481042 133411599 68725 0.0002
60 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 11 456012 134272740 77584 0.0027
61 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 12 780472 132605822 74347 0.0021
62 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 13 18874255 32367536 8277 -0.0004
63 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 13 32367551 32371795 2 -1.3959
64 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 13 32373678 68558328 19791 -0.0026
65 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 13 68565286 68565738 7 -0.6895
66 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 13 68568076 114226675 28917 0.0019
67 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 14 20033191 103895954 49346 -0.0001
68 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 14 103900849 103901391 2 -1.2198
69 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 14 103902012 105533894 433 -0.005
70 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 15 23437561 101344124 44793 0.0021
71 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 16 603333 20586231 9228 0.0011
72 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 16 20595007 20604353 6 0.791
73 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 16 20622023 83760390 29282 0.0026
74 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 16 83760392 83760401 2 -1.6412
75 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 16 83760419 89317317 2698 0.0049
76 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 17 1074619 82959812 37293 0.0006
77 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 18 326691 66976937 36698 0.0002
78 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 18 66978926 67010720 16 0.5568
79 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 18 67014755 79349796 6252 0.0046
80 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 19 283868 58370362 24154 0.0016
81 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 472817 14289450 8860 0.001
82 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 14291481 14296040 2 -1.1918
83 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 14299785 31449137 8201 0.0001
84 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 31452707 31461069 6 -0.6285
85 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 31465102 39405820 4137 0.0048
86 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 39410348 39413403 3 -1.1142
87 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 39417485 46267140 4547 0.0047
88 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 46269102 46269251 2 -1.921
89 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 20 46270491 63588502 11803 0.0052
90 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 21 13974127 24213071 6276 -0.0001
91 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 21 24215936 24216026 3 -1.3743
92 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 21 24216643 46262057 14320 0.0035
93 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f 22 16934932 48940621 17094 0.0001
94 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f X 3236359 13853883 3996 -0.0026
95 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f X 13854003 13854349 3 -3.0405
96 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f X 13856180 79841790 26054 0.01
97 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f X 79842702 79844341 2 -5.3043
98 | 8b50b805-0ecd-46f5-b986-aa9d1ba0831f X 79849126 155677414 33730 0.0124
99 |
--------------------------------------------------------------------------------
/inst/extdata/methy/0a0b4/jhu-usc.e.H.4.lvl-3.TCGA-13-1405-01A-01D-0460-05.g.txt:
--------------------------------------------------------------------------------
1 | Composite Element REF Beta_value Chromosome Start End Gene_Symbol Gene_Type Transcript_ID Position_to_TSS CGI_Coordinate Feature_Type
2 | cg00000292 0.869906238572323 chr16 28878779 28878780 ATP2A1;ATP2A1;ATP2A1;ATP2A1;ATP2A1 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000357084.6;ENST00000395503.7;ENST00000536376.4;ENST00000562185.4;ENST00000563975.1 373;290;-1275;-465;-83 CGI:chr16:28879633-28880547 N_Shore
3 | cg00002426 0.116136770722113 chr3 57757816 57757817 SLMAP;SLMAP;SLMAP;SLMAP;SLMAP;SLMAP protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000295951.6;ENST00000295952.6;ENST00000383718.6;ENST00000428312.4;ENST00000449503.5;ENST00000467901.1 1585;368;261;257;257;514 CGI:chr3:57756198-57757263 S_Shore
4 | cg00003994 0.406591482378321 chr7 15686237 15686238 MEOX2 protein_coding ENST00000262041.5 576 CGI:chr7:16399497-16399700 .
5 | cg00005847 0.2148836012527 chr2 176164345 176164346 AC009336.19;HOXD3;HOXD3;HOXD3;RP11-387A1.5 protein_coding;protein_coding;protein_coding;protein_coding;antisense ENST00000468418.4;ENST00000249440.4;ENST00000410016.4;ENST00000432796.2;ENST00000608941.1 13259;267;3453;27387;1372 CGI:chr2:176164685-176165509 N_Shore
6 | cg00006414 0.2148836012527 chr7 149125745 149125746 RN7SL521P;ZNF398;ZNF425;ZNF425 misc_RNA;protein_coding;protein_coding;protein_coding ENST00000488398.3;ENST00000426851.5;ENST00000378061.5;ENST00000483014.1 242;-672;602;562 CGI:chr7:149126122-149127136 N_Shore
7 | cg00007981 0.0111896937323695 chr11 94129428 94129429 PANX1;PANX1 protein_coding;protein_coding ENST00000227638.6;ENST00000436171.2 499;498 CGI:chr11:94128394-94129607 Island
8 | cg00008493 0.994159645759505 chr14 93347431 93347432 COX8C;UNC79 protein_coding;protein_coding ENST00000342144.2;ENST00000256339.7 239;14211 CGI:chr14:93347137-93347765 Island
9 | cg00008713 0.00826646235457973 chr18 11980954 11980955 IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000269159.6;ENST00000383376.8;ENST00000588752.4;ENST00000588927.4;ENST00000589238.4;ENST00000590107.4;ENST00000590138.1;ENST00000625802.1 -475;-563;-72;-717;-84;-512;-658;-475 CGI:chr18:11980484-11982143 Island
10 | cg00009407 0.0108723860129016 chr14 88824577 88824578 TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000338104.9;ENST00000345383.8;ENST00000346301.7;ENST00000354441.9;ENST00000358622.8;ENST00000380656.5;ENST00000536576.4;ENST00000553718.1;ENST00000554686.4;ENST00000555057.4;ENST00000556077.4;ENST00000556567.4;ENST00000556651.4;ENST00000614125.3;ENST00000622513.3 -80;423;-80;-84;423;-86;423;-18;-163;-96;423;-75;-109;423;-48 CGI:chr14:88824574-88825011 Island
11 | cg00010193 0.71090039937439 chr4 1151428 1151429 AC092535.3;TMED11P antisense;unitary_pseudogene ENST00000417557.1;ENST00000479478.4 55;2299 CGI:chr4:1144210-1146033 .
12 | cg00011459 0.90483721870109 chr16 8796568 8796569 PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;TMEM186;TMEM186 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000268261.7;ENST00000562318.4;ENST00000562448.1;ENST00000564030.4;ENST00000564069.1;ENST00000565221.4;ENST00000565896.4;ENST00000566196.4;ENST00000566540.4;ENST00000566604.4;ENST00000566983.4;ENST00000568602.4;ENST00000569958.4;ENST00000570076.4;ENST00000570134.4;ENST00000333050.6;ENST00000564869.1 -1250;-1275;-1275;-1254;-1345;-1300;-1275;-1272;-1275;-1270;7744;-1275;-1300;-1283;-1271;1081;1075 CGI:chr16:8797465-8798071 N_Shore
13 | cg00012199 0.013190421994616 chr14 20682865 20682866 ANG;RNASE4 protein_coding;protein_coding ENST00000336811.9;ENST00000555835.2 -1313;-1236 CGI:chr14:20682759-20683065 Island
14 | cg00012386 0.00562937717912839 chr1 227734811 227734812 JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000366758.6;ENST00000438896.2;ENST00000465251.1;ENST00000480590.1;ENST00000485807.1;ENST00000615711.3;ENST00000620518.3;ENST00000315781.8;ENST00000366759.7;ENST00000366760.4;ENST00000418653.4;ENST00000426344.4;ENST00000470038.1;ENST00000475930.1;ENST00000491439.1;ENST00000617596.3 601;601;-439;266;-433;601;473;-186;-186;6153;-621;-626;-1363;6192;6103;-643 CGI:chr1:227734942-227735730 N_Shore
15 | cg00012792 0.0130914072814042 chr6 8064260 8064261 BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000244777.5;ENST00000397457.5;ENST00000543936.4;ENST00000627748.1;ENST00000439343.2;ENST00000397456.2 134;155;155;137;105;38271 CGI:chr6:8063813-8064532 Island
--------------------------------------------------------------------------------
/inst/extdata/methy/0a0b4/logs/file1.parcel:
--------------------------------------------------------------------------------
1 | cintervaltree.intervaltree
2 | IntervalTree
3 | p0
4 | ((lp1
5 | cintervaltree.interval
6 | Interval
7 | p2
8 | (I0
9 | I1048576
10 | (dp3
11 | S'md5sum'
12 | p4
13 | S'007d09d1333971eb947569f6a4ca3b36'
14 | p5
15 | stp6
16 | Rp7
17 | ag2
18 | (I1048576
19 | I2097152
20 | (dp8
21 | g4
22 | S'9dd013a350555afe41f63985987d2182'
23 | p9
24 | stp10
25 | Rp11
26 | ag2
27 | (I2097152
28 | I3145728
29 | (dp12
30 | g4
31 | S'2d7aa1a88fbb9e6c47364461aac67ad4'
32 | p13
33 | stp14
34 | Rp15
35 | ag2
36 | (I3145728
37 | I4194304
38 | (dp16
39 | g4
40 | S'2ab2df63fdb543e5a103cf3e6701e579'
41 | p17
42 | stp18
43 | Rp19
44 | ag2
45 | (I4194304
46 | I5242880
47 | (dp20
48 | g4
49 | S'267a2c4eb24b1b3ec2776291c75e9cc4'
50 | p21
51 | stp22
52 | Rp23
53 | ag2
54 | (I5242880
55 | I6291456
56 | (dp24
57 | g4
58 | S'cce6ce104a1cedca0335d622c43a4b3b'
59 | p25
60 | stp26
61 | Rp27
62 | ag2
63 | (I6291456
64 | I7340032
65 | (dp28
66 | g4
67 | S'849ececa93632641aca3615754a8bac8'
68 | p29
69 | stp30
70 | Rp31
71 | ag2
72 | (I7340032
73 | I8388608
74 | (dp32
75 | g4
76 | S'9296760f45ccc5105c12a4ca81dc4b0d'
77 | p33
78 | stp34
79 | Rp35
80 | ag2
81 | (I8388608
82 | I9437184
83 | (dp36
84 | g4
85 | S'103e09b9b48151c4a7049220ae2ac54f'
86 | p37
87 | stp38
88 | Rp39
89 | ag2
90 | (I9437184
91 | I9956275
92 | (dp40
93 | g4
94 | S'860d32161e9b8a1cff47d8126124bee0'
95 | p41
96 | stp42
97 | Rp43
98 | atp44
99 | Rp45
100 | .
--------------------------------------------------------------------------------
/inst/extdata/methy/0a6b/jhu-usc.e.H.10.lvl-3.TCGA-30-1880-01A-01D-0652-05.g.txt:
--------------------------------------------------------------------------------
1 | Composite Element REF Beta_value Chromosome Start End Gene_Symbol Gene_Type Transcript_ID Position_to_TSS CGI_Coordinate Feature_Type
2 | cg00000292 0.796523939932179 chr16 28878779 28878780 ATP2A1;ATP2A1;ATP2A1;ATP2A1;ATP2A1 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000357084.6;ENST00000395503.7;ENST00000536376.4;ENST00000562185.4;ENST00000563975.1 373;290;-1275;-465;-83 CGI:chr16:28879633-28880547 N_Shore
3 | cg00002426 0.0770646801597857 chr3 57757816 57757817 SLMAP;SLMAP;SLMAP;SLMAP;SLMAP;SLMAP protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000295951.6;ENST00000295952.6;ENST00000383718.6;ENST00000428312.4;ENST00000449503.5;ENST00000467901.1 1585;368;261;257;257;514 CGI:chr3:57756198-57757263 S_Shore
4 | cg00003994 0.0832823123959163 chr7 15686237 15686238 MEOX2 protein_coding ENST00000262041.5 576 CGI:chr7:16399497-16399700 .
5 | cg00005847 0.504193627239696 chr2 176164345 176164346 AC009336.19;HOXD3;HOXD3;HOXD3;RP11-387A1.5 protein_coding;protein_coding;protein_coding;protein_coding;antisense ENST00000468418.4;ENST00000249440.4;ENST00000410016.4;ENST00000432796.2;ENST00000608941.1 13259;267;3453;27387;1372 CGI:chr2:176164685-176165509 N_Shore
6 | cg00006414 0.2148836012527 chr7 149125745 149125746 RN7SL521P;ZNF398;ZNF425;ZNF425 misc_RNA;protein_coding;protein_coding;protein_coding ENST00000488398.3;ENST00000426851.5;ENST00000378061.5;ENST00000483014.1 242;-672;602;562 CGI:chr7:149126122-149127136 N_Shore
7 | cg00007981 0.0228410194656724 chr11 94129428 94129429 PANX1;PANX1 protein_coding;protein_coding ENST00000227638.6;ENST00000436171.2 499;498 CGI:chr11:94128394-94129607 Island
8 | cg00008493 0.98921474501248 chr14 93347431 93347432 COX8C;UNC79 protein_coding;protein_coding ENST00000342144.2;ENST00000256339.7 239;14211 CGI:chr14:93347137-93347765 Island
9 | cg00008713 0.00849766793256932 chr18 11980954 11980955 IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000269159.6;ENST00000383376.8;ENST00000588752.4;ENST00000588927.4;ENST00000589238.4;ENST00000590107.4;ENST00000590138.1;ENST00000625802.1 -475;-563;-72;-717;-84;-512;-658;-475 CGI:chr18:11980484-11982143 Island
10 | cg00009407 0.00600256591763238 chr14 88824577 88824578 TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000338104.9;ENST00000345383.8;ENST00000346301.7;ENST00000354441.9;ENST00000358622.8;ENST00000380656.5;ENST00000536576.4;ENST00000553718.1;ENST00000554686.4;ENST00000555057.4;ENST00000556077.4;ENST00000556567.4;ENST00000556651.4;ENST00000614125.3;ENST00000622513.3 -80;423;-80;-84;423;-86;423;-18;-163;-96;423;-75;-109;423;-48 CGI:chr14:88824574-88825011 Island
11 | cg00010193 0.78874066422712 chr4 1151428 1151429 AC092535.3;TMED11P antisense;unitary_pseudogene ENST00000417557.1;ENST00000479478.4 55;2299 CGI:chr4:1144210-1146033 .
12 | cg00011459 0.950438041209892 chr16 8796568 8796569 PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;TMEM186;TMEM186 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000268261.7;ENST00000562318.4;ENST00000562448.1;ENST00000564030.4;ENST00000564069.1;ENST00000565221.4;ENST00000565896.4;ENST00000566196.4;ENST00000566540.4;ENST00000566604.4;ENST00000566983.4;ENST00000568602.4;ENST00000569958.4;ENST00000570076.4;ENST00000570134.4;ENST00000333050.6;ENST00000564869.1 -1250;-1275;-1275;-1254;-1345;-1300;-1275;-1272;-1275;-1270;7744;-1275;-1300;-1283;-1271;1081;1075 CGI:chr16:8797465-8798071 N_Shore
13 | cg00012199 0.0119642491262264 chr14 20682865 20682866 ANG;RNASE4 protein_coding;protein_coding ENST00000336811.9;ENST00000555835.2 -1313;-1236 CGI:chr14:20682759-20683065 Island
14 | cg00012386 0.00622611896739758 chr1 227734811 227734812 JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000366758.6;ENST00000438896.2;ENST00000465251.1;ENST00000480590.1;ENST00000485807.1;ENST00000615711.3;ENST00000620518.3;ENST00000315781.8;ENST00000366759.7;ENST00000366760.4;ENST00000418653.4;ENST00000426344.4;ENST00000470038.1;ENST00000475930.1;ENST00000491439.1;ENST00000617596.3 601;601;-439;266;-433;601;473;-186;-186;6153;-621;-626;-1363;6192;6103;-643 CGI:chr1:227734942-227735730 N_Shore
15 | cg00012792 0.0125193760585237 chr6 8064260 8064261 BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000244777.5;ENST00000397457.5;ENST00000543936.4;ENST00000627748.1;ENST00000439343.2;ENST00000397456.2 134;155;155;137;105;38271 CGI:chr6:8063813-8064532 Island
--------------------------------------------------------------------------------
/inst/extdata/methy/0a6b/logs/file2.parcel:
--------------------------------------------------------------------------------
1 | cintervaltree.intervaltree
2 | IntervalTree
3 | p0
4 | ((lp1
5 | cintervaltree.interval
6 | Interval
7 | p2
8 | (I0
9 | I1048576
10 | (dp3
11 | S'md5sum'
12 | p4
13 | S'65301bc6a5c8f6838727b70bae72c2b4'
14 | p5
15 | stp6
16 | Rp7
17 | ag2
18 | (I1048576
19 | I2097152
20 | (dp8
21 | g4
22 | S'd1f6b8a428f3244126f82be0a2b0935d'
23 | p9
24 | stp10
25 | Rp11
26 | ag2
27 | (I2097152
28 | I3145728
29 | (dp12
30 | g4
31 | S'42d333e2cced5dc15faba0ca5ef122d5'
32 | p13
33 | stp14
34 | Rp15
35 | ag2
36 | (I3145728
37 | I4194304
38 | (dp16
39 | g4
40 | S'e3551f0661c5c17354a9729d8662383a'
41 | p17
42 | stp18
43 | Rp19
44 | ag2
45 | (I4194304
46 | I5242880
47 | (dp20
48 | g4
49 | S'e38664c435365f5a24408639e7418093'
50 | p21
51 | stp22
52 | Rp23
53 | ag2
54 | (I5242880
55 | I6291456
56 | (dp24
57 | g4
58 | S'250eef3445b553a91902aee338ff4ac4'
59 | p25
60 | stp26
61 | Rp27
62 | ag2
63 | (I6291456
64 | I7340032
65 | (dp28
66 | g4
67 | S'd029fdc9cf26c7f6483bb4d39a36d848'
68 | p29
69 | stp30
70 | Rp31
71 | ag2
72 | (I7340032
73 | I8388608
74 | (dp32
75 | g4
76 | S'f2d9dc657140e23354beeff8a56a9052'
77 | p33
78 | stp34
79 | Rp35
80 | ag2
81 | (I8388608
82 | I9437184
83 | (dp36
84 | g4
85 | S'21352044be69cbd84c4a724957f596d7'
86 | p37
87 | stp38
88 | Rp39
89 | ag2
90 | (I9437184
91 | I9956847
92 | (dp40
93 | g4
94 | S'47b7380df258324a491e9cf284778f98'
95 | p41
96 | stp42
97 | Rp43
98 | atp44
99 | Rp45
100 | .
--------------------------------------------------------------------------------
/inst/extdata/methy/0ae7/jhu-usc.H.8.l.I.TCGA-30-1714-01A-02D-0563-05.g.txt:
--------------------------------------------------------------------------------
1 | Composite Element REF Beta_value Chromosome Start End Gene_Symbol Gene_Type Transcript_ID Position_to_TSS CGI_Coordinate Feature_Type
2 | cg00000292 0.771395097495164 chr16 28878779 28878780 ATP2A1;ATP2A1;ATP2A1;ATP2A1;ATP2A1 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000357084.6;ENST00000395503.7;ENST00000536376.4;ENST00000562185.4;ENST00000563975.1 373;290;-1275;-465;-83 CGI:chr16:28879633-28880547 N_Shore
3 | cg00002426 0.0233901750159108 chr3 57757816 57757817 SLMAP;SLMAP;SLMAP;SLMAP;SLMAP;SLMAP protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000295951.6;ENST00000295952.6;ENST00000383718.6;ENST00000428312.4;ENST00000449503.5;ENST00000467901.1 1585;368;261;257;257;514 CGI:chr3:57756198-57757263 S_Shore
4 | cg00003994 0.028704838704695 chr7 15686237 15686238 MEOX2 protein_coding ENST00000262041.5 576 CGI:chr7:16399497-16399700 .
5 | cg00005847 0.903451857318154 chr2 176164345 176164346 AC009336.19;HOXD3;HOXD3;HOXD3;RP11-387A1.5 protein_coding;protein_coding;protein_coding;protein_coding;antisense ENST00000468418.4;ENST00000249440.4;ENST00000410016.4;ENST00000432796.2;ENST00000608941.1 13259;267;3453;27387;1372 CGI:chr2:176164685-176165509 N_Shore
6 | cg00006414 0.2148836012527 chr7 149125745 149125746 RN7SL521P;ZNF398;ZNF425;ZNF425 misc_RNA;protein_coding;protein_coding;protein_coding ENST00000488398.3;ENST00000426851.5;ENST00000378061.5;ENST00000483014.1 242;-672;602;562 CGI:chr7:149126122-149127136 N_Shore
7 | cg00007981 0.00887051985245265 chr11 94129428 94129429 PANX1;PANX1 protein_coding;protein_coding ENST00000227638.6;ENST00000436171.2 499;498 CGI:chr11:94128394-94129607 Island
8 | cg00008493 0.985460674026316 chr14 93347431 93347432 COX8C;UNC79 protein_coding;protein_coding ENST00000342144.2;ENST00000256339.7 239;14211 CGI:chr14:93347137-93347765 Island
9 | cg00008713 0.0102627571258565 chr18 11980954 11980955 IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000269159.6;ENST00000383376.8;ENST00000588752.4;ENST00000588927.4;ENST00000589238.4;ENST00000590107.4;ENST00000590138.1;ENST00000625802.1 -475;-563;-72;-717;-84;-512;-658;-475 CGI:chr18:11980484-11982143 Island
10 | cg00009407 0.00664922618318322 chr14 88824577 88824578 TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000338104.9;ENST00000345383.8;ENST00000346301.7;ENST00000354441.9;ENST00000358622.8;ENST00000380656.5;ENST00000536576.4;ENST00000553718.1;ENST00000554686.4;ENST00000555057.4;ENST00000556077.4;ENST00000556567.4;ENST00000556651.4;ENST00000614125.3;ENST00000622513.3 -80;423;-80;-84;423;-86;423;-18;-163;-96;423;-75;-109;423;-48 CGI:chr14:88824574-88825011 Island
11 | cg00010193 0.624378233369569 chr4 1151428 1151429 AC092535.3;TMED11P antisense;unitary_pseudogene ENST00000417557.1;ENST00000479478.4 55;2299 CGI:chr4:1144210-1146033 .
12 | cg00011459 0.916476198575747 chr16 8796568 8796569 PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;TMEM186;TMEM186 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000268261.7;ENST00000562318.4;ENST00000562448.1;ENST00000564030.4;ENST00000564069.1;ENST00000565221.4;ENST00000565896.4;ENST00000566196.4;ENST00000566540.4;ENST00000566604.4;ENST00000566983.4;ENST00000568602.4;ENST00000569958.4;ENST00000570076.4;ENST00000570134.4;ENST00000333050.6;ENST00000564869.1 -1250;-1275;-1275;-1254;-1345;-1300;-1275;-1272;-1275;-1270;7744;-1275;-1300;-1283;-1271;1081;1075 CGI:chr16:8797465-8798071 N_Shore
13 | cg00012199 0.00897505734686159 chr14 20682865 20682866 ANG;RNASE4 protein_coding;protein_coding ENST00000336811.9;ENST00000555835.2 -1313;-1236 CGI:chr14:20682759-20683065 Island
14 | cg00012386 0.00783581050121143 chr1 227734811 227734812 JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000366758.6;ENST00000438896.2;ENST00000465251.1;ENST00000480590.1;ENST00000485807.1;ENST00000615711.3;ENST00000620518.3;ENST00000315781.8;ENST00000366759.7;ENST00000366760.4;ENST00000418653.4;ENST00000426344.4;ENST00000470038.1;ENST00000475930.1;ENST00000491439.1;ENST00000617596.3 601;601;-439;266;-433;601;473;-186;-186;6153;-621;-626;-1363;6192;6103;-643 CGI:chr1:227734942-227735730 N_Shore
15 | cg00012792 0.00972612443887198 chr6 8064260 8064261 BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000244777.5;ENST00000397457.5;ENST00000543936.4;ENST00000627748.1;ENST00000439343.2;ENST00000397456.2 134;155;155;137;105;38271 CGI:chr6:8063813-8064532 Island
--------------------------------------------------------------------------------
/inst/extdata/methy/0ae7/logs/file3.parcel:
--------------------------------------------------------------------------------
1 | cintervaltree.intervaltree
2 | IntervalTree
3 | p0
4 | ((lp1
5 | cintervaltree.interval
6 | Interval
7 | p2
8 | (I0
9 | I1048576
10 | (dp3
11 | S'md5sum'
12 | p4
13 | S'aead104e6ece37828f89eb77288fe918'
14 | p5
15 | stp6
16 | Rp7
17 | ag2
18 | (I1048576
19 | I2097152
20 | (dp8
21 | g4
22 | S'0773e3100764340b3cc404950897a6fb'
23 | p9
24 | stp10
25 | Rp11
26 | ag2
27 | (I2097152
28 | I3145728
29 | (dp12
30 | g4
31 | S'86ebefee3b2dfc27c2d0b5c9117c971d'
32 | p13
33 | stp14
34 | Rp15
35 | ag2
36 | (I3145728
37 | I4194304
38 | (dp16
39 | g4
40 | S'c05863055e7581a14e6be321d08a2511'
41 | p17
42 | stp18
43 | Rp19
44 | ag2
45 | (I4194304
46 | I5242880
47 | (dp20
48 | g4
49 | S'3fbdb5a920e9f2796cb6dbe7e5485bd7'
50 | p21
51 | stp22
52 | Rp23
53 | ag2
54 | (I5242880
55 | I6291456
56 | (dp24
57 | g4
58 | S'2dfd8445618501c7100903d44dfe0ab5'
59 | p25
60 | stp26
61 | Rp27
62 | ag2
63 | (I6291456
64 | I7340032
65 | (dp28
66 | g4
67 | S'38e743a0a65c8f211f9199cd85ea97db'
68 | p29
69 | stp30
70 | Rp31
71 | ag2
72 | (I7340032
73 | I8388608
74 | (dp32
75 | g4
76 | S'c030bea01e6543263fdf62e8e3504edd'
77 | p33
78 | stp34
79 | Rp35
80 | ag2
81 | (I8388608
82 | I9437184
83 | (dp36
84 | g4
85 | S'a43d34869fe5ba12c29279dd5bb4c9ba'
86 | p37
87 | stp38
88 | Rp39
89 | ag2
90 | (I9437184
91 | I9958646
92 | (dp40
93 | g4
94 | S'1d1cc80e21505dd920dd473d6cda718f'
95 | p41
96 | stp42
97 | Rp43
98 | atp44
99 | Rp45
100 | .
--------------------------------------------------------------------------------
/inst/extdata/methy/0b32/jhu-usc.e.H.5.l.TCGA-13-1510-01A-02D-0475-05.gdc_hg38.txt:
--------------------------------------------------------------------------------
1 | Composite Element REF Beta_value Chromosome Start End Gene_Symbol Gene_Type Transcript_ID Position_to_TSS CGI_Coordinate Feature_Type
2 | cg00000292 0.930908590439207 chr16 28878779 28878780 ATP2A1;ATP2A1;ATP2A1;ATP2A1;ATP2A1 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000357084.6;ENST00000395503.7;ENST00000536376.4;ENST00000562185.4;ENST00000563975.1 373;290;-1275;-465;-83 CGI:chr16:28879633-28880547 N_Shore
3 | cg00002426 0.104268079362755 chr3 57757816 57757817 SLMAP;SLMAP;SLMAP;SLMAP;SLMAP;SLMAP protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000295951.6;ENST00000295952.6;ENST00000383718.6;ENST00000428312.4;ENST00000449503.5;ENST00000467901.1 1585;368;261;257;257;514 CGI:chr3:57756198-57757263 S_Shore
4 | cg00003994 0.0401792073366475 chr7 15686237 15686238 MEOX2 protein_coding ENST00000262041.5 576 CGI:chr7:16399497-16399700 .
5 | cg00005847 0.780428197539084 chr2 176164345 176164346 AC009336.19;HOXD3;HOXD3;HOXD3;RP11-387A1.5 protein_coding;protein_coding;protein_coding;protein_coding;antisense ENST00000468418.4;ENST00000249440.4;ENST00000410016.4;ENST00000432796.2;ENST00000608941.1 13259;267;3453;27387;1372 CGI:chr2:176164685-176165509 N_Shore
6 | cg00006414 0.780428197539084 chr7 149125745 149125746 RN7SL521P;ZNF398;ZNF425;ZNF425 misc_RNA;protein_coding;protein_coding;protein_coding ENST00000488398.3;ENST00000426851.5;ENST00000378061.5;ENST00000483014.1 242;-672;602;562 CGI:chr7:149126122-149127136 N_Shore
7 | cg00007981 0.0220541697023661 chr11 94129428 94129429 PANX1;PANX1 protein_coding;protein_coding ENST00000227638.6;ENST00000436171.2 499;498 CGI:chr11:94128394-94129607 Island
8 | cg00008493 0.976977997665785 chr14 93347431 93347432 COX8C;UNC79 protein_coding;protein_coding ENST00000342144.2;ENST00000256339.7 239;14211 CGI:chr14:93347137-93347765 Island
9 | cg00008713 0.0150960841510609 chr18 11980954 11980955 IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2;IMPA2 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000269159.6;ENST00000383376.8;ENST00000588752.4;ENST00000588927.4;ENST00000589238.4;ENST00000590107.4;ENST00000590138.1;ENST00000625802.1 -475;-563;-72;-717;-84;-512;-658;-475 CGI:chr18:11980484-11982143 Island
10 | cg00009407 0.0881331292749216 chr14 88824577 88824578 TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8;TTC8 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000338104.9;ENST00000345383.8;ENST00000346301.7;ENST00000354441.9;ENST00000358622.8;ENST00000380656.5;ENST00000536576.4;ENST00000553718.1;ENST00000554686.4;ENST00000555057.4;ENST00000556077.4;ENST00000556567.4;ENST00000556651.4;ENST00000614125.3;ENST00000622513.3 -80;423;-80;-84;423;-86;423;-18;-163;-96;423;-75;-109;423;-48 CGI:chr14:88824574-88825011 Island
11 | cg00010193 0.630671624077345 chr4 1151428 1151429 AC092535.3;TMED11P antisense;unitary_pseudogene ENST00000417557.1;ENST00000479478.4 55;2299 CGI:chr4:1144210-1146033 .
12 | cg00011459 0.885521863234167 chr16 8796568 8796569 PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;PMM2;TMEM186;TMEM186 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000268261.7;ENST00000562318.4;ENST00000562448.1;ENST00000564030.4;ENST00000564069.1;ENST00000565221.4;ENST00000565896.4;ENST00000566196.4;ENST00000566540.4;ENST00000566604.4;ENST00000566983.4;ENST00000568602.4;ENST00000569958.4;ENST00000570076.4;ENST00000570134.4;ENST00000333050.6;ENST00000564869.1 -1250;-1275;-1275;-1254;-1345;-1300;-1275;-1272;-1275;-1270;7744;-1275;-1300;-1283;-1271;1081;1075 CGI:chr16:8797465-8798071 N_Shore
13 | cg00012199 0.0341206451263948 chr14 20682865 20682866 ANG;RNASE4 protein_coding;protein_coding ENST00000336811.9;ENST00000555835.2 -1313;-1236 CGI:chr14:20682759-20683065 Island
14 | cg00012386 0.0116998743835203 chr1 227734811 227734812 JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;JMJD4;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47;SNAP47 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000366758.6;ENST00000438896.2;ENST00000465251.1;ENST00000480590.1;ENST00000485807.1;ENST00000615711.3;ENST00000620518.3;ENST00000315781.8;ENST00000366759.7;ENST00000366760.4;ENST00000418653.4;ENST00000426344.4;ENST00000470038.1;ENST00000475930.1;ENST00000491439.1;ENST00000617596.3 601;601;-439;266;-433;601;473;-186;-186;6153;-621;-626;-1363;6192;6103;-643 CGI:chr1:227734942-227735730 N_Shore
15 | cg00012792 0.0149194095384671 chr6 8064260 8064261 BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5;BLOC1S5-TXNDC5;EEF1E1-BLOC1S5 protein_coding;protein_coding;protein_coding;protein_coding;protein_coding;protein_coding ENST00000244777.5;ENST00000397457.5;ENST00000543936.4;ENST00000627748.1;ENST00000439343.2;ENST00000397456.2 134;155;155;137;105;38271 CGI:chr6:8063813-8064532 Island
--------------------------------------------------------------------------------
/inst/extdata/methy/0b32/logs/file4.parcel:
--------------------------------------------------------------------------------
1 | cintervaltree.intervaltree
2 | IntervalTree
3 | p0
4 | ((lp1
5 | cintervaltree.interval
6 | Interval
7 | p2
8 | (I0
9 | I1048576
10 | (dp3
11 | S'md5sum'
12 | p4
13 | S'efa0b12c00fcd025f9bb24b4bbafdc26'
14 | p5
15 | stp6
16 | Rp7
17 | ag2
18 | (I1048576
19 | I2097152
20 | (dp8
21 | g4
22 | S'86c446ea359acb897b2b622d6d8ff3b8'
23 | p9
24 | stp10
25 | Rp11
26 | ag2
27 | (I2097152
28 | I3145728
29 | (dp12
30 | g4
31 | S'6bae43a0c5c6de9d628114a4987eee28'
32 | p13
33 | stp14
34 | Rp15
35 | ag2
36 | (I3145728
37 | I4194304
38 | (dp16
39 | g4
40 | S'0703a5f69cd342d85bc0dffe4daf0432'
41 | p17
42 | stp18
43 | Rp19
44 | ag2
45 | (I4194304
46 | I5242880
47 | (dp20
48 | g4
49 | S'07b7d7041ec060fdfa4b35fb73b0324b'
50 | p21
51 | stp22
52 | Rp23
53 | ag2
54 | (I5242880
55 | I6291456
56 | (dp24
57 | g4
58 | S'42c174280504a41536414c7c4119ebca'
59 | p25
60 | stp26
61 | Rp27
62 | ag2
63 | (I6291456
64 | I7340032
65 | (dp28
66 | g4
67 | S'4643bfaad70e5fd1e50df13889207d86'
68 | p29
69 | stp30
70 | Rp31
71 | ag2
72 | (I7340032
73 | I8388608
74 | (dp32
75 | g4
76 | S'803aeea62bad0d6a31e63f3e6647e9c4'
77 | p33
78 | stp34
79 | Rp35
80 | ag2
81 | (I8388608
82 | I9437184
83 | (dp36
84 | g4
85 | S'86fe78bd9182ab0920b0e3d2882206f6'
86 | p37
87 | stp38
88 | Rp39
89 | ag2
90 | (I9437184
91 | I9944322
92 | (dp40
93 | g4
94 | S'052e8964fbc18d41c883c15feceae565'
95 | p41
96 | stp42
97 | Rp43
98 | atp44
99 | Rp45
100 | .
--------------------------------------------------------------------------------
/man/GSE66705_sample2.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{GSE66705_sample2}
5 | \alias{GSE66705_sample2}
6 | \title{a matrix of gene expression data in GEO}
7 | \format{
8 | A matrix with 999 rows and 3 columns
9 | }
10 | \usage{
11 | GSE66705_sample2
12 | }
13 | \description{
14 | the first column represents the gene symbol
15 | }
16 | \details{
17 | the other columns represent the expression of genes
18 | }
19 | \keyword{datasets}
20 |
--------------------------------------------------------------------------------
/man/Merge_methy_tcga.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/Merge_methylation.R
3 | \name{Merge_methy_tcga}
4 | \alias{Merge_methy_tcga}
5 | \title{Merge methylation data downloaded from TCGA}
6 | \usage{
7 | Merge_methy_tcga(dirr = NULL)
8 | }
9 | \arguments{
10 | \item{dirr}{a string for the directory of methylation data downloaded from TCGA
11 | using the GDC tools}
12 | }
13 | \value{
14 | a matrix, a combined methylation expression spectrum matrix
15 | }
16 | \description{
17 | When the methylation data is downloaded from TCGA,
18 | each sample is saved in a folder, which contains the methylation value file
19 | and the descriptive file. This function can directly
20 | extract and consolidate all folders.
21 | }
22 | \examples{
23 | merge_result <- Merge_methy_tcga(system.file(file.path("extdata", "methy"),
24 | package = "GeoTcgaData"))
25 | }
26 |
--------------------------------------------------------------------------------
/man/SNP_QC.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/SNP.R
3 | \name{SNP_QC}
4 | \alias{SNP_QC}
5 | \title{Do quality control of SNP data downloaded from TCGAbiolinks}
6 | \usage{
7 | SNP_QC(
8 | snpData,
9 | geon = 0.02,
10 | mind = 0.02,
11 | maf = 0.05,
12 | hwe = 1e-06,
13 | miss = "NoCall"
14 | )
15 | }
16 | \arguments{
17 | \item{snpData}{data.frame of SNP data downloaded from TCGAbiolinks}
18 |
19 | \item{geon}{filters out all variants with missing call rates
20 | exceeding the provided value (default 0.02)}
21 |
22 | \item{mind}{filters out all samples with missing call rates exceeding
23 | the provided value (default 0.02)}
24 |
25 | \item{maf}{filters out all variants with minor allele frequency below
26 | the provided threshold}
27 |
28 | \item{hwe}{filters out all variants which have Hardy-Weinberg
29 | equilibrium exact test p-value below the provided threshold}
30 |
31 | \item{miss}{character string that denotes a missing value}
32 | }
33 | \value{
34 | data.frame
35 | }
36 | \description{
37 | Do quality control of SNP data downloaded from TCGAbiolinks
38 | }
39 | \examples{
40 | # use demo data
41 | snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10)
42 | snpDf <- as.data.frame(snpDf)
43 | sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
44 | result <- SNP_QC(snpDf)
45 | }
46 |
--------------------------------------------------------------------------------
/man/array_preprocess.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/arrayDiff.R
3 | \name{array_preprocess}
4 | \alias{array_preprocess}
5 | \title{Preprocess of Microarray data}
6 | \usage{
7 | array_preprocess(x, missing_value = "knn", string = " /// ")
8 | }
9 | \arguments{
10 | \item{x}{matrix of Microarray data, each column is a sample,
11 | and each row is a gene.}
12 |
13 | \item{missing_value}{Method to impute missing expression data,
14 | one of "zero" and "knn".}
15 |
16 | \item{string}{a string, the separator between genes}
17 | }
18 | \value{
19 | matrix
20 | }
21 | \description{
22 | Preprocess of Microarray data
23 | }
24 | \examples{
25 | \donttest{
26 | arraylist <- get_geo_array("GSE781")
27 | arraylist <- lapply(arraylist, array_preprocess)
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/man/cal_mean_module.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_mean_module.R
3 | \name{cal_mean_module}
4 | \alias{cal_mean_module}
5 | \title{Find the mean value of the gene in each module}
6 | \usage{
7 | cal_mean_module(geneExpress, module)
8 | }
9 | \arguments{
10 | \item{geneExpress}{a data.frame of gene expression data.
11 | Each column is a sample, and each row is a gene.}
12 |
13 | \item{module}{a data.frame with two columns. The first column is the module name,
14 | the second column contains the genes in that module.}
15 | }
16 | \value{
17 | a data.frame containing the mean gene expression value of
18 | the genes in each module
19 | }
20 | \description{
21 | Find the mean value of the gene in each module
22 | }
23 | \examples{
24 | data(geneExpress)
25 | data(module)
26 | result <- cal_mean_module(geneExpress, module)
27 | }
28 |
--------------------------------------------------------------------------------
/man/cluster_array.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/arrayDiff.R
3 | \name{cluster_array}
4 | \alias{cluster_array}
5 | \title{cluster probes of Microarray data}
6 | \usage{
7 | cluster_array(x, clusterCutoff = 0.7)
8 | }
9 | \arguments{
10 | \item{x}{matrix of Microarray data, the first column is the name of the gene,
11 | and the other columns are the expression values.}
12 |
13 | \item{clusterCutoff}{Pearson correlation threshold
14 | to cut off the hierarchical tree.}
15 | }
16 | \value{
17 | data.frame
18 | }
19 | \description{
20 | cluster probes of Microarray data
21 | }
22 | \examples{
23 | \donttest{
24 | arraylist <- get_geo_array("GSE781")
25 | arraylist <- lapply(arraylist, array_preprocess)
26 | arraylist_cluster <- lapply(arraylist, cluster_array)
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/man/combine_pvalue.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/SNP.R
3 | \name{combine_pvalue}
4 | \alias{combine_pvalue}
5 | \title{combine pvalues of SNP difference analysis result}
6 | \usage{
7 | combine_pvalue(snpResult, snp2gene, combineMethod = min)
8 | }
9 | \arguments{
10 | \item{snpResult}{data.frame of SNP difference analysis result.}
11 |
12 | \item{snp2gene}{data frame of two columns: snp and gene.}
13 |
14 | \item{combineMethod}{Method of combining the
15 | p-values of multiple SNPs in a gene.}
16 | }
17 | \value{
18 | data.frame
19 | }
20 | \description{
21 | combine pvalues of SNP difference analysis result
22 | }
23 | \examples{
24 | snpResult <- data.frame(pvalue = runif(100), estimate = runif(100))
25 | rownames(snpResult) <- paste0("snp", seq_len(100))
26 | snp2gene <- data.frame(snp = rownames(snpResult),
27 | gene = rep(paste0("gene", seq_len(20)), 5))
28 | result <- combine_pvalue(snpResult, snp2gene)
29 | }
30 |
--------------------------------------------------------------------------------
/man/countToFpkm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fpkm_count_conversion.r
3 | \name{countToFpkm}
4 | \alias{countToFpkm}
5 | \title{Convert count to FPKM}
6 | \usage{
7 | countToFpkm(counts_matrix, keyType = "SYMBOL", gene_cov)
8 | }
9 | \arguments{
10 | \item{counts_matrix}{a matrix, colnames of counts_matrix are sample name,
11 | rownames of counts_matrix are gene symbols}
12 |
13 | \item{keyType}{keyType, one of keytypes(org.Hs.eg.db).}
14 |
15 | \item{gene_cov}{data.frame of two columns, the first column is gene length,
16 | the second column is gene GC content}
17 | }
18 | \value{
19 | a matrix
20 | }
21 | \description{
22 | Convert count to FPKM
23 | }
24 | \examples{
25 | data(gene_cov)
26 | lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
27 | rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
28 | colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
29 | result <- countToFpkm(lung_squ_count2,
30 | keyType = "SYMBOL",
31 | gene_cov = gene_cov
32 | )
33 | }
34 |
--------------------------------------------------------------------------------
/man/countToTpm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fpkm_count_conversion.r
3 | \name{countToTpm}
4 | \alias{countToTpm}
5 | \title{Convert count to Tpm}
6 | \usage{
7 | countToTpm(counts_matrix, keyType = "SYMBOL", gene_cov)
8 | }
9 | \arguments{
10 | \item{counts_matrix}{a matrix, colnames of counts_matrix are sample name,
11 | rownames of counts_matrix are gene symbols}
12 |
13 | \item{keyType}{keyType, one of keytypes(org.Hs.eg.db).}
14 |
15 | \item{gene_cov}{data.frame of two columns, the first column is gene length,
16 | the second column is gene GC content}
17 | }
18 | \value{
19 | a matrix
20 | }
21 | \description{
22 | Convert count to Tpm
23 | }
24 | \examples{
25 | data(gene_cov)
26 | lung_squ_count2 <- matrix(c(1, 2, 3, 4, 5, 6, 7, 8, 9), ncol = 3)
27 | rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
28 | colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
29 | result <- countToTpm(lung_squ_count2,
30 | keyType = "SYMBOL",
31 | gene_cov = gene_cov
32 | )
33 | }
34 |
--------------------------------------------------------------------------------
/man/differential_RNA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/RNA_seq.R
3 | \name{differential_RNA}
4 | \alias{differential_RNA}
5 | \title{differential_RNA}
6 | \usage{
7 | differential_RNA(
8 | counts,
9 | group,
10 | groupCol,
11 | method = "limma",
12 | geneLength = NULL,
13 | gccontent = NULL,
14 | filter = TRUE,
15 | edgeRNorm = TRUE,
16 | adjust.method = "BH",
17 | useTopconfects = TRUE,
18 | ucscData = FALSE
19 | )
20 | }
21 | \arguments{
22 | \item{counts}{a dataframe or numeric matrix of raw counts data,
23 | or SummarizedExperiment object}
24 |
25 | \item{group}{sample groups}
26 |
27 | \item{groupCol}{group column}
28 |
29 | \item{method}{one of "DESeq2", "edgeR" , "limma", "dearseq",
30 | "NOISeq", "Wilcoxon", and "auto".}
31 |
32 | \item{geneLength}{a vector of gene length.}
33 |
34 | \item{gccontent}{a vector of gene GC content.}
35 |
36 | \item{filter}{if TRUE, use filterByExpr to filter genes.}
37 |
38 | \item{edgeRNorm}{if TRUE, use edgeR to do normalization for dearseq method.}
39 |
40 | \item{adjust.method}{character string specifying the method used to
41 | adjust p-values for multiple testing.
42 | See \link{p.adjust} for possible values.}
43 |
44 | \item{useTopconfects}{if TRUE, use topconfects to provide a
45 | more biologically useful ranked gene list.}
46 |
47 | \item{ucscData}{Logical, whether the data comes from UCSC Xena.}
48 | }
49 | \value{
50 | data.frame
51 | }
52 | \description{
53 | Do difference analysis of RNA-seq data
54 | }
55 | \examples{
56 | \donttest{
57 | library(TCGAbiolinks)
58 |
59 | query <- GDCquery(
60 | project = "TCGA-ACC",
61 | data.category = "Transcriptome Profiling",
62 | data.type = "Gene Expression Quantification",
63 | workflow.type = "STAR - Counts"
64 | )
65 |
66 | GDCdownload(query,
67 | method = "api", files.per.chunk = 3,
68 | directory = Your_Path
69 | )
70 |
71 | dataRNA <- GDCprepare(
72 | query = query, directory = Your_Path,
73 | save = TRUE, save.filename = "dataRNA.RData"
74 | )
75 | ## get raw count matrix
76 | dataPrep <- TCGAanalyze_Preprocessing(
77 | object = dataRNA,
78 | cor.cut = 0.6,
79 | datatype = "STAR - Counts"
80 | )
81 |
82 | # Use `differential_RNA` to do difference analysis.
83 | # We provide the data of human gene length and GC content in `gene_cov`.
84 | group <- sample(c("grp1", "grp2"), ncol(dataPrep), replace = TRUE)
85 | library(cqn) # To avoid reporting errors: there is no function "rq"
86 | ## get gene length and GC content
87 | library(org.Hs.eg.db)
88 | genes_bitr <- clusterProfiler::bitr(rownames(gene_cov),
89 | fromType = "ENTREZID", toType = "ENSEMBL",
90 | OrgDb = org.Hs.eg.db, drop = TRUE
91 | )
92 | genes_bitr <- genes_bitr[!duplicated(genes_bitr[, 2]), ]
93 | gene_cov2 <- gene_cov[genes_bitr$ENTREZID, ]
94 | rownames(gene_cov2) <- genes_bitr$ENSEMBL
95 | genes <- intersect(rownames(dataPrep), rownames(gene_cov2))
96 | dataPrep <- dataPrep[genes, ]
97 | geneLength <- gene_cov2[genes, "length"]
98 | gccontent <- gene_cov2[genes, "GC"]
99 | names(geneLength) <- names(gccontent) <- genes
100 | ## Difference analysis
101 | DEGAll <- differential_RNA(
102 | counts = dataPrep, group = group,
103 | geneLength = geneLength, gccontent = gccontent
104 | )
105 | # Use `clusterProfiler` to do enrichment analysis:
106 | diffGenes <- DEGAll$logFC
107 | names(diffGenes) <- rownames(DEGAll)
108 | diffGenes <- sort(diffGenes, decreasing = TRUE)
109 | library(clusterProfiler)
110 | library(enrichplot)
111 | library(org.Hs.eg.db)
112 | gsego <- gseGO(gene = diffGenes, OrgDb = org.Hs.eg.db, keyType = "ENSEMBL")
113 | dotplot(gsego)
114 | }
115 | # use user-defined data
116 | df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16)
117 | df <- as.data.frame(df)
118 | rownames(df) <- paste0("gene", 1:25)
119 | colnames(df) <- paste0("sample", 1:16)
120 | group <- sample(c("group1", "group2"), 16, replace = TRUE)
121 | result <- differential_RNA(counts = df, group = group,
122 | filter = FALSE, method = "Wilcoxon")
123 | # use SummarizedExperiment object input
124 | df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16)
125 | rownames(df) <- paste0("gene", 1:25)
126 | colnames(df) <- paste0("sample", 1:16)
127 | group <- sample(c("group1", "group2"), 16, replace = TRUE)
128 |
129 | nrows <- 200; ncols <- 20
130 | counts <- matrix(
131 | runif(nrows * ncols, 1, 1e4), nrows,
132 | dimnames = list(paste0("cg",1:200),paste0("S",1:20))
133 | )
134 |
135 | colData <- S4Vectors::DataFrame(
136 | row.names = paste0("sample", 1:16),
137 | group = group
138 | )
139 | data <- SummarizedExperiment::SummarizedExperiment(
140 | assays=S4Vectors::SimpleList(counts=df),
141 | colData = colData)
142 |
143 | result <- differential_RNA(counts = data, groupCol = "group",
144 | filter = FALSE, method = "Wilcoxon")
145 | }
146 |
--------------------------------------------------------------------------------
/man/differential_SNP.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/SNP.R
3 | \name{differential_SNP}
4 | \alias{differential_SNP}
5 | \title{Do difference analysis of SNP data}
6 | \usage{
7 | differential_SNP(snpDf, sampleGroup, combineMethod = min)
8 | }
9 | \arguments{
10 | \item{snpDf}{data.frame of SNP data, each column is a sample,
11 | and each row is a SNP.}
12 |
13 | \item{sampleGroup}{vector of sample group.}
14 |
15 | \item{combineMethod}{Method of combining the
16 | pvalue of multiple snp in a gene.}
17 | }
18 | \value{
19 | data.frame
20 | }
21 | \description{
22 | Do difference analysis of SNP data
23 | }
24 | \examples{
25 | \donttest{
26 | library(TCGAbiolinks)
27 | query <- GDCquery(
28 | project = "TCGA-CHOL",
29 | data.category = "Simple Nucleotide Variation",
30 | access = "open",
31 | legacy = FALSE,
32 | data.type = "Masked Somatic Mutation",
33 | workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
34 | )
35 | GDCdownload(query)
36 | data_snp <- GDCprepare(query)
37 | samples <- unique(data_snp$Tumor_Sample_Barcode)
38 | sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE)
39 | names(sampleGroup) <- samples
40 | pvalue <- differential_SNP_tcga(snpData = data_snp,
41 | sampleGroup = sampleGroup)
42 | }
43 | # use demo data
44 | snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10)
45 | snpDf <- as.data.frame(snpDf)
46 | sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
47 | result <- differential_SNP(snpDf, sampleGroup)
48 | }
49 |
--------------------------------------------------------------------------------
/man/differential_SNP_GEO.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/SNP.R
3 | \name{differential_SNP_GEO}
4 | \alias{differential_SNP_GEO}
5 | \title{Do difference analysis of SNP data downloaded from GEO}
6 | \usage{
7 | differential_SNP_GEO(snpData, sampleGroup, method = "Chisquare")
8 | }
9 | \arguments{
10 | \item{snpData}{data.frame of SNP data downloaded from GEO}
11 |
12 | \item{sampleGroup}{vector of sample group}
13 |
14 | \item{method}{one of "Chisquare", "fisher",
15 | and "CATT"(Cochran-Armitage trend test)}
16 | }
17 | \value{
18 | data.frame
19 | }
20 | \description{
21 | Do difference analysis of SNP data downloaded from GEO
22 | }
23 | \examples{
24 | \donttest{
25 | file1 <- read.table("GSE66903_series_matrix.txt.gz",
26 | fill=TRUE, comment.char="!", header = TRUE)
27 | rownames(file1) <- file1[, 1]
28 | snpData <- file1[, -1]
29 | sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE)
30 | names(sampleGroup) <- colnames(snpData)
31 | snpData <- SNP_QC(snpData)
32 | sampleGroup <- sample(c("A", "B"), ncol(snpData ), replace = TRUE)
33 | result1 <- differential_SNP_GEO(snpData = snpData,
34 | sampleGroup = sampleGroup, method = "Chisquare")
35 | }
36 | # use demo data
37 | snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10)
38 | snpDf <- as.data.frame(snpDf)
39 | sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
40 | result <- differential_SNP_GEO(snpDf, sampleGroup, method = "fisher")
41 | }
42 |
--------------------------------------------------------------------------------
/man/differential_SNP_tcga.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/SNP.R
3 | \name{differential_SNP_tcga}
4 | \alias{differential_SNP_tcga}
5 | \title{Do difference analysis of SNP data downloaded from TCGAbiolinks}
6 | \usage{
7 | differential_SNP_tcga(snpData, sampleGroup, combineMethod = NULL)
8 | }
9 | \arguments{
10 | \item{snpData}{data.frame of SNP data downloaded from TCGAbiolinks}
11 |
12 | \item{sampleGroup}{vector of sample group}
13 |
14 | \item{combineMethod}{Method of combining the pvalue of
15 | multiple snp in a gene.}
16 | }
17 | \value{
18 | data.frame
19 | }
20 | \description{
21 | Do difference analysis of SNP data downloaded from TCGAbiolinks
22 | }
23 | \examples{
24 | \donttest{
25 | library(TCGAbiolinks)
26 | query <- GDCquery(
27 | project = "TCGA-CHOL",
28 | data.category = "Simple Nucleotide Variation",
29 | access = "open",
30 | legacy = FALSE,
31 | data.type = "Masked Somatic Mutation",
32 | workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
33 | )
34 | GDCdownload(query)
35 | data_snp <- GDCprepare(query)
36 | samples <- unique(data_snp$Tumor_Sample_Barcode)
37 | sampleGroup <- sample(c("A", "B"), length(samples), replace = TRUE)
38 | names(sampleGroup) <- samples
39 | pvalue <- differential_SNP_tcga(snpData = data_snp,
40 | sampleGroup = sampleGroup)
41 | }
42 | # use demo data
43 | snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10)
44 | snpDf <- as.data.frame(snpDf)
45 | sampleGroup <- sample(c("A", "B"), 10, replace = TRUE)
46 | result <- differential_SNP(snpDf, sampleGroup)
47 | }
48 |
--------------------------------------------------------------------------------
/man/differential_array.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/arrayDiff.R
3 | \name{differential_array}
4 | \alias{differential_array}
5 | \title{Differential analysis of Microarray data}
6 | \usage{
7 | differential_array(df, group, method = "limma", adjust.method = "BH")
8 | }
9 | \arguments{
10 | \item{df}{data.frame of the omic data, each column is a sample,
11 | and each row is a gene.}
12 |
13 | \item{group}{a vector, group of samples.}
14 |
15 | \item{method}{method to do differential analysis,
16 | one of "limma", "ttest", "wilcox".}
17 |
18 | \item{adjust.method}{adjust.method, one of "holm", "hochberg", "hommel",
19 | "bonferroni", "BH", "BY", "fdr", and "none".}
20 | }
21 | \value{
22 | data.frame
23 | }
24 | \description{
25 | Differential analysis of Microarray data
26 | }
27 | \examples{
28 | \donttest{
29 | library(GeoTcgaData)
30 | library(data.table)
31 | # Use real GEO data as example
32 | arrayData <- read.table("GSE54807_series_matrix.txt.gz",
33 | sep = "\t", header = TRUE,
34 | fill=TRUE, comment.char = "!", check.names=FALSE)
35 | gpl <- fread("GPL6244-17930.txt", sep = "\t", header = TRUE)
36 | gpl <- gpl[, c("ID", "gene_assignment")]
37 | class(gpl) <- "data.frame"
38 |
39 | for (i in seq_len(nrow(gpl))) {
40 | aa <- strsplit(gpl[i, 2], " // ")[[1]][5]
41 | gpl[i, 2] <- as.character(strsplit(aa, " /// ")[[1]][1])
42 | }
43 | gpl[,1] <- as.character(gpl[,1])
44 | arrayData[, 1] <- as.character(arrayData[, 1])
45 | rownames(gpl) <- gpl[, 1]
46 | arrayData[, 1] <- gpl[arrayData[, 1], 2]
47 |
48 |
49 | arrayData <- repRemove(arrayData," /// ")
50 |
51 | # Remove rows that do not correspond to genes
52 | arrayData <- arrayData[!is.na(arrayData[, 1]), ]
53 | arrayData <- arrayData[!arrayData[, 1] == "", ]
54 | arrayData <- arrayData[!arrayData[, 1] == "---", ]
55 |
56 |
57 | arrayData <- arrayData[order(arrayData[, 1]), ]
58 | arrayData <- gene_ave(arrayData, 1)
59 |
60 | keep <- apply(arrayData, 1, function(x) sum(x < 1) < (length(x)/2))
61 | arrayData <- arrayData[keep, ]
62 |
63 | group <- c(rep("group1", 12), rep("group2", 12))
64 | result <- differential_array(df = arrayData, group = group)
65 | }
66 | # Use random data as example
67 | arrayData <- matrix(runif(200), 25, 8)
68 | rownames(arrayData) <- paste0("gene", 1:25)
69 | colnames(arrayData) <- paste0("sample", 1:8)
70 | group <- c(rep("group1", 4), rep("group2", 4))
71 | result <- differential_array(df = arrayData, group = group)
72 | }
73 |
--------------------------------------------------------------------------------
/man/differential_cnv.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/CNV.R
3 | \name{differential_CNV}
4 | \alias{differential_CNV}
5 | \title{Do difference analysis of gene level copy number variation data}
6 | \usage{
7 | differential_CNV(
8 | cnvData,
9 | sampleGroup,
10 | method = "Chisquare",
11 | adjust.method = "BH",
12 | ...
13 | )
14 | }
15 | \arguments{
16 | \item{cnvData}{data.frame of CNV data, each column is a sample,
17 | and each row is a CNV.}
18 |
19 | \item{sampleGroup}{vector of sample group}
20 |
21 | \item{method}{method to do differential analysis,
22 | one of "Chisquare", "fisher",
23 | and "CATT"(Cochran-Armitage trend test)}
24 |
25 | \item{adjust.method}{adjust.method, one of "holm", "hochberg", "hommel",
26 | "bonferroni", "BH", "BY", "fdr", and "none".}
27 |
28 | \item{...}{parameters for "Chisquare", "fisher",
29 | and "CATT"(Cochran-Armitage trend test)}
30 | }
31 | \value{
32 | data.frame with pvalue and estimate
33 | }
34 | \description{
35 | Do difference analysis of gene level copy number variation data
36 | }
37 | \examples{
38 | \donttest{
39 | # use TCGAbiolinks data as example
40 | library(TCGAbiolinks)
41 | query <- GDCquery(
42 | project = "TCGA-ACC",
43 | data.category = "Copy Number Variation",
44 | data.type = "Gene Level Copy Number",
45 | access = "open"
46 | )
47 | GDCdownload(query)
48 | cnvData <- GDCprepare(query)
49 | aa <- assays(cnvData)$copy_number
50 | bb <- aa
51 | aa[bb == 2] <- 0
52 | aa[bb < 2] <- -1
53 | aa[bb > 2] <- 1
54 | sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE)
55 | diffCnv <- differential_CNV(aa, sampleGroup)
56 |
57 | # Use sangerbox CNV data as example
58 | cnvData <- fread("Merge_GeneLevelCopyNumber.txt")
59 | class(cnvData) <- "data.frame"
60 | rownames(cnvData) <- cnvData[, 1]
61 | cnvData <- cnvData[, -c(1, 2, 3)]
62 | sampleGroup <- sample(c("A", "B"), ncol(cnvData), replace = TRUE)
63 | diffCnv <- differential_CNV(cnvData, sampleGroup)
64 | }
65 | # use random data as example
66 | aa <- matrix(sample(c(0, 1, -1), 200, replace = TRUE), 25, 8)
67 | rownames(aa) <- paste0("gene", 1:25)
68 | colnames(aa) <- paste0("sample", 1:8)
69 | sampleGroup <- sample(c("A", "B"), ncol(aa), replace = TRUE)
70 | diffCnv <- differential_CNV(aa, sampleGroup)
71 | }
72 |
--------------------------------------------------------------------------------
/man/differential_limma.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/Merge_methylation.R
3 | \name{differential_limma}
4 | \alias{differential_limma}
5 | \title{differential_limma}
6 | \usage{
7 | differential_limma(df, group, adjust.method = "BH")
8 | }
9 | \arguments{
10 | \item{df}{data.frame of the omic data}
11 |
12 | \item{group}{a vector, group of samples.}
13 |
14 | \item{adjust.method}{adjust.method.}
15 | }
16 | \value{
17 | data.frame
18 | }
19 | \description{
20 | differential_limma
21 | }
22 | \examples{
23 | df <- matrix(runif(200), 25, 8)
24 | df <- as.data.frame(df)
25 | rownames(df) <- paste0("gene", 1:25)
26 | colnames(df) <- paste0("sample", 1:8)
27 | group <- sample(c("group1", "group2"), 8, replace = TRUE)
28 | result <- differential_limma(df = df, group = group)
29 | }
30 |
--------------------------------------------------------------------------------
/man/differential_methy.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/Merge_methylation.R
3 | \name{differential_methy}
4 | \alias{differential_methy}
5 | \title{differential_methy}
6 | \usage{
7 | differential_methy(
8 | cpgData,
9 | sampleGroup,
10 | groupCol,
11 | combineMethod = "stouffer",
12 | missing_value = "knn",
13 | cpg2gene = NULL,
14 | normMethod = "PBC",
15 | region = "TSS1500",
16 | model = "gene",
17 | adjust.method = "BH",
18 | adjPvalCutoff = 0.05,
19 | ucscData = FALSE
20 | )
21 | }
22 | \arguments{
23 | \item{cpgData}{data.frame of cpg beta value, or SummarizedExperiment object}
24 |
25 | \item{sampleGroup}{vector of sample group}
26 |
27 | \item{groupCol}{group column}
28 |
29 | \item{combineMethod}{method to combine the cpg pvalues,
30 | a function or one of "stouffer", "fisher" and "rhoScores".}
31 |
32 | \item{missing_value}{Method to impute missing expression data,
33 | one of "zero" and "knn".}
34 |
35 | \item{cpg2gene}{data.frame to annotate cpg locus to gene}
36 |
37 | \item{normMethod}{Method to do normalization: "PBC" or "BMIQ".}
38 |
39 | \item{region}{region of genes, one of "Body", "TSS1500", "TSS200",
40 | "3'UTR", "1stExon", "5'UTR", and "IGR". Only used when cpg2gene is NULL.}
41 |
42 | \item{model}{if "cpg", step1: calculate difference cpgs;
43 | step2: calculate difference genes.
44 | if "gene", step1: calculate the methylation level of genes;
45 | step2: calculate difference genes.}
46 |
47 | \item{adjust.method}{character string specifying the method
48 | used to adjust p-values for multiple testing.
49 | See \link{p.adjust} for possible values.}
50 |
51 | \item{adjPvalCutoff}{adjusted pvalue cutoff}
52 |
53 | \item{ucscData}{Logical, whether the data comes from UCSC Xena.}
54 | }
55 | \value{
56 | data.frame
57 | }
58 | \description{
59 | Get methylation difference gene
60 | }
61 | \examples{
62 | \donttest{
63 | # use TCGAbiolinks data
64 | library(TCGAbiolinks)
65 | query <- GDCquery(project = "TCGA-ACC",
66 | data.category = "DNA Methylation",
67 | data.type = "Methylation Beta Value",
68 | platform = "Illumina Human Methylation 450")
69 | GDCdownload(query, method = "api", files.per.chunk = 5,
70 | directory = Your_Path)
71 | merge_result <- Merge_methy_tcga(Your_Path_to_DNA_Methylation_data)
72 | library(ChAMP) # To avoid reporting errors
73 | differential_gene <- differential_methy(cpgData = merge_result,
74 | sampleGroup = sample(c("C","T"),
75 | ncol(merge_result[[1]]), replace = TRUE))
76 | }
77 | # use user defined data
78 | library(ChAMP)
79 | cpgData <- matrix(runif(2000), nrow = 200, ncol = 10)
80 | rownames(cpgData) <- paste0("cpg", seq_len(200))
81 | colnames(cpgData) <- paste0("sample", seq_len(10))
82 | sampleGroup <- c(rep("group1", 5), rep("group2", 5))
83 | names(sampleGroup) <- colnames(cpgData)
84 | cpg2gene <- data.frame(cpg = rownames(cpgData),
85 | gene = rep(paste0("gene", seq_len(20)), 10))
86 | result <- differential_methy(cpgData, sampleGroup,
87 | cpg2gene = cpg2gene, normMethod = NULL)
88 | # use SummarizedExperiment object input
89 | library(ChAMP)
90 | cpgData <- matrix(runif(2000), nrow = 200, ncol = 10)
91 | rownames(cpgData) <- paste0("cpg", seq_len(200))
92 | colnames(cpgData) <- paste0("sample", seq_len(10))
93 | sampleGroup <- c(rep("group1", 5), rep("group2", 5))
94 | names(sampleGroup) <- colnames(cpgData)
95 | cpg2gene <- data.frame(cpg = rownames(cpgData),
96 | gene = rep(paste0("gene", seq_len(20)), 10))
97 | colData <- S4Vectors::DataFrame(
98 | row.names = colnames(cpgData),
99 | group = sampleGroup
100 | )
101 | data <- SummarizedExperiment::SummarizedExperiment(
102 | assays=S4Vectors::SimpleList(counts=cpgData),
103 | colData = colData)
104 | result <- differential_methy(cpgData = data,
105 | groupCol = "group", normMethod = NULL,
106 | cpg2gene = cpg2gene)
107 | }
108 |
--------------------------------------------------------------------------------
/man/fpkmToTpm.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/fpkm_count_conversion.r
3 | \name{fpkmToTpm}
4 | \alias{fpkmToTpm}
5 | \title{Convert fpkm to Tpm}
6 | \usage{
7 | fpkmToTpm(fpkm_matrix)
8 | }
9 | \arguments{
10 | \item{fpkm_matrix}{a matrix, colnames of fpkm_matrix are sample name,
11 | rownames of fpkm_matrix are genes}
12 | }
13 | \value{
14 | a matrix
15 | }
16 | \description{
17 | Convert fpkm to Tpm
18 | }
19 | \examples{
20 | lung_squ_count2 <- matrix(c(0.11, 0.22, 0.43, 0.14, 0.875,
21 | 0.66, 0.77, 0.18, 0.29), ncol = 3)
22 | rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
23 | colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
24 | result <- fpkmToTpm(lung_squ_count2)
25 | }
26 |
--------------------------------------------------------------------------------
/man/geneExpress.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{geneExpress}
5 | \alias{geneExpress}
6 | \title{a data.frame of gene expression data}
7 | \format{
8 | A data.frame with 10779 rows and 2 columns
9 | }
10 | \usage{
11 | geneExpress
12 | }
13 | \description{
14 | It is a randomly generated expression data
15 | used as an example of functions in this package.
16 | the rowname is gene symbols
17 | the columns are gene expression values
18 | }
19 | \keyword{datasets}
20 |
--------------------------------------------------------------------------------
/man/gene_ave.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/calculate_mean_profile.R
3 | \name{gene_ave}
4 | \alias{gene_ave}
5 | \title{Average the values of same genes in gene expression profile}
6 | \usage{
7 | gene_ave(file_gene_ave, k = 1)
8 | }
9 | \arguments{
10 | \item{file_gene_ave}{a data.frame of gene expression data,
11 | each column is a sample, and each row is a gene.}
12 |
13 | \item{k}{a number, indicates which is the gene column.}
14 | }
15 | \value{
16 | a data.frame, the values of same genes in gene expression profile
17 | }
18 | \description{
19 | Average the values of same genes in gene expression profile
20 | }
21 | \examples{
22 | aa <- c("MARCH1", "MARC1", "MARCH1", "MARCH1", "MARCH1")
23 | bb <- c(2.969058399, 4.722410064, 8.165514853, 8.24243893, 8.60815086)
24 | cc <- c(3.969058399, 5.722410064, 7.165514853, 6.24243893, 7.60815086)
25 | file_gene_ave <- data.frame(aa = aa, bb = bb, cc = cc)
26 | colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983")
27 |
28 | result <- gene_ave(file_gene_ave, 1)
29 | }
30 |
--------------------------------------------------------------------------------
/man/gene_cov.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{gene_cov}
5 | \alias{gene_cov}
6 | \title{a data.frame of gene length and GC content}
7 | \format{
8 | A data.frame with 27341 rows and 2 columns
9 | }
10 | \usage{
11 | gene_cov
12 | }
13 | \description{
14 | the gene length and GC content data comes from
15 | TxDb.Hsapiens.UCSC.hg38.knownGene and
16 | BSgenome.Hsapiens.UCSC.hg38
17 | }
18 | \keyword{datasets}
19 |
--------------------------------------------------------------------------------
/man/get_geo_array.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/arrayDiff.R
3 | \name{get_geo_array}
4 | \alias{get_geo_array}
5 | \title{Get Microarray matrix data from GEO}
6 | \usage{
7 | get_geo_array(gse)
8 | }
9 | \arguments{
10 | \item{gse}{GSE number, such as GSE781.}
11 | }
12 | \value{
13 | a list of matrix
14 | }
15 | \description{
16 | Get Microarray matrix data from GEO
17 | }
18 | \examples{
19 | \donttest{
20 | arraylist <- get_geo_array("GSE781")
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/man/id_conversion_TCGA.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/TCGA_id_conversion.R
3 | \name{id_conversion_TCGA}
4 | \alias{id_conversion_TCGA}
5 | \title{Convert ENSEMBL gene id to gene Symbol in TCGA}
6 | \usage{
7 | id_conversion_TCGA(profiles, toType = "SYMBOL")
8 | }
9 | \arguments{
10 | \item{profiles}{a data.frame of gene expression data,
11 | each column is a sample,
12 | and each row is a gene.}
13 |
14 | \item{toType}{one of 'keytypes(org.Hs.eg.db)'}
15 | }
16 | \value{
17 | a data.frame, gene symbols and their expression value
18 | }
19 | \description{
20 | Convert ENSEMBL gene id to gene Symbol in TCGA
21 | }
22 | \examples{
23 | library(org.Hs.eg.db)
24 | data(profile)
25 | result <- id_conversion_TCGA(profile)
26 | }
27 |
--------------------------------------------------------------------------------
/man/kegg_liver.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{kegg_liver}
5 | \alias{kegg_liver}
6 | \title{a matrix of gene expression data in TCGA}
7 | \format{
8 | A matrix with 100 rows and 150 columns
9 | }
10 | \usage{
11 | kegg_liver
12 | }
13 | \description{
14 | It is a randomly generated expression data
15 | used as an example of functions in this package.
16 | the first column represents the gene symbol
17 | }
18 | \details{
19 | the other columns represent the expression(count) of genes
20 | }
21 | \keyword{datasets}
22 |
--------------------------------------------------------------------------------
/man/module.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{module}
5 | \alias{module}
6 | \title{a matrix of module name, gene symbols, and the number of gene symbols}
7 | \format{
8 | A matrix with 176 rows and 3 columns
9 | }
10 | \usage{
11 | module
12 | }
13 | \description{
14 | It is a randomly generated expression data
15 | used as an example of functions in this package.
16 | }
17 | \keyword{datasets}
18 |
--------------------------------------------------------------------------------
/man/prepare_chi.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/prepareChi.r
3 | \name{prepare_chi}
4 | \alias{prepare_chi}
5 | \title{Prepare file for chi-square test}
6 | \usage{
7 | prepare_chi(cnv)
8 | }
9 | \arguments{
10 | \item{cnv}{result of ann_merge()}
11 | }
12 | \value{
13 | a matrix
14 | }
15 | \description{
16 | Prepare file for chi-square test
17 | }
18 | \examples{
19 | cnv <- matrix(c(
20 | -1.09150, -1.47120, -0.87050, -0.50880,
21 | -0.50880, 2.0, 2.0, 2.0, 2.0, 2.0, 2.601962, 2.621332, 2.621332,
22 | 2.621332, 2.621332, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0,
23 | 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0
24 | ), nrow = 5)
25 | cnv <- as.data.frame(cnv)
26 | rownames(cnv) <- c("AJAP1", "FHAD1", "CLCNKB", "CROCCP2", "AL137798.3")
27 | colnames(cnv) <- c(
28 | "TCGA-DD-A4NS-10A-01D-A30U-01", "TCGA-ED-A82E-01A-11D-A34Y-01",
29 | "TCGA-WQ-A9G7-01A-11D-A36W-01", "TCGA-DD-AADN-01A-11D-A40Q-01",
30 | "TCGA-ZS-A9CD-10A-01D-A36Z-01", "TCGA-DD-A1EB-11A-11D-A12Y-01"
31 | )
32 | cnv_chi_file <- prepare_chi(cnv)
33 | }
34 |
--------------------------------------------------------------------------------
/man/profile.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{profile}
5 | \alias{profile}
6 | \title{a matrix of gene expression data in TCGA}
7 | \format{
8 | A matrix with 10 rows and 10 columns
9 | }
10 | \usage{
11 | profile
12 | }
13 | \description{
14 | It is a randomly generated expression data
15 | used as an example of functions in this package.
16 | the first column represents the gene symbol
17 | }
18 | \details{
19 | the other columns represent the expression(FPKM) of genes
20 | }
21 | \keyword{datasets}
22 |
--------------------------------------------------------------------------------
/man/repAssign.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/rep.R
3 | \name{repAssign}
4 | \alias{repAssign}
5 | \title{Handle the case where one id corresponds to multiple genes}
6 | \usage{
7 | repAssign(input_file, string)
8 | }
9 | \arguments{
10 | \item{input_file}{input file, a data.frame or a matrix,
11 | the first column should be genes.}
12 |
13 | \item{string}{a string, sep of the gene}
14 | }
15 | \value{
16 | a data.frame, when an id corresponds to multiple genes,
17 | the expression value is assigned to each gene
18 | }
19 | \description{
20 | Handle the case where one id corresponds to multiple genes
21 | }
22 | \examples{
23 | aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3",
24 | "MARCH3 /// MARCH4", "MARCH1")
25 | bb <- c("2.969058399", "4.722410064", "8.165514853",
26 | "8.24243893", "8.60815086")
27 | cc <- c("3.969058399", "5.722410064", "7.165514853",
28 | "6.24243893", "7.60815086")
29 | input_file <- data.frame(aa = aa, bb = bb, cc = cc)
30 |
31 | repAssign_result <- repAssign(input_file, " /// ")
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/man/repRemove.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/rep.R
3 | \name{repRemove}
4 | \alias{repRemove}
5 | \title{Handle the case where one id corresponds to multiple genes}
6 | \usage{
7 | repRemove(input_file, string)
8 | }
9 | \arguments{
10 | \item{input_file}{input file, a data.frame or a matrix,
11 | the first column should be genes.}
12 |
13 | \item{string}{a string, sep of the gene}
14 | }
15 | \value{
16 | a data.frame, when an id corresponds to multiple genes,
17 | the expression value is deleted
18 | }
19 | \description{
20 | Handle the case where one id corresponds to multiple genes
21 | }
22 | \examples{
23 | aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3",
24 | "MARCH3 /// MARCH4", "MARCH1")
25 | bb <- c("2.969058399", "4.722410064", "8.165514853",
26 | "8.24243893", "8.60815086")
27 | cc <- c("3.969058399", "5.722410064", "7.165514853",
28 | "6.24243893", "7.60815086")
29 | input_file <- data.frame(aa = aa, bb = bb, cc = cc)
30 | repRemove_result <- repRemove(input_file, " /// ")
31 | }
32 |
--------------------------------------------------------------------------------
/man/ventricle.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/data.R
3 | \docType{data}
4 | \name{ventricle}
5 | \alias{ventricle}
6 | \title{a matrix of gene expression data in GEO}
7 | \format{
8 | A matrix with 32 rows and 20 columns
9 | }
10 | \usage{
11 | ventricle
12 | }
13 | \description{
14 | It is a randomly generated expression data
15 | used as an example of functions in this package.
16 | the first column represents the gene symbol
17 | }
18 | \details{
19 | the other columns represent the expression of genes
20 | }
21 | \keyword{datasets}
22 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | # This file is part of the standard setup for testthat.
2 | # It is recommended that you do not modify it.
3 | #
4 | # Where should you do additional test configuration?
5 | # Learn more about the roles of various files in:
6 | # * https://r-pkgs.org/tests.html
7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files
8 |
9 | library(testthat)
10 | library(GeoTcgaData)
11 |
12 | test_check("GeoTcgaData")
13 |
--------------------------------------------------------------------------------
/tests/testthat/fpkmToTpm.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example fpkmToTpm_matrix", {
2 | lung_squ_count2 <- matrix(c(0.11, 0.22, 0.43, 0.14, 0.875,
3 | 0.66, 0.77, 0.18, 0.29), ncol = 3)
4 | rownames(lung_squ_count2) <- c("DISC1", "TCOF1", "SPPL3")
5 | colnames(lung_squ_count2) <- c("sample1", "sample2", "sample3")
6 | result <- fpkmToTpm(lung_squ_count2)
7 | expect_equal(dim(lung_squ_count2), dim(result))
8 | })
9 |
--------------------------------------------------------------------------------
/tests/testthat/test_Merge_methy_tcga.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example Merge_methy_tcga", {
2 | merge_result <- Merge_methy_tcga(system.file(file.path("extdata", "methy"),
3 | package = "GeoTcgaData"))
4 | expect_equal(names(merge_result), c("methyResult", "cpg_info"))
5 | })
--------------------------------------------------------------------------------
/tests/testthat/test_arrayDiff.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example differential_array", {
2 | # skip_on_bioc()
3 | # random 25-gene x 8-sample matrix; the first four samples form group1
4 | expr_mat <- matrix(runif(200), nrow = 25, ncol = 8,
5 | dimnames = list(paste0("gene", 1:25), paste0("sample", 1:8)))
6 | sample_groups <- rep(c("group1", "group2"), each = 4)
7 | diff_table <- differential_array(df = expr_mat, group = sample_groups)
8 | expect_true("P.Value" %in% colnames(diff_table))
9 | })
10 |
--------------------------------------------------------------------------------
/tests/testthat/test_cal_mean_module.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example cal_mean_module", {
2 | # load both example datasets in a single data() call
3 | data(geneExpress, module)
4 | module_means <- cal_mean_module(geneExpress, module)
5 | expect_equal(ncol(module_means), ncol(geneExpress))
6 | })
--------------------------------------------------------------------------------
/tests/testthat/test_countToFpkm.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example countToFpkm_matrix", {
2 | data(gene_cov)
3 | # 3 genes x 3 samples of counts (stored as doubles), filled column by column
4 | count_mat <- matrix(
5 | as.numeric(1:9), ncol = 3,
6 | dimnames = list(c("DISC1", "TCOF1", "SPPL3"), paste0("sample", 1:3))
7 | )
8 | # the conversion result must keep the shape of the input matrix
9 | fpkm_mat <- countToFpkm(count_mat, keyType = "SYMBOL", gene_cov = gene_cov)
10 | expect_equal(dim(count_mat), dim(fpkm_mat))
11 | })
12 |
--------------------------------------------------------------------------------
/tests/testthat/test_countToTpm.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example countToTpm_matrix", {
2 | data(gene_cov)
3 | # 3 genes x 3 samples of counts (stored as doubles), filled column by column
4 | count_mat <- matrix(
5 | as.numeric(1:9), ncol = 3,
6 | dimnames = list(c("DISC1", "TCOF1", "SPPL3"), paste0("sample", 1:3))
7 | )
8 | # the conversion result must keep the shape of the input matrix
9 | tpm_mat <- countToTpm(count_mat, keyType = "SYMBOL", gene_cov = gene_cov)
10 | expect_equal(dim(count_mat), dim(tpm_mat))
11 | })
12 |
--------------------------------------------------------------------------------
/tests/testthat/test_diff_RNA_ucsc.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example diff_RNA_ucsc", {
2 | skip_on_cran()
3 | df <- as.data.frame(matrix(rnbinom(400, mu = 4, size = 10), 25, 16))
4 | rownames(df) <- paste0("gene", 1:25)
5 | colnames(df) <- paste0("sample", 1:16)
6 | df <- log2(df + 1)
7 | # shuffle a balanced labelling: independent draws could leave a group empty
8 | group <- sample(rep(c("group1", "group2"), each = 8))
9 | df <- cbind(rownames(df), df)
10 | result <- differential_RNA(counts = df, group = group,
11 | filte = FALSE, method = "limma", ucscData = TRUE)
12 | expect_true("P.Value" %in% colnames(result))
13 | })
14 |
--------------------------------------------------------------------------------
/tests/testthat/test_differential_CNV.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example diff_CNV", {
2 | aa <- matrix(sample(c(0, 1, -1), 200, replace = TRUE), 25, 8)
3 | dimnames(aa) <- list(paste0("gene", 1:25), paste0("sample", 1:8))
4 | # shuffle a balanced labelling: independent draws could leave a group empty
5 | sampleGroup <- sample(rep(c("A", "B"), length.out = ncol(aa)))
6 | diffCnv <- differential_CNV(aa, sampleGroup)
7 | expect_true("P.Value" %in% colnames(diffCnv))
8 | })
9 |
--------------------------------------------------------------------------------
/tests/testthat/test_differential_RNA.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example differential_RNA", {
2 | skip_on_cran()
3 | df <- as.data.frame(matrix(rnbinom(400, mu = 4, size = 10), 25, 16))
4 | rownames(df) <- paste0("gene", 1:25)
5 | colnames(df) <- paste0("sample", 1:16)
6 | # shuffle a balanced labelling: independent draws could leave a group empty
7 | group <- sample(rep(c("group1", "group2"), each = 8))
8 | result <- differential_RNA(counts = df, group = group,
9 | filte = FALSE, method = "Wilcoxon")
10 | expect_true("P.Value" %in% colnames(result))
11 | })
12 |
--------------------------------------------------------------------------------
/tests/testthat/test_differential_limma.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example Diff_limma", {
2 | df <- as.data.frame(matrix(runif(200), 25, 8))
3 | rownames(df) <- paste0("gene", 1:25)
4 | colnames(df) <- paste0("sample", 1:8)
5 | # shuffle a balanced labelling: independent draws could leave a group empty
6 | group <- sample(rep(c("group1", "group2"), each = 4))
7 | result <- differential_limma(df = df, group = group)
8 | expect_true("P.Value" %in% colnames(result))
9 | })
10 |
--------------------------------------------------------------------------------
/tests/testthat/test_gene_ave.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example gene_ave", {
2 | # MARCH1 occurs four times; the result should carry one row name per gene
3 | probe_expr <- data.frame(
4 | Gene = c("MARCH1", "MARC1", "MARCH1", "MARCH1", "MARCH1"),
5 | GSM1629982 = c(2.969058399, 4.722410064, 8.165514853, 8.24243893, 8.60815086),
6 | GSM1629983 = c(3.969058399, 5.722410064, 7.165514853, 6.24243893, 7.60815086)
7 | )
8 | averaged <- gene_ave(probe_expr, 1)
9 | expected_genes <- sort(unique(probe_expr[, 1]))
10 | expect_equal(expected_genes, sort(unique(rownames(averaged))))
11 | })
12 |
--------------------------------------------------------------------------------
/tests/testthat/test_prepare_chi.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example prepare_chi", {
2 | # 30 values -> 5 genes x 6 TCGA samples, filled column by column
3 | cnv_values <- c(
4 | -1.09150, -1.47120, -0.87050, -0.50880, -0.50880,
5 | rep(2.0, 5),
6 | 2.601962, rep(2.621332, 4),
7 | rep(2.0, 15)
8 | )
9 | cnv_df <- as.data.frame(matrix(cnv_values, nrow = 5))
10 | rownames(cnv_df) <- c("AJAP1", "FHAD1", "CLCNKB", "CROCCP2", "AL137798.3")
11 | colnames(cnv_df) <- c(
12 | "TCGA-DD-A4NS-10A-01D-A30U-01", "TCGA-ED-A82E-01A-11D-A34Y-01",
13 | "TCGA-WQ-A9G7-01A-11D-A36W-01", "TCGA-DD-AADN-01A-11D-A40Q-01",
14 | "TCGA-ZS-A9CD-10A-01D-A36Z-01", "TCGA-DD-A1EB-11A-11D-A12Y-01"
15 | )
16 | expect_true("normalCNV" %in% colnames(prepare_chi(cnv_df)))
17 | })
18 |
--------------------------------------------------------------------------------
/tests/testthat/test_repAssign.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example repAssign", {
2 | aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3",
3 | "MARCH3 /// MARCH4", "MARCH1")
4 | bb <- c("2.969058399", "4.722410064", "8.165514853",
5 | "8.24243893", "8.60815086")
6 | cc <- c("3.969058399", "5.722410064", "7.165514853",
7 | "6.24243893", "7.60815086")
8 | input_file <- data.frame(aa = aa, bb = bb, cc = cc)
9 | repAssign_result <- repAssign(input_file, " /// ")
10 | # `%in%` only compares whole elements, so look inside every gene label
11 | expect_false(any(grepl(" /// ", repAssign_result[, 1], fixed = TRUE)))
12 | })
13 |
--------------------------------------------------------------------------------
/tests/testthat/test_repRemove.R:
--------------------------------------------------------------------------------
1 | test_that("can parse example repRemove", {
2 | aa <- c("MARCH1 /// MMA", "MARC1", "MARCH2 /// MARCH3",
3 | "MARCH3 /// MARCH4", "MARCH1")
4 | bb <- c("2.969058399", "4.722410064", "8.165514853",
5 | "8.24243893", "8.60815086")
6 | cc <- c("3.969058399", "5.722410064", "7.165514853",
7 | "6.24243893", "7.60815086")
8 | repRemove_result <- repRemove(data.frame(aa = aa, bb = bb, cc = cc), " /// ")
9 | # `%in%` only compares whole elements, so look inside every gene label
10 | expect_false(any(grepl(" /// ", repRemove_result[, 1], fixed = TRUE)))
11 | })
12 |
--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 |
--------------------------------------------------------------------------------
/vignettes/GeoTcgaData.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "GeoTcgaData"
3 | output: rmarkdown::html_vignette
4 | vignette: >
5 | %\VignetteIndexEntry{GeoTcgaData}
6 | %\VignetteEngine{knitr::rmarkdown}
7 | %\VignetteEncoding{UTF-8}
8 | ---
9 |
10 | ```{r, include = FALSE}
11 | knitr::opts_chunk$set(
12 | collapse = TRUE,
13 | comment = "#>"
14 | )
15 | ```
16 | --------
17 |
18 | ## Authors
19 | Erqiang Hu
20 |
21 | Department of Bioinformatics, School of Basic Medical Sciences,
22 | Southern Medical University.
23 |
24 |
25 | ## Introduction
26 | GEO and TCGA provide us with a wealth of data, such as RNA-seq, DNA Methylation,
27 | single nucleotide Variation and Copy number variation data.
28 | It's easy to download data from TCGA using the gdc tool or `TCGAbiolinks`,
29 | and some software provides organized TCGA data, such as
30 | [UCSC Xena](http://xena.ucsc.edu/) , UCSCXenaTools, and
31 | [sangerbox](http://vip.sangerbox.com/), but processing these data into a format
32 | suitable for bioinformatics analysis requires more work. This R package was
33 | developed to handle these data.
34 |
35 | ```{r setup}
36 | library(GeoTcgaData)
37 | ```
38 |
39 | ## Example
40 |
41 | This is a basic example which shows you how to solve a common problem:
42 |
43 | ### RNA-seq data differential expression analysis
44 |
45 | It is convenient to use TCGAbiolinks or
46 | [`GDCRNATools`](https://bioconductor.org/packages/GDCRNATools/) to download
47 | and analyze gene expression data. `TCGAbiolinks` uses the `edgeR` package for
48 | differential expression analysis, while `GDCRNATools` implements the three most
49 | commonly used methods (limma, edgeR, and DESeq2) to identify differentially
50 | expressed genes (DEGs).
51 |
52 | Alicia Oshlack et al. claimed that unlike the chip data,
53 | the RNA-seq data had one [bias](https://pubmed.ncbi.nlm.nih.gov/20132535/)[1]:
54 | the larger the transcript length / mean read count , the more likely it was to
55 | be identified as a differential gene,
56 | while there was no such trend in the
57 | [chip data](https://pubmed.ncbi.nlm.nih.gov/19371405/)[2].
58 |
59 |
60 | However, when we use their chip data for difference analysis
61 | (using the limma package), we find that chip data has the same trend as
62 | RNA-seq data. And we also found this trend in the difference analysis results
63 | given by the data
64 | [authors](https://genome.cshlp.org/content/18/9/1509.long)[3].
65 |
66 |
67 |
68 | It is worth noting that only technical replicate data, which has small gene
69 | dispersions, shows this [bias](https://pubmed.ncbi.nlm.nih.gov/28545404/)[4].
70 | This is because in technical replicate RNA-seq data a long gene has more
71 | reads mapping to it compared to a short gene of similar expression,
72 | and most of the statistical methods used to detect differential expression
73 | have stronger detection ability for genes with more reads. However, we have
74 | not deduced why there is such a bias in the current difference
75 | analysis algorithms.
76 |
77 | Some software, such as [CQN](http://www.bioconductor.org/packages/cqn/),
78 | presents a
79 | [normalization algorithm](https://pubmed.ncbi.nlm.nih.gov/22285995/) [5]
80 | to correct systematic biases (gene length bias and
81 | [GC-content bias](https://pubmed.ncbi.nlm.nih.gov/22177264/)[6]).
82 | But they did not provide sufficient evidence to prove that the correction is
83 | effective. We use the
84 | [Marioni dataset](https://pubmed.ncbi.nlm.nih.gov/19371405/)[2] to verify the
85 | correction effect of CQN and find that there is still a deviation
86 | after correction:
87 |
88 |
89 |
90 | [GOseq](http://bioconductor.org/packages/goseq/) [1], based on
91 | Wallenius' noncentral hypergeometric distribution, can effectively correct the
92 | gene length deviation in enrichment analysis. However, current RNA-seq data
93 | often show no gene length bias but only an expression level (read count)
94 | bias, so GOseq may overcorrect these data, turning originally unbiased data
95 | into data with a reverse bias.
96 |
97 | GOseq also fails to correct for expression bias; therefore, read count bias
98 | correction is still a challenge for us.
99 |
100 | ```{r, message=FALSE, warning=FALSE}
101 | # use user-defined data: 25 genes x 16 samples of simulated counts
102 | df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16)
103 | df <- as.data.frame(df)
104 | rownames(df) <- paste0("gene", 1:25)
105 | colnames(df) <- paste0("sample", 1:16)
106 | # shuffle a balanced labelling so that both groups are always present
107 | group <- sample(rep(c("group1", "group2"), each = 8))
108 | result <- differential_RNA(counts = df, group = group,
109 | filte = FALSE, method = "Wilcoxon")
110 |
111 | # use SummarizedExperiment object input
112 | df <- matrix(rnbinom(400, mu = 4, size = 10), 25, 16)
113 | rownames(df) <- paste0("gene", 1:25)
114 | colnames(df) <- paste0("sample", 1:16)
115 | group <- sample(rep(c("group1", "group2"), each = 8))
116 |
117 | # per-sample annotation; its row names are taken from the count matrix
118 | # columns so the two objects line up
119 | colData <- S4Vectors::DataFrame(
120 | row.names = colnames(df),
121 | group = group
122 | )
123 | data <- SummarizedExperiment::SummarizedExperiment(
124 | assays=S4Vectors::SimpleList(counts=df),
125 | colData = colData)
126 |
127 | # groupCol = "group" refers to the colData column created above
128 | # (presumably read by differential_RNA to obtain the labels)
129 | result <- differential_RNA(counts = data, groupCol = "group",
130 | filte = FALSE, method = "Wilcoxon")
131 | ```
132 |
133 |
134 | ### DNA Methylation data integration
135 |
136 | Use `TCGAbiolinks` data.
137 |
138 | The code may need to be modified if `TCGAbiolinks` is updated,
139 | so please read its documentation.
140 |
141 | ```{r, message=FALSE, warning=FALSE}
142 | # use user-defined data: 200 x 10 matrix of uniform random values between 0 and 1
143 | library(ChAMP)
144 | cpgData <- matrix(runif(2000), nrow = 200, ncol = 10)
145 | rownames(cpgData) <- paste0("cpg", seq_len(200))
146 | colnames(cpgData) <- paste0("sample", seq_len(10))
147 | sampleGroup <- c(rep("group1", 5), rep("group2", 5))
148 | names(sampleGroup) <- colnames(cpgData) # label each group entry with its sample
149 | cpg2gene <- data.frame(cpg = rownames(cpgData), # pairs every gene with 10 CpGs
150 | gene = rep(paste0("gene", seq_len(20)), 10))
151 | result <- differential_methy(cpgData, sampleGroup,
152 | cpg2gene = cpg2gene, normMethod = NULL)
153 | # use SummarizedExperiment object input (data simulated the same way as above)
154 | library(ChAMP)
155 | cpgData <- matrix(runif(2000), nrow = 200, ncol = 10)
156 | rownames(cpgData) <- paste0("cpg", seq_len(200))
157 | colnames(cpgData) <- paste0("sample", seq_len(10))
158 | sampleGroup <- c(rep("group1", 5), rep("group2", 5))
159 | names(sampleGroup) <- colnames(cpgData)
160 | cpg2gene <- data.frame(cpg = rownames(cpgData),
161 | gene = rep(paste0("gene", seq_len(20)), 10))
162 | colData <- S4Vectors::DataFrame( # per-sample table; "group" is passed as groupCol below
163 | row.names = colnames(cpgData),
164 | group = sampleGroup
165 | )
166 | data <- SummarizedExperiment::SummarizedExperiment(
167 | assays=S4Vectors::SimpleList(counts=cpgData),
168 | colData = colData)
169 | result <- differential_methy(cpgData = data,
170 | groupCol = "group", normMethod = NULL,
171 | cpg2gene = cpg2gene)
172 | ```
173 | **Note:** `ChAMP` has a large number of dependent packages.
174 | If you cannot install it successfully, you can download each dependent package
175 | separately (source or binary) and install it locally.
176 |
177 | We provide two models to identify differentially methylated genes:
178 |
179 | if model = "cpg", step1: calculate the differentially methylated CpGs;
180 | step2: calculate the differentially methylated genes;
181 |
182 | if model = "gene", step1: calculate the methylation level of genes;
183 | step2: calculate the differentially methylated genes.
184 |
185 | We find that only model = "gene" is free of bias related to the number of CpGs.
186 |
187 |
188 | ### Copy number variation data integration and differential gene extraction
189 |
190 | Use TCGAbiolinks to download TCGA data (Gene Level Copy Number Scores).
191 |
192 | ```{r, message=FALSE, warning=FALSE}
193 | # use random data as example; labels are a shuffled, balanced A/B vector
194 | aa <- matrix(sample(c(0, 1, -1), 200, replace = TRUE), 25, 8)
195 | rownames(aa) <- paste0("gene", 1:25)
196 | colnames(aa) <- paste0("sample", 1:8)
197 | sampleGroup <- sample(rep(c("A", "B"), length.out = ncol(aa)))
198 | diffCnv <- differential_CNV(aa, sampleGroup)
199 | ```
200 |
201 |
202 |
203 | ### Difference analysis of single nucleotide Variation data
204 | We provide the `SNP_QC` function for quality control of SNP data.
205 | ```{r, message=FALSE, warning=FALSE}
206 | snpDf <- matrix(sample(c("AA", "Aa", "aa"), 100, replace = TRUE), 10, 10) # random 10 x 10 genotype table
207 | snpDf <- as.data.frame(snpDf)
208 | sampleGroup <- sample(c("A", "B"), 10, replace = TRUE) # not passed to SNP_QC() in this chunk
209 | result <- SNP_QC(snpDf)
210 | ```
211 |
212 | Then use differential_SNP to do differential analysis.
213 | ```{r, message=FALSE, warning=FALSE}
214 | snpDf <- matrix(sample(c("mutation", NA), 100, replace = TRUE), 10, 10)
215 | snpDf <- as.data.frame(snpDf)
216 | sampleGroup <- sample(rep(c("A", "B"), each = 5)) # balanced: both groups occur
217 | result <- differential_SNP(snpDf, sampleGroup)
218 | ```
219 |
220 |
221 | ### GEO chip data processing
222 | The function `gene_ave` could average the expression data of different
223 | ids for the same gene in the GEO chip data. For example:
224 |
225 | ```{r, message=FALSE, warning=FALSE}
226 | aa <- c("MARCH1","MARC1","MARCH1","MARCH1","MARCH1") # MARCH1 appears four times
227 | bb <- c(2.969058399,4.722410064,8.165514853,8.24243893,8.60815086)
228 | cc <- c(3.969058399,5.722410064,7.165514853,6.24243893,7.60815086)
229 | file_gene_ave <- data.frame(aa=aa,bb=bb,cc=cc)
230 | colnames(file_gene_ave) <- c("Gene", "GSM1629982", "GSM1629983") # gene column + two GSM columns
231 | result <- gene_ave(file_gene_ave, 1) # presumably 1 is the index of the gene column
232 | ```
233 |
234 | Multiple gene symbols may correspond to the same chip id. The function
235 | `repAssign` assigns the expression of this id to each of the genes,
236 | while the function `repRemove` deletes the expression. For example:
237 |
238 | ```{r}
239 | aa <- c("MARCH1 /// MMA","MARC1","MARCH2 /// MARCH3", # three labels join two symbols
240 | "MARCH3 /// MARCH4","MARCH1")
241 | bb <- c("2.969058399","4.722410064","8.165514853","8.24243893","8.60815086")
242 | cc <- c("3.969058399","5.722410064","7.165514853","6.24243893","7.60815086")
243 | input_file <- data.frame(aa=aa,bb=bb,cc=cc) # bb and cc are character columns here
244 | repAssign_result <- repAssign(input_file," /// ") # " /// " is the separator used in aa
245 | repRemove_result <- repRemove(input_file," /// ")
246 | ```
247 |
248 | ### Other downstream analyses
249 |
250 | 1. The function `id_conversion_TCGA` could convert ENSEMBL gene id to
251 | gene Symbol in TCGA. For example:
252 |
253 | ```{r, message=FALSE, warning=FALSE}
254 | data(profile)
255 | result <- id_conversion_TCGA(profile)
256 | ```
257 |
258 | The parameter `profile` is a data.frame or matrix of gene expression
259 | data in TCGA.
260 |
261 | **Note:** In previous versions(< 1.0.0) the `id_conversion` and
262 | `id_conversion_TCGA` used HGNC data to convert human gene id.
263 | In future versions, we will use `clusterProfiler::bitr` for ID conversion.
264 |
265 |
266 | 2. The function `countToFpkm` and `countToTpm` could convert
267 | count data to FPKM or TPM data.
268 |
269 | ```{r}
270 | data(gene_cov) # example dataset passed to countToFpkm() below
271 | lung_squ_count2 <- matrix(c(1,2,3,4,5,6,7,8,9),ncol=3) # 3 x 3, filled column by column
272 | rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3") # rows are labelled with gene symbols
273 | colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
274 | result <- countToFpkm(lung_squ_count2, keyType = "SYMBOL",
275 | gene_cov = gene_cov)
276 | ```
277 |
278 | ```{r, message=FALSE, warning=FALSE}
279 | data(gene_cov)
280 | # raw read counts (whole numbers), 3 genes x 3 samples
281 | lung_squ_count2 <- matrix(c(1,2,3,4,5,6,7,8,9),ncol=3)
282 | rownames(lung_squ_count2) <- c("DISC1","TCOF1","SPPL3")
283 | colnames(lung_squ_count2) <- c("sample1","sample2","sample3")
284 | result <- countToTpm(lung_squ_count2, keyType = "SYMBOL",
285 | gene_cov = gene_cov)
286 | ```
287 |
288 | ```{r}
289 | sessionInfo()
290 | ```
291 |
292 | ## References
293 | 1. Young MD, Wakefield MJ, Smyth GK, Oshlack A (2010) Gene ontology analysis
294 | for RNA-seq: accounting for selection bias. Genome Biol 11: R14.
295 | 2. Oshlack A, Wakefield MJ (2009) Transcript length bias in RNA-seq data
296 | confounds systems biology. Biol Direct 4: 14.
297 | 3. Marioni JC, Mason CE, Mane SM, Stephens M, Gilad Y (2008) RNA-seq: an
298 | assessment of technical reproducibility and comparison with gene expression
299 | arrays. Genome Res 18: 1509-1517.
300 | 4. Yoon S, Nam D (2017) Gene dispersion is the key determinant of the read
301 | count bias in differential expression analysis of RNA-seq data.
302 | BMC Genomics 18: 408.
303 | 5. Hansen KD, Irizarry RA, Wu Z (2012) Removing technical variability in
304 | RNA-seq data using conditional quantile normalization.
305 | Biostatistics 13: 204-216.
306 | 6. Risso D, Schwartz K, Sherlock G, Dudoit S (2011) GC-content normalization
307 | for RNA-Seq data. BMC Bioinformatics 12: 480.
308 |
309 |
--------------------------------------------------------------------------------