├── APSS.R ├── CITATION.cff ├── DetectAndSolve_debugged.cpp ├── README.md ├── illustration.bmp ├── mainbody_cpp_pos_precise.R ├── mainbody_cpp_rsid_precise.R ├── step2_pos.R └── step2_rsid.R /APSS.R: -------------------------------------------------------------------------------- 1 | APSS <- function(directory.working, filename, auto = FALSE, do.return = FALSE, BIG = 2) { 2 | # Packages 3 | suppressMessages(library(data.table)) 4 | suppressMessages(library(magrittr)) 5 | suppressMessages(library(R.utils)) 6 | suppressMessages(library(stringr)) 7 | 8 | # Length of separator 9 | length.separator <- 120 10 | separator <- rep("=", length.separator) %>% paste0(., collapse = "") 11 | 12 | # Set working directory 13 | setwd(directory.working) 14 | 15 | # Read the input file 16 | check.BIG <- FALSE 17 | if (file.size(filename) >= BIG * 1024 * 1024 * 1024) { 18 | check.BIG <- TRUE 19 | cat(paste0("Input dataset is VERY VERY VERY BIG. We have to be selective :-<<<"), sep = "\n") 20 | cat(separator, sep = "\n") 21 | raw.try <- fread(filename, showProgress = FALSE, nrows = 100) %>% as.data.frame() 22 | 23 | # Print the head of the input dataset 24 | cat("Following are the first few lines of the dataset:", sep = "\n") 25 | cat(separator, sep = "\n") 26 | print(head(raw.try)) 27 | cat(separator, sep = "\n") 28 | 29 | # Ask user to pick some columns 30 | cat("Press ENTER to skip a column! Write something if you want to keep a column. ", sep = "\n") 31 | cat(separator, sep = "\n") 32 | keep <- character(ncol(raw.try)) 33 | for (index in 1:ncol(raw.try)) { 34 | keep[index] <- readline(prompt = paste0("Keep ", colnames(raw.try)[index], "? ")) 35 | } 36 | cat(separator, sep = "\n") 37 | keep <- which(keep != "") 38 | 39 | rm(raw.try) 40 | 41 | raw <- fread(filename, showProgress = FALSE, select = keep, fill = TRUE) %>% as.data.frame() 42 | } else { 43 | raw <- fread(filename, showProgress = FALSE, fill = TRUE) %>% as.data.frame() 44 | } 45 | 46 | if (check.BIG) { 47 | cat(paste0(nrow(raw), " lines read from the selected raw summary statistics file!"), sep = "\n") 48 | } else { 49 | cat(paste0(nrow(raw), " lines read from the raw summary statistics file!"), sep = "\n") 50 | } 51 | cat(separator, sep = "\n") 52 | 53 | # Get the header 54 | header.inner <- colnames(raw) 55 | 56 | # Auto mode or not? 57 | if ("V1" %in% header.inner | "V2" %in% header.inner | "V3" %in% header.inner & auto) { 58 | cat("I don't see any header! Switch to interactive mode!", sep = "\n") 59 | cat(separator, sep = "\n") 60 | } 61 | 62 | # Start interacting 63 | if (auto == FALSE) { 64 | # Print the head of the input data set 65 | cat("Following are the first few lines of the dataset:", sep = "\n") 66 | cat(separator, sep = "\n") 67 | print(head(raw)) 68 | cat(separator, sep = "\n") 69 | 70 | # Ask user for reasonable column names 71 | cat("Feel free to input whatever column names you want as long as they are unique! Press ENTER to skip a column!", sep = "\n") 72 | for (index in 1:ncol(raw)) { 73 | header.inner[index] <- readline(prompt = paste0("What would be a proper column name for ", colnames(raw)[index], "? ")) 74 | } 75 | cat(separator, sep = "\n") 76 | } 77 | 78 | # Make a copy of the desired header 79 | header.outer <- header.inner 80 | 81 | ###################### 82 | # Process the header # 83 | ###################### 84 | 85 | # Initialize the header 86 | header.inner <- tolower(header.inner) 87 | 88 | # SNP 89 | try.snp <- c("snp", "markername", "snpid", "rs", "rsid", "rs_number", "snps") 90 | header.inner[header.inner %in% try.snp] <- "SNP" 91 | 92 | # A1 93 | try.a1 <- c("a1", "allele1", "allele_1", "effect_allele", "reference_allele", "inc_allele", "ea", "ref", "a1lele1", "al1ele1") 94 | header.inner[header.inner %in% try.a1] <- "A1" 95 | 96 | # A2 97 | try.a2 <- c("a2", "allele2", "allele_2", "other_allele", "non_effect_allele", "dec_allele", "nea", "alt", "a0") 98 | header.inner[header.inner %in% try.a2] <- "A2" 99 | 100 | # Z-score 101 | try.z <- c("zscore", "z-score", "gc_zscore", "z") 102 | header.inner[header.inner %in% try.z] <- "Z" 103 | 104 | # P 105 | try.p <- c("pvalue", "p_value", "pval", "p_val", "gc_pvalue", "p") 106 | header.inner[header.inner %in% try.p] <- "P" 107 | 108 | # Beta 109 | try.beta <- c("b", "beta", "effects", "effect") 110 | header.inner[header.inner %in% try.beta] <- "BETA" 111 | 112 | # Odds ratio 113 | try.or <- c("or") 114 | header.inner[header.inner %in% try.or] <- "ODDS_RATIO" 115 | 116 | # Log odds 117 | try.logodds <- c("log_odds", "logor", "log_or") 118 | header.inner[header.inner %in% try.logodds] <- "LOG_ODDS" 119 | 120 | # MAF 121 | try.maf <- c("eaf", "frq", "maf", "frq_u", "f_u", "freq") 122 | header.inner[header.inner %in% try.maf] <- "MAF" 123 | 124 | # INFO 125 | try.info <- c("info", "info_score") 126 | header.inner[header.inner %in% try.info] <- "INFO" 127 | 128 | # Chromosome 129 | try.chromosome <- c("chrom", "ch", "chr", "chromosome") 130 | header.inner[header.inner %in% try.chromosome] <- "CHROMOSOME" 131 | 132 | # Position 133 | try.position <- c("pos", "posit", "position", "bp", "bpos") 134 | header.inner[header.inner %in% try.position] <- "POSITION" 135 | 136 | # Standard error 137 | try.se <- c("se", "sebeta", "beta_se") 138 | header.inner[header.inner %in% try.se] <- "SE" 139 | 140 | # CHR plus POSITION 141 | try.chrpos <- c("chr:pos", "chr_pos", "chr-pos", "chrpos") 142 | header.inner[header.inner %in% try.chrpos] <- "CHR_POS" 143 | 144 | # Samplesize 145 | try.samplesize <- c("n", "samplesize", "num_samples", "sample") 146 | header.inner[header.inner %in% try.samplesize] <- "SAMPLESIZE" 147 | 148 | # Update the header 149 | colnames(raw) <- header.inner 150 | 151 | # Drop some rows 152 | n.start <- ncol(raw) 153 | 154 | raw <- raw[, which(colnames(raw) != "")] 155 | header.outer <- header.outer[header.outer != ""] 156 | header.inner <- header.inner[header.inner != ""] 157 | 158 | n.end <- ncol(raw) 159 | cat(paste0(n.start - n.end, " columns were dropped from the input dataset!"), sep = "\n") 160 | cat(separator, sep = "\n") 161 | 162 | # Double-check the class of each column and coerce if needed 163 | list.coerce <- c("Z", "P", "BETA", "ODDS_RATIO", "LOG_ODDS", "MAF", "INFO", "SE") 164 | 165 | options(warn = -1) 166 | if ("POSITION" %in% header.inner) { 167 | if (class(raw$POSITION) == "character") { 168 | class(raw$POSITION) <- "integer" 169 | cat(paste0("Column POSITION has wrong class and has been coerced to integer."), sep = "\n") 170 | cat(separator, sep = "\n") 171 | } 172 | } 173 | 174 | for (i in 1:length(header.inner)) { 175 | if (header.inner[i] %in% list.coerce) { 176 | if (class(raw[, header.inner[i]]) != "numeric") { 177 | class(raw[, header.inner[i]]) <- "numeric" 178 | cat(paste0("Column ", header.inner[i], " has wrong class and has been coerced to numeric."), sep = "\n") 179 | cat(separator, sep = "\n") 180 | } 181 | } 182 | } 183 | options(warn = 0) 184 | 185 | # Drop rows with missing values 186 | do.missing <- readline(prompt = "Drop rows with missing values? Y/N ") 187 | 188 | if (do.missing == "Y") { 189 | n.start <- nrow(raw) 190 | 191 | raw <- raw[complete.cases(raw), ] 192 | 193 | n.end <- nrow(raw) 194 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 195 | 196 | if ((n.start - n.end) / n.start >= 0.8) { 197 | cat(paste0(round((n.start - n.end) / n.start * 100, digits = 2), "% of rows were removed!"), sep = "\n") 198 | cat("Please check the dataset manually because I am sensing something fishy.") 199 | Sys.sleep(5) 200 | .Internal(.invokeRestart(list(NULL, NULL), NULL)) 201 | } 202 | cat(separator, sep = "\n") 203 | } 204 | 205 | # Split CHR_POS 206 | if ("CHR_POS" %in% header.inner) { 207 | do.split <- readline(prompt = paste0("Do you want me to split the ID column? (This could take a while.) Y/N ")) %>% toupper() 208 | cat(separator, sep = "\n") 209 | if (do.split == "Y") { 210 | split.temp <- str_split(raw$CHR_POS, ":") %>% unlist() 211 | if (sum(grepl("chr", raw$CHR_POS)) != 0) { 212 | split.temp[2 * (1:nrow(raw)) - 1] <- gsub("chr", "", split.temp[2 * (1:nrow(raw)) - 1]) 213 | } 214 | 215 | split.temp <- as.numeric(split.temp) 216 | split.temp <- matrix(split.temp, nrow = nrow(raw), ncol = 2, byrow = TRUE) %>% as.data.frame() 217 | names(split.temp) <- c("V1", "V2") 218 | 219 | header.inner <- c(header.inner, "CHROMOSOME", "POSITION") 220 | raw["CHROMOSOME"] <- split.temp[, 1] 221 | raw["POSITION"] <- split.temp[, 2] 222 | 223 | # Print the head of split.temp 224 | cat("Following are the first few lines of the splitted column:", sep = "\n") 225 | cat(separator, sep = "\n") 226 | print(head(split.temp)) 227 | cat(separator, sep = "\n") 228 | 229 | # Ask user for reasonable column names 230 | cat("Feel free to input whatever column names you want as long as they are unique! Press ENTER to skip a column!", sep = "\n") 231 | for (index in 1:ncol(split.temp)) { 232 | header.outer[length(header.outer) + 1] <- readline(prompt = paste0("What would be a proper column name for ", colnames(split.temp)[index], "? ")) 233 | } 234 | cat(separator, sep = "\n") 235 | } 236 | } 237 | 238 | # Problematic rsid 239 | test.1 <- grepl(":", raw$SNP) 240 | test.2 <- grepl("_", raw$SNP) 241 | test.3 <- grepl("chr", raw$SNP) 242 | if (sum(test.1 & test.3) == nrow(raw)) { 243 | cat("I noticed a pattern (chrX:position) in SNPs' IDs!", sep = "\n") 244 | do.split <- readline(prompt = paste0("Do you want me to split the ID column? (This could take a while.) Y/N ")) 245 | cat(separator, sep = "\n") 246 | if (do.split == "Y") { 247 | split.temp <- str_split(raw$SNP, ":") %>% unlist() 248 | split.temp[2 * (1:nrow(raw)) - 1] <- gsub("chr", "", split.temp[2 * (1:nrow(raw)) - 1]) 249 | 250 | split.temp <- as.numeric(split.temp) 251 | split.temp <- matrix(split.temp, nrow = nrow(raw), ncol = 2, byrow = TRUE) %>% as.data.frame() 252 | names(split.temp) <- c("V1", "V2") 253 | 254 | header.inner <- c(header.inner, "CHROMOSOME", "POSITION") 255 | raw["CHROMOSOME"] <- split.temp[, 1] 256 | raw["POSITION"] <- split.temp[, 2] 257 | 258 | # Print the head of split.temp 259 | cat("Following are the first few lines of the splitted column:", sep = "\n") 260 | cat(separator, sep = "\n") 261 | print(head(split.temp)) 262 | cat(separator, sep = "\n") 263 | 264 | # Ask user for reasonable column names 265 | cat("Feel free to input whatever column names you want as long as they are unique! Press ENTER to skip a column!", sep = "\n") 266 | for (index in 1:ncol(split.temp)) { 267 | header.outer[length(header.outer) + 1] <- readline(prompt = paste0("What would be a proper column name for ", colnames(split.temp)[index], "? ")) 268 | } 269 | cat(separator, sep = "\n") 270 | } 271 | } 272 | 273 | if (sum(test.1) != 0) { 274 | cat(paste0(sum(test.1), " SNPs have ':' in their IDs. Please be careful!"), sep = "\n") 275 | cat(separator, sep = "\n") 276 | } 277 | 278 | if (sum(test.2) != 0) { 279 | cat(paste0(sum(test.2), " SNPs have '_' in their IDs. Please be careful!"), sep = "\n") 280 | cat(separator, sep = "\n") 281 | } 282 | 283 | # Compare raw sumstats with the "list" (if provided) 284 | do.hapmap3 <- readline(prompt = paste0("Subset the dataset by HapMap3? Y/N ")) 285 | cat(separator, sep = "\n") 286 | 287 | if (do.hapmap3 == "Y") { 288 | if (!file.exists("w_hm3.snplist.bz2")) { 289 | cat("I can't find HapMap3 list file and will try to download one from the Internet.", sep = "\n") 290 | cat(separator, sep = "\n") 291 | download.file("https://data.broadinstitute.org/alkesgroup/LDSCORE/w_hm3.snplist.bz2", destfile = "w_hm3.snplist.bz2", quiet = TRUE) 292 | } 293 | list.hapmap3 <- as.data.frame(fread("w_hm3.snplist.bz2")) 294 | 295 | n.start <- nrow(raw) 296 | 297 | raw <- raw[raw$SNP %in% list.hapmap3$SNP, ] 298 | 299 | n.end <- nrow(raw) 300 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 301 | cat(separator, sep = "\n") 302 | } 303 | 304 | # Drop rows with MAF<0.01 305 | if ("MAF" %in% header.inner) { 306 | do.maf <- readline(prompt = paste0("Drop rows with MAF<=0.01? Y/N ")) 307 | if (do.maf == "Y") { 308 | n.start <- nrow(raw) 309 | 310 | raw <- raw[raw$MAF >= 0.01,] 311 | 312 | n.end <- nrow(raw) 313 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 314 | cat(separator, sep = "\n") 315 | } else { 316 | cat(separator, sep = "\n") 317 | } 318 | } 319 | 320 | # Drop rows with INFO<0.9 321 | if ("INFO" %in% header.inner) { 322 | do.info <- readline(prompt = paste0("Drop rows with INFO <= 0.9? Y/N ")) 323 | if (do.info == "Y") { 324 | n.start <- nrow(raw) 325 | 326 | raw <- raw[raw$INFO >= 0.9,] 327 | 328 | n.end <- nrow(raw) 329 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 330 | cat(separator, sep = "\n") 331 | } else { 332 | cat(separator, sep = "\n") 333 | } 334 | } 335 | 336 | # Drop rows not on chromosome 1-22 337 | if ("CHROMOSOME" %in% header.inner) { 338 | raw$CHROMOSOME <- as.character(raw$CHROMOSOME) 339 | if (length(union(unique(raw$CHROMOSOME), as.character(1:22))) != 22) { 340 | do.dropXY <- readline(prompt = paste0("Drop SNPs on chromosome X and Y? Y/N ")) 341 | if (do.dropXY == "Y") { 342 | n.start <- nrow(raw) 343 | raw <- raw[raw$CHROMOSOME %in% as.character(1:22), ] 344 | n.end <- nrow(raw) 345 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 346 | cat(separator, sep = "\n") 347 | } 348 | } 349 | } 350 | 351 | # Drop rows for being on the MHC region 352 | if ("CHROMOSOME" %in% header.inner & "POSITION" %in% header.inner) { 353 | do.MHC <- readline(prompt = paste0("Drop rows on MHC region? Y/N ")) 354 | if (do.MHC == "Y") { 355 | n.start <- nrow(raw) 356 | raw$CHROMOSOME <- as.character(raw$CHROMOSOME) 357 | raw <- raw[!(raw$CHROMOSOME == "6" & raw$POSITION >= 26000000 & raw$POSITION <= 34000000),] 358 | n.end <- nrow(raw) 359 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 360 | cat(separator, sep = "\n") 361 | } else { 362 | cat(separator, sep = "\n") 363 | } 364 | } 365 | 366 | # Transform P into usual range (0,1) 367 | if ("P" %in% header.inner) { 368 | if (max(raw$P) > 1 & min(raw$P) < 0) { 369 | do.transform <- readline(prompt = paste0("Transform P into usual range (0,1)? Y/N ")) 370 | if (do.transform == "Y") { 371 | raw <- transform(raw, P = 10 ^ (-P)) 372 | cat(separator, sep = "\n") 373 | } else { 374 | cat(separator, sep = "\n") 375 | } 376 | } 377 | } 378 | 379 | # Process A1 and A2 380 | if ("A1" %in% header.inner & "A2" %in% header.inner) { 381 | # MAKE A1 AND A2 CAPS 382 | raw$A1 <- toupper(raw$A1) 383 | raw$A2 <- toupper(raw$A2) 384 | 385 | # Erroneous A1/A2 386 | do.error <- readline(prompt = paste0("Drop rows with erroneous A1/A2 ? Y/N ")) 387 | if (do.error == "Y") { 388 | n.start <- nrow(raw) 389 | 390 | raw <- raw[raw$A1 == 'A' | raw$A1 == 'C' | raw$A1 == 'T' | raw$A1 == 'G', ] 391 | raw <- raw[raw$A2 == 'A' | raw$A2 == 'C' | raw$A2 == 'T' | raw$A2 == 'G', ] 392 | 393 | n.end <- nrow(raw) 394 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 395 | cat(separator, sep = "\n") 396 | } else { 397 | cat(separator, sep = "\n") 398 | } 399 | 400 | # Ambigious SNPs 401 | do.ambi <- readline(prompt = paste0("Drop ambigious SNPs? Y/N ")) 402 | if (do.ambi == "Y") { 403 | n.start <- nrow(raw) 404 | 405 | raw <- raw[!((raw$A1 == 'A' & raw$A2 == 'T') | (raw$A1 == 'T' & raw$A2 == 'A') | (raw$A1 == 'C' & raw$A2 == 'G') | (raw$A1 == 'G' & raw$A2 == 'C')),] 406 | 407 | n.end <- nrow(raw) 408 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 409 | cat(separator, sep = "\n") 410 | } else { 411 | cat(separator, sep = "\n") 412 | } 413 | } else { 414 | cat("WARNING: There might be some issue with your settings or the data itself! Because I didn't find any A1/A2 columns!", sep = "\n") 415 | } 416 | 417 | # Missing z-score? 418 | calculate.z <- FALSE 419 | if (!("Z" %in% header.inner)) { 420 | do.z <- readline(prompt = paste0("No z-score column is found. Do you want me to create one? Y/N ")) 421 | if (do.z == "Y") { 422 | if ("BETA" %in% header.inner & "SE" %in% header.inner) { 423 | raw["Z"] <- raw$BETA / raw$SE 424 | calculate.z <- TRUE 425 | } else if ("ODDS_RATIO" %in% header.inner & "SE" %in% header.inner) { 426 | raw["Z"] <- log(raw$ODDS_RATIO) / raw$SE 427 | calculate.z <- TRUE 428 | } else if ("LOG_ODDS" %in% header.inner & "SE" %in% header.inner) { 429 | raw["Z"] <- raw$LOG_ODDS / raw$SE 430 | calculate.z <- TRUE 431 | } else if ("BETA" %in% header.inner & "P" %in% header.inner) { 432 | raw["Z"] <- sign(raw$BETA) * abs(qnorm(raw$P / 2)) 433 | calculate.z <- TRUE 434 | } else if ("ODDS_RATIO" %in% header.inner & "P" %in% header.inner) { 435 | raw["Z"] <- sign(log(raw$ODDS_RATIO)) * abs(qnorm(raw$P / 2)) 436 | calculate.z <- TRUE 437 | } else if ("LOG_ODDS" %in% header.inner & "P" %in% header.inner) { 438 | raw["Z"] <- sign(raw$ODDS_RATIO) * abs(qnorm(raw$P / 2)) 439 | calculate.z <- TRUE 440 | } else { 441 | cat("I can't calculate z-score based on the information I have. SAD FACE EMOJI.", sep = "\n") 442 | } 443 | 444 | if (sum(is.na(raw$Z)) != 0) { 445 | n.start <- nrow(raw) 446 | raw <- raw[!is.na(raw$Z),] 447 | n.end <- nrow(raw) 448 | cat(paste0(n.start - n.end, " rows removed for having invalid z-score!"), sep = "\n") 449 | } 450 | } 451 | cat(separator, sep = "\n") 452 | } 453 | 454 | # Calculate chi^2 and sieve out observations with chi^2 larger than 80 455 | if ("P" %in% header.inner) { 456 | do.chi2 <- readline(prompt = paste0("Sieve out observations with chi^2 larger than 80? Y/N ")) 457 | if (do.chi2 == "Y") { 458 | n.start <- nrow(raw) 459 | 460 | raw <- raw[qchisq(raw$P, 1, lower.tail = FALSE) <= 80,] 461 | 462 | n.end <- nrow(raw) 463 | cat(paste0(n.start - n.end, " rows removed!"), sep = "\n") 464 | cat(separator, sep = "\n") 465 | } else { 466 | cat(separator, sep = "\n") 467 | } 468 | } 469 | 470 | # Update the header 471 | if (calculate.z) { 472 | header.inner <- c(header.inner, "Z") 473 | header.outer <- c(header.outer, "Z") 474 | } 475 | colnames(raw) <- header.outer 476 | 477 | # Display output 478 | cat("Now take a final look at the output!", sep = "\n") 479 | print(head(raw)) 480 | 481 | cat(separator, sep = "\n") 482 | 483 | # Output 484 | do.output <- readline(prompt = "Do you want to write the processed summary statistics to your hard drive? Y/N ") %>% toupper() 485 | if (do.output == "Y") { 486 | char.write <- readline(prompt = "Please input your desired filename (with extension and if no input is given, a default name will be assigned):") 487 | if (char.write == "") { 488 | char.write <- paste0(filename, ".sumstats") 489 | } 490 | fwrite(raw, file = char.write, sep = " ", row.names = FALSE, quote = FALSE) 491 | 492 | CHROMOSOME <- raw[, header.inner == "CHROMOSOME"] 493 | for (i in unique(CHROMOSOME)) { 494 | fwrite(raw[CHROMOSOME == i, ], file = paste0(filename, "-", i, ".sumstats"), sep = " ", row.names = FALSE, quote = FALSE) 495 | } 496 | } 497 | 498 | # Return? 499 | if (do.return) { 500 | return(raw) 501 | } 502 | } 503 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: "Zhang" 5 | given-names: "Zichen" 6 | - family-names: "Wu" 7 | given-names: "Chong" 8 | title: "SUMMIT" 9 | version: 1.0.2 10 | doi: 10.5281/zenodo.7034435 11 | date-released: 2022-08-28 12 | url: "https://github.com/ChongWuLab/SUMMIT/" 13 | -------------------------------------------------------------------------------- /DetectAndSolve_debugged.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace arma; 7 | using namespace Rcpp; 8 | 9 | // [[Rcpp::depends(RcppArmadillo)]] 10 | // [[Rcpp::export]] 11 | 12 | int MCPDetect(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold, int index_lambda) 13 | { 14 | const int n_var = r.size(); 15 | double lambda_used, z, dlx; 16 | 17 | arma::mat temp(1, 1); 18 | 19 | arma::vec beta; 20 | beta.zeros(n_var); 21 | 22 | arma::vec beta_current; 23 | beta_current.zeros(n_var); 24 | 25 | int problem = 1; 26 | 27 | index_lambda = index_lambda - 1; 28 | lambda_used = lambda(index_lambda); 29 | 30 | for (int j = 0; j < max_iteration; j++) 31 | { 32 | for (int q = 0; q < n_var; q++) 33 | { 34 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 35 | z = temp(0, 0); 36 | if (beta_current(q) <= gamma * lambda_used) 37 | { 38 | if (z > lambda_used) 39 | { 40 | beta_current(q) = (z - lambda_used) / (1 + s - 1 / gamma); 41 | } 42 | else if (z < -lambda_used) 43 | { 44 | beta_current(q) = (z + lambda_used) / (1 + s - 1 / gamma); 45 | } 46 | else 47 | { 48 | beta_current(q) = 0.0; 49 | } 50 | } 51 | else 52 | { 53 | beta_current(q) = z / (1 + s); 54 | } 55 | } 56 | 57 | dlx = arma::norm(beta_current, 2); 58 | 59 | if (dlx > threshold) 60 | { 61 | problem = 2; 62 | break; 63 | } 64 | 65 | beta = beta_current; 66 | } 67 | 68 | return (problem); 69 | } 70 | 71 | // [[Rcpp::depends(RcppArmadillo)]] 72 | // [[Rcpp::export]] 73 | 74 | arma::mat MCP(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold) 75 | { 76 | const int n_var = r.size(); 77 | const int n_lambda = lambda.size(); 78 | double lambda_used, z, dlx, dlx2; 79 | 80 | arma::mat res(n_var, n_lambda); 81 | res.fill(0); 82 | 83 | arma::mat temp(1, 1); 84 | 85 | arma::vec beta; 86 | beta.zeros(n_var); 87 | 88 | arma::vec beta_current; 89 | beta_current.zeros(n_var); 90 | 91 | int problem = 0; 92 | 93 | for (int i = 0; i < n_lambda; i++) 94 | { 95 | beta_current = beta; 96 | lambda_used = lambda(i); 97 | for (int j = 0; j < max_iteration; j++) 98 | { 99 | for (int q = 0; q < n_var; q++) 100 | { 101 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 102 | z = temp(0, 0); 103 | if (beta_current(q) <= gamma * lambda_used) 104 | { 105 | if (z > lambda_used) 106 | { 107 | beta_current(q) = (z - lambda_used) / (1 + s - 1 / gamma); 108 | } 109 | else if (z < -lambda_used) 110 | { 111 | beta_current(q) = (z + lambda_used) / (1 + s - 1 / gamma); 112 | } 113 | else 114 | { 115 | beta_current(q) = 0.0; 116 | } 117 | } 118 | else 119 | { 120 | beta_current(q) = z / (1 + s); 121 | } 122 | } 123 | 124 | arma::vec betaDiff = abs(beta_current - beta); 125 | dlx = betaDiff.max(); 126 | dlx2 = arma::norm(beta_current, 2); 127 | 128 | if (dlx < threshold) 129 | { 130 | break; 131 | } 132 | 133 | if (dlx2 > 100.0) 134 | { 135 | problem = 1; 136 | break; 137 | } 138 | 139 | beta = beta_current; 140 | } 141 | 142 | if (problem == 1) 143 | { 144 | break; 145 | } 146 | 147 | res.col(i) = beta_current; 148 | } 149 | 150 | return (res); 151 | } 152 | 153 | // [[Rcpp::depends(RcppArmadillo)]] 154 | // [[Rcpp::export]] 155 | int ElNetDetect(const arma::vec &r, const arma::mat &LD, const arma::vec lambda, const double s, const double alpha, const int max_iteration, const double threshold, int index_lambda) 156 | { 157 | const int n_var = r.size(); 158 | double z, lambda_used, dlx; 159 | 160 | arma::mat temp(1, 1); 161 | 162 | arma::vec beta; 163 | beta.zeros(n_var); 164 | 165 | arma::vec beta_current; 166 | beta_current.zeros(n_var); 167 | 168 | int problem = 1; 169 | 170 | beta_current = beta; 171 | index_lambda = index_lambda - 1; 172 | lambda_used = lambda(index_lambda); 173 | 174 | for (int j = 0; j < max_iteration; j++) 175 | { 176 | for (int q = 0; q < n_var; q++) 177 | { 178 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 179 | z = temp(0, 0); 180 | if (z > alpha * lambda_used) 181 | { 182 | beta_current(q) = (z - alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha)); 183 | } 184 | else if (z < -alpha * lambda_used) 185 | { 186 | beta_current(q) = (z + alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha)); 187 | } 188 | else 189 | { 190 | beta_current(q) = 0.0; 191 | } 192 | } 193 | 194 | dlx = arma::norm(beta_current, 2); 195 | 196 | if (dlx > threshold) 197 | { 198 | problem = 2; 199 | break; 200 | } 201 | 202 | beta = beta_current; 203 | } 204 | 205 | return (problem); 206 | } 207 | 208 | // [[Rcpp::depends(RcppArmadillo)]] 209 | // [[Rcpp::export]] 210 | arma::mat ElNet(const arma::vec &r, const arma::mat &LD, const arma::vec lambda, const double s, const double alpha, const int max_iteration, const double threshold) 211 | { 212 | const int n_var = r.size(); 213 | const int n_lambda = lambda.size(); 214 | double z, lambda_used, dlx, dlx2; 215 | 216 | arma::mat res(n_var, n_lambda); 217 | res.fill(0); 218 | 219 | arma::mat temp(1, 1); 220 | 221 | arma::vec beta; 222 | beta.zeros(n_var); 223 | 224 | arma::vec beta_current; 225 | beta_current.zeros(n_var); 226 | 227 | int problem = 0; 228 | 229 | for (int i = 0; i < n_lambda; i++) 230 | { 231 | beta_current = beta; 232 | lambda_used = lambda(i); 233 | 234 | for (int j = 0; j < max_iteration; j++) 235 | { 236 | for (int q = 0; q < n_var; q++) 237 | { 238 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 239 | z = temp(0, 0); 240 | if (z > alpha * lambda_used) 241 | { 242 | beta_current(q) = (z - alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha)); 243 | } 244 | else if (z < -alpha * lambda_used) 245 | { 246 | beta_current(q) = (z + alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha)); 247 | } 248 | else 249 | { 250 | beta_current(q) = 0.0; 251 | } 252 | } 253 | 254 | arma::vec betaDiff = abs(beta_current - beta); 255 | dlx = betaDiff.max(); 256 | dlx2 = arma::norm(beta_current, 2); 257 | 258 | if (dlx < threshold) 259 | { 260 | break; 261 | } 262 | 263 | if (dlx2 > 100.0) 264 | { 265 | problem = 1; 266 | break; 267 | } 268 | 269 | beta = beta_current; 270 | } 271 | 272 | if (problem == 1) 273 | { 274 | break; 275 | } 276 | 277 | res.col(i) = beta_current; 278 | } 279 | 280 | return (res); 281 | } 282 | 283 | // [[Rcpp::depends(RcppArmadillo)]] 284 | // [[Rcpp::export]] 285 | int MNetDetect(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double alpha, const double gamma, const int max_iteration, const double threshold, int index_lambda) 286 | { 287 | const int n_var = r.size(); 288 | double z, lambda_used, dlx, lambda_1, lambda_2; 289 | 290 | arma::mat temp(1, 1); 291 | 292 | arma::vec beta; 293 | beta.zeros(n_var); 294 | 295 | arma::vec beta_current; 296 | beta_current.zeros(n_var); 297 | 298 | int problem = 1; 299 | 300 | beta_current = beta; 301 | index_lambda = index_lambda - 1; 302 | lambda_used = lambda(index_lambda); 303 | lambda_1 = lambda_used * alpha; 304 | lambda_2 = lambda_used * (1 - alpha); 305 | 306 | for (int j = 0; j < max_iteration; j++) 307 | { 308 | for (int q = 0; q < n_var; q++) 309 | { 310 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 311 | z = temp(0, 0); 312 | if (abs(z) <= gamma * lambda_1 * (1 + lambda_2)) 313 | { 314 | if (z > lambda_1) 315 | { 316 | beta_current(q) = (z - lambda_1) / (1 + s + lambda_2 - 1 / gamma); 317 | } 318 | else if (z < -lambda_1) 319 | { 320 | beta_current(q) = (z + lambda_1) / (1 + s + lambda_2 - 1 / gamma); 321 | } 322 | else 323 | { 324 | beta_current(q) = 0.0; 325 | } 326 | } 327 | else 328 | { 329 | beta_current(q) = z / (1 + s + lambda_2); 330 | } 331 | } 332 | 333 | dlx = arma::norm(beta_current, 2); 334 | 335 | if (dlx > threshold) 336 | { 337 | problem = 2; 338 | break; 339 | } 340 | 341 | beta = beta_current; 342 | } 343 | 344 | return (problem); 345 | } 346 | 347 | // [[Rcpp::depends(RcppArmadillo)]] 348 | // [[Rcpp::export]] 349 | arma::mat MNet(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double alpha, const double gamma, const int max_iteration, const double threshold) 350 | { 351 | const int n_var = r.size(); 352 | const int n_lambda = lambda.size(); 353 | double z, lambda_used, dlx, dlx2, lambda_1, lambda_2; 354 | 355 | arma::mat res(n_var, n_lambda); 356 | res.fill(0); 357 | 358 | arma::mat temp(1, 1); 359 | 360 | arma::vec beta; 361 | beta.zeros(n_var); 362 | 363 | arma::vec beta_current; 364 | beta_current.zeros(n_var); 365 | 366 | int problem = 0; 367 | 368 | for (int i = 0; i < n_lambda; i++) 369 | { 370 | beta_current = beta; 371 | lambda_used = lambda(i); 372 | lambda_1 = lambda_used * alpha; 373 | lambda_2 = lambda_used * (1 - alpha); 374 | for (int j = 0; j < max_iteration; j++) 375 | { 376 | 377 | for (int q = 0; q < n_var; q++) 378 | { 379 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 380 | z = temp(0, 0); 381 | if (abs(z) <= gamma * lambda_1 * (1 + lambda_2)) 382 | { 383 | if (z > lambda_1) 384 | { 385 | beta_current(q) = (z - lambda_1) / (1 + s + lambda_2 - 1 / gamma); 386 | } 387 | else if (z < -lambda_1) 388 | { 389 | beta_current(q) = (z + lambda_1) / (1 + s + lambda_2 - 1 / gamma); 390 | } 391 | else 392 | { 393 | beta_current(q) = 0.0; 394 | } 395 | } 396 | else 397 | { 398 | beta_current(q) = z / (1 + s + lambda_2); 399 | } 400 | } 401 | 402 | arma::vec betaDiff = abs(beta_current - beta); 403 | dlx = betaDiff.max(); 404 | 405 | dlx2 = arma::norm(beta_current, 2); 406 | 407 | if (dlx < threshold) 408 | { 409 | break; 410 | } 411 | 412 | if (dlx2 > 100.0) 413 | { 414 | problem = 1; 415 | break; 416 | } 417 | 418 | beta = beta_current; 419 | } 420 | 421 | if (problem == 1) 422 | { 423 | break; 424 | } 425 | 426 | res.col(i) = beta_current; 427 | } 428 | 429 | return (res); 430 | } 431 | 432 | // [[Rcpp::depends(RcppArmadillo)]] 433 | // [[Rcpp::export]] 434 | 435 | int SCADDetect(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold, int index_lambda) 436 | { 437 | const int n_var = r.size(); 438 | double lambda_used, z, dlx; 439 | 440 | arma::mat temp(1, 1); 441 | 442 | arma::vec beta; 443 | beta.zeros(n_var); 444 | 445 | arma::vec beta_current; 446 | beta_current.zeros(n_var); 447 | 448 | int problem = 1; 449 | 450 | beta_current = beta; 451 | index_lambda = index_lambda - 1; 452 | lambda_used = lambda(index_lambda); 453 | 454 | for (int j = 0; j < max_iteration; j++) 455 | { 456 | for (int q = 0; q < n_var; q++) 457 | { 458 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 459 | z = temp(0, 0); 460 | if (beta_current(q) <= gamma * lambda_used) 461 | { 462 | if (beta_current(q) <= lambda_used) 463 | { 464 | if (z > lambda_used) 465 | { 466 | beta_current(q) = (z - lambda_used) / (1 + s); 467 | } 468 | else if (z < -lambda_used) 469 | { 470 | beta_current(q) = (z + lambda_used) / (1 + s); 471 | } 472 | else 473 | { 474 | beta_current(q) = 0.0; 475 | } 476 | } 477 | else 478 | { 479 | if (z > gamma / (gamma - 1) * lambda_used) 480 | { 481 | beta_current(q) = (z - gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1)); 482 | } 483 | else if (z < -gamma / (gamma - 1) * lambda_used) 484 | { 485 | beta_current(q) = (z + gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1)); 486 | } 487 | else 488 | { 489 | beta_current(q) = 0.0; 490 | } 491 | } 492 | } 493 | else 494 | { 495 | beta_current(q) = z / (1 + s); 496 | } 497 | } 498 | 499 | dlx = arma::norm(beta_current, 2); 500 | 501 | if (dlx > threshold) 502 | { 503 | problem = 2; 504 | break; 505 | } 506 | 507 | beta = beta_current; 508 | } 509 | 510 | return (problem); 511 | } 512 | 513 | // [[Rcpp::depends(RcppArmadillo)]] 514 | // [[Rcpp::export]] 515 | 516 | arma::mat SCAD(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold) 517 | { 518 | const int n_var = r.size(); 519 | const int n_lambda = lambda.size(); 520 | double lambda_used, z, dlx, dlx2; 521 | 522 | arma::mat res(n_var, n_lambda); 523 | res.fill(0); 524 | 525 | arma::mat temp(1, 1); 526 | 527 | arma::vec beta; 528 | beta.zeros(n_var); 529 | 530 | arma::vec beta_current; 531 | beta_current.zeros(n_var); 532 | 533 | int problem = 0; 534 | 535 | for (int i = 0; i < n_lambda; i++) 536 | { 537 | beta_current = beta; 538 | lambda_used = lambda(i); 539 | for (int j = 0; j < max_iteration; j++) 540 | { 541 | for (int q = 0; q < n_var; q++) 542 | { 543 | temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q); 544 | z = temp(0, 0); 545 | if (beta_current(q) <= gamma * lambda_used) 546 | { 547 | if (beta_current(q) <= lambda_used) 548 | { 549 | if (z > lambda_used) 550 | { 551 | beta_current(q) = (z - lambda_used) / (1 + s); 552 | } 553 | else if (z < -lambda_used) 554 | { 555 | beta_current(q) = (z + lambda_used) / (1 + s); 556 | } 557 | else 558 | { 559 | beta_current(q) = 0.0; 560 | } 561 | } 562 | else 563 | { 564 | if (z > gamma / (gamma - 1) * lambda_used) 565 | { 566 | beta_current(q) = (z - gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1)); 567 | } 568 | else if (z < -gamma / (gamma - 1) * lambda_used) 569 | { 570 | beta_current(q) = (z + gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1)); 571 | } 572 | else 573 | { 574 | beta_current(q) = 0.0; 575 | } 576 | } 577 | } 578 | else 579 | { 580 | beta_current(q) = z / (1 + s); 581 | } 582 | } 583 | 584 | arma::vec betaDiff = abs(beta_current - beta); 585 | dlx = betaDiff.max(); 586 | dlx2 = arma::norm(beta_current, 2); 587 | 588 | if (dlx < threshold) 589 | { 590 | break; 591 | } 592 | 593 | if (dlx2 > 100.0) 594 | { 595 | problem = 1; 596 | break; 597 | } 598 | 599 | beta = beta_current; 600 | } 601 | 602 | if (problem == 1) 603 | { 604 | break; 605 | } 606 | 607 | res.col(i) = beta_current; 608 | } 609 | 610 | return (res); 611 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SUMMIT 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7034435.svg)](https://doi.org/10.5281/zenodo.7034435) 4 | 5 | **Summary-level Unified Method for Modeling Integrated Transcriptome (SUMMIT)**, is a novel framework designed to improve the expression prediction model accuracy and the power of sequential TWAS by leveraging a very large expression quantitative trait loci (eQTL) summary-level dataset. One benefit of SUMMIT is that it can deal with genes with moderate-low expression heritability, which have been largely ignored by conventional TWAS. 6 | 7 | ### SUMMIT's workflow. 8 | 9 |

10 | workflow 11 |

12 | 13 | Please cite the following manuscript for using SUMMIT and gene expression models built by SUMMIT: 14 | 15 | > Zhang, Z., Bae, Y. E., Bradley, J. R., Wu, L., & Wu, C. (2021). SUMMIT: An integrative approach for better transcriptomic data imputation improves causal gene identification. medRxiv. Under Review. 16 | 17 | 18 | ### Replication: 19 | For a complete replication of the results in our manuscript, please see our tutorials at osf.io (https://doi.org/10.17605/OSF.IO/BS3QU). 20 | 21 | ## Outline 22 | 23 | 1. [Training imputation model](#TRAIN) 24 | 2. [Association test](#TWAS) 25 | 26 | ## Training models 27 | 28 | ```mainbody_cpp_pos_precise.R/mainbody_cpp_rsid_precise.R``` are SUMMIT's main functions to train imputation models. It has only 3 input arguments ```--name_batch``` ,```--method```, and ```wd```. ```--name_batch``` is the desired name for this batch of output and ```--method``` is designated penalized regression method that you can choose from (LASSO, ElNet, SCAD, MCP, MNet). ```--wd``` is the path of working directory. 29 | 30 | ### Data preparation 31 | 32 | The Mainbody_cpp_xxx_precise functions require a very specific set of processed data to work properly. Unfortunately, we cannot share all the data with you due to privacy and confidentiality concern. Here, we provide a list of the data that we used. If you need further assistance on how the data were organized/processed, please reach out to us. 33 | 34 | | Datasets referenced | Usage | How to obtain | 35 | | ----- | ----- | ---- | 36 | | gencode.v26.hg19.genes.rds | A look-up list for translating Ensembl IDs into gene names | downloaded from https://www.gencodegenes.org/human/release_26.html | 37 | | Whole Blood_QCed_rpkm.rds | GTEx-7's subject's gene expression levels | downloaded from https://gtexportal.org and processed | 38 | | response.8.RData | GTEx-8's subject's gene expression levels | downloaded from https://gtexportal.org and processed | 39 | | subset-hapmap3 | eQTLGen summary statistics subsetted by HapMap3 | downloaded from https://www.eqtlgen.org/cis-eqtls.html with standard QC, then split into smaller files by gene name | 40 | | seq.ref | Genotype matrix of reference panel (1000 Genomes Project) | downloaded from https://www.internationalgenome.org/data | 41 | | seq.8 | Genotype matrix of GTEx-8 subjects | downloaded from https://gtexportal.org and process | 42 | | chrX.OMNI.interpolated_genetic_map | Genetic distance of reference panel (1000 Genomes Project) | downloaded from https://github.com/joepickrell/1000-genomes-genetic-maps | 43 | 44 | ### Data alignment 45 | 46 | SUMMIT offers two approaches to align the reference panel with the eQTL summary-level data. If you wish to align your data by rsID, use mainbody_cpp_rsid_precise.R. If you wish to align your data by chromosome+position, use mainbody_cpp_pos_precise.R. 47 | 48 | ### Example run 49 | 50 | After we prepared the data, we can train imputation models via the following command. 51 | 52 | ``` 53 | Rscript mainbody_cpp_rsid.R \ 54 | --name_batch SCAD_1e-6_rsid \ 55 | --method SCAD \ 56 | --wd MY-WORK-DIR \ 57 | ``` 58 | 59 | ### Built-in parallel computing 60 | 61 | Both ```mainbody_cpp_rsid_precise.R``` and ```mainbody_cpp_pos_precise.R``` contain a snippet that guarantees mutual exclusion for every subjob. Simply put, you can run ```mainbody_cpp_rsid_precise.R``` and ```mainbody_cpp_pos_precise.R``` using many parallel instances as you want and it will figure out if there is unfinished job on its own. 62 | 63 | ## Association test 64 | 65 | ### Gene expression prediction models built by SUMMIT 66 | 67 | We have uploaded our pre-calculated expression imputation models (Tissue: whole blood) to osf.io (https://doi.org/10.17605/OSF.IO/7MXSA). 68 | 69 | The osf.io repository contains two zip files. ``SUMMIT-weight-pos.zip`` contains models that use chromosome plus position (CHR + POS) to match the SNPs in our models to the SNPs in GWAS summary data; ``SUMMIT-weight-rsid.zip`` uses rsID to match. 70 | 71 | ### Pre-process your summary statistics using APSS 72 | 73 | APSS is an interactive R function that can easily process GWAS summary statistics and shape GWAS summary statistics into any desired format. 74 | 75 | APSS has 3 principal input arguments. 76 | 77 | ```directory.working``` is the working directory. 78 | 79 | ```filename``` is the name of the summary statistics file to be processed. 80 | 81 | ```BIG``` is the number of GBs and default is 2. If ```BIG``` is set as 2, then for any summary statistics file bigger than 2GB, APSS will do an exploratory read first. By doing so, APSS could significantly shorten the runtime and handle summary statistics files bigger than 10GB. 82 | 83 | ### The must-have columns 84 | 85 | You can use any summary statistics files with reasonable quality control just as long they contain specific columns for SUMMIT to work with. 86 | 87 | The must-have columns are ```A1, A2, Z, CHR```. 88 | 89 | If you are using the rsID-aligned models, you must also include ```SNP``` column. 90 | 91 | If you are using the position-aligned models, you must also include ```POS``` column. 92 | 93 | ### The flags 94 | 95 | ```step2_pos.R``` is SUMMIT's function for association test using position-aligned models. 96 | 97 | ```step2_rsid.R``` is SUMMIT's function for association test using rsID-aligned models. 98 | 99 | For both functions, the input arguments are: 100 | 101 | ```--models``` is the path of the pre-calculated models (Please make sure that the folder contains only model files). 102 | 103 | ```--path.ref``` is the path of the reference panel used plus the prefixes of reference panel files. 104 | 105 | ```--path.ss``` is the path of the reference panel (with "/" at the end). 106 | 107 | ```--trait``` is the path plus name of the input summary statistics files. 108 | 109 | ```--path.out``` is the path of the output folder (with "/" at the end). 110 | 111 | ```--parallel``` is the number of parallel instances 112 | 113 | ### Parallelization 114 | 115 | Unlike other TWAS methods (e.g., PrediXcan), SUMMIT can be a bit more time-consuming. Because SUMMIT does not use a designated correlation matrix (LD matrix), the SUMMIT pipeline would spend more time matching models, summary statistics, and reference panel. In addition, each SUMMIT's pre-calculated model file contains models from 5 different types of penalized regression, hence more computation time is needed. 116 | 117 | However, with proper parallelization, a complete association study usually takes less than 10 minutes. For example, if you decided to split the association tests into 20 smaller subjobs, an index (from 1 to 20) would need to be explicitly passed on to R from the global environment and the ```--parallel``` flag should be set to 20. Depending on your computing environment, you 118 | may need to manually modify line 2 in ```step2_pos.R/step2_rsid.R```. 119 | 120 | ### Example run 121 | 122 | ``` 123 | Rscript step2_pos.R \ 124 | --models my-model-folder-pos \ 125 | --path.ref my-reference-panel/1000G.EUR.ALLSNP.QC.CHR \ 126 | --path.ss my-ss-folder/ \ 127 | --trait my-trait-1 \ 128 | --path.out my-output-folder \ 129 | --parallel 100 130 | ``` 131 | 132 | ``` 133 | Rscript step2_rsid.R \ 134 | --models my-model-folder-rsid \ 135 | --path.ref my-reference-panel/1000G.EUR.ALLSNP.QC.CHR \ 136 | --path.ss my-ss-folder/ \ 137 | --trait my-trait-2 \ 138 | --path.out my-output-folder \ 139 | --parallel 50 140 | ``` 141 | 142 | ### Output format 143 | 144 | | Column number | Column name | Description | 145 | | ----- | ----- | ---- | 146 | | 1 | gene_symbol | Gene name | 147 | | 2 | gene_id | Ensembl ID | 148 | | 3 | chromosome | Chromosome | 149 | | 4 | model_best | Best model | 150 | | 5 | r2_test | Best model's $R^2$ on testing data | 151 | | 6-10 | p_ElNet | p-value of TWAS (Method is after the underscore) | 152 | | 11-15 | z_ElNet | Z-score of TWAS (Method is after the underscore) | 153 | | 16 | p_ACAT | The combined ACAT p-value | 154 | | 17 | gene_pos | Gene position | 155 | | 18 | runtime | Runtime | 156 | 157 | ### Disclaimer 158 | 159 | The built gene expression prediction models and software are provided “as is”, without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. in no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the models or the use or other dealings in the models. 160 | 161 | ### License 162 | 163 | Maintainer: [Zichen Zhang] (zz17@fsu.edu) 164 | 165 | [MIT](http://opensource.org/licenses/MIT) 166 | 167 | Copyright (c) 2013-present, Zichen Zhang (zz17@fsu.edu), Chong Wu (cwu18@mdanderson.org) 168 | -------------------------------------------------------------------------------- /illustration.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChongWuLab/SUMMIT/9b11e3d1f4ca00f0dd35a1f777720a616f365528/illustration.bmp -------------------------------------------------------------------------------- /mainbody_cpp_pos_precise.R: -------------------------------------------------------------------------------- 1 | ############################################# 2 | # Package, parameter, and working directory # 3 | ############################################# 4 | 5 | # Package 6 | library(BEDMatrix) 7 | library(compiler) 8 | library(data.table) 9 | library(ddpcr) 10 | library(dplyr) 11 | library(optparse) 12 | library(Rcpp) 13 | 14 | # Parameter 15 | s.array <- 0.1 * (1:9) 16 | 17 | ########### 18 | # Options # 19 | ########### 20 | 21 | # Options 22 | option_list <- list( 23 | make_option("--name_batch", type = "character", default = FALSE, action = "store", help = "Name of this batch" 24 | ), 25 | make_option("--method" , type = "character", default = FALSE, action = "store", help = "Method to use" 26 | ), 27 | make_option("--wd" , type = "character", default = FALSE, action = "store", help = "Working directory" 28 | ) 29 | ) 30 | 31 | opt <- parse_args(OptionParser(option_list = option_list)) 32 | 33 | # Pass the options to variables 34 | batch <- opt$name_batch 35 | method <- opt$method 36 | setwd(opt$wd) 37 | 38 | ####################### 39 | # Quick-submit gadget # 40 | ####################### 41 | 42 | # Randomly sleep for a little while 43 | Sys.sleep(runif(1, 0, 30)) 44 | 45 | # Setting up the folders 46 | null.object <- NULL 47 | 48 | dir.create("quick-submit", showWarnings = TRUE) 49 | if (!dir.exists("quick-submit/to-do")) { 50 | dir.create("quick-submit/to-do") 51 | dir.create("quick-submit/working") 52 | dir.create("quick-submit/done") 53 | dir.create("quick-submit/error") 54 | for (qwerty in 1:19227) { 55 | save(null.object, file = paste0("quick-submit/to-do/", qwerty)) 56 | } 57 | } 58 | 59 | ########################## 60 | # User-defined functions # 61 | ########################## 62 | 63 | # Cpp functions 64 | suppressMessages(sourceCpp("DetectAndSolve_debugged.cpp")) 65 | 66 | # FindOptimalResult 67 | FindOptimalResult <- function(response.true) { 68 | # Find the optimal out of 900 results 69 | index.optimal <- 0 70 | max <- -1 71 | 72 | for (t in 1:900) { 73 | if (norm(result[[t]][[1]], type = "2") <= 100) { 74 | # Compute the predicted response 75 | response.pred <- genotype.train %*% result[[t]][[1]] 76 | 77 | # Do the regression 78 | reg <- summary(lm(response.true ~ response.pred)) 79 | 80 | # Keep only the optimal result 81 | if (reg$adj.r.sq > max) { 82 | max <- reg$adj.r.sq 83 | result.optimal <- list(result[[t]][[1]], result[[t]][[2]], reg$adj.r.sq, reg$coef) 84 | t.optimal <- t 85 | } 86 | } 87 | } 88 | 89 | # Compute the P-value of R square 90 | quiet( 91 | test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater") 92 | ) 93 | result.optimal[[5]] <- test$p.value 94 | 95 | return(result.optimal) 96 | } 97 | 98 | FindOptimalResult <- cmpfun(FindOptimalResult) 99 | 100 | # PatchUp 101 | source("PatchUp.R") 102 | 103 | PatchUp <- cmpfun(PatchUp) 104 | 105 | # Standardize 106 | Standardize <- function(M) { 107 | # Centralize 108 | M <- M - matrix(rep(colMeans(M), times = nrow(M)), nrow = nrow(M) , ncol = ncol(M), byrow = T) 109 | 110 | # Standardize 111 | M <- sweep(M, 2, sqrt(apply(M, 2, crossprod) / nrow(M)), "/") 112 | 113 | return(M) 114 | } 115 | 116 | Standardize <- cmpfun(Standardize) 117 | 118 | # Translate 119 | list.translation <- readRDS("gencode.v26.hg19.genes.rds") 120 | list.translation$gene_id <- substr(list.translation$gene_id, 1, 15) 121 | list.translation$gene_name <- as.character(list.translation$gene_name) 122 | 123 | list.supplementary <- readRDS("list.supplementary.rds") 124 | 125 | Translate <- function(gene.ENSG) { 126 | temp.1 <- list.translation[which(list.translation$gene_id == gene.ENSG), ] 127 | 128 | if (nrow(temp.1) == 0) { 129 | temp.2 <- list.supplementary[which(list.supplementary$ensembl_gene_id == gene.ENSG), ] 130 | 131 | if (nrow(temp.2) == 0) { 132 | gene.proper <- gene.ENSG 133 | } else { 134 | gene.proper <- temp.2[1, 3] 135 | } 136 | } else { 137 | gene.proper <- temp.1[1, 12] 138 | } 139 | 140 | if (gene.proper == "") { 141 | gene.proper <- gene.ENSG 142 | } 143 | 144 | return(gene.proper) 145 | } 146 | 147 | Translate <- cmpfun(Translate) 148 | 149 | ############################## 150 | # Read GTEx-7's subject list # 151 | ############################## 152 | 153 | raw <- readRDS("Whole Blood_QCed_rpkm.rds") 154 | list.7 <- rownames(raw$qced.exp) 155 | 156 | rm(raw) 157 | 158 | ########################################################################## 159 | # Read GTEx-8's expression data (response.8.RData is already translated) # 160 | ########################################################################## 161 | 162 | load("response.8.RData") 163 | 164 | #################### 165 | # List of subjects # 166 | #################### 167 | 168 | # Split subjects in GTEx-8 into two groups 169 | list.8 <- rownames(response.8) 170 | 171 | list.train <- list.8[which(list.8 %in% list.7)] 172 | list.valid <- list.8[which(!(list.8 %in% list.7))] 173 | 174 | #################################### 175 | # There are 19,227 genes in total! # 176 | #################################### 177 | 178 | filenames <- dir("subset-hapmap3") 179 | 180 | # Quick-submit gadget 181 | while (TRUE) { 182 | if (length(dir("quick-submit/to-do")) == 0) { 183 | print("All jobs are done!") 184 | break 185 | } else { 186 | # Get the job 187 | to_do <- as.numeric(dir("quick-submit/to-do")) 188 | if (length(to_do) == 1) { 189 | i <- to_do[1] 190 | } else { 191 | i <- sample(to_do, 1) 192 | } 193 | # Ship the job to "working" folder 194 | save(null.object, file = paste0("quick-submit/working/", i)) 195 | file.remove(paste0("quick-submit/to-do/", i)) 196 | } 197 | 198 | # Start keeping track of runtime 199 | time.start <- proc.time()[3] 200 | 201 | ################# 202 | # Preprocess ss # 203 | ################# 204 | 205 | load(paste0("subset-hapmap3/", filenames[i])) 206 | 207 | # Get the name and chromosome of to-be-processed gene 208 | gene.ENSG <- ss$Gene[1] 209 | gene.proper <- Translate(gene.ENSG) 210 | chr <- ss$SNPChr[1] 211 | 212 | ########################## 213 | # Clumping and filtering # 214 | ########################## 215 | 216 | do.clumping <- FALSE 217 | 218 | # Obsolete 219 | if (do.clumping) { 220 | ########### 221 | # Extract # 222 | ########### 223 | 224 | # Create a temporary SNP list for Plink 225 | temp.outdir <- "/gpfs/home/zz17/current-project/temp/" 226 | temp.snpfile <- paste0(temp.outdir, gene.ENSG, ".txt") 227 | write.table(ss[, 1], file = temp.snpfile, quote = FALSE, col.names = FALSE, row.names = FALSE) 228 | 229 | # Put together the command and execute 230 | command <- path.Plink 231 | command <- paste0(command) 232 | command <- paste0(command, " --bfile /gpfs/home/zz17/resource/1000-genome/sequence/1000G.EUR.ALLSNP.QC.CHR", chr) 233 | command <- paste0(command, " --chr ", chr) 234 | command <- paste0(command, " --extract ", temp.snpfile) 235 | command <- paste0(command, " --make-bed") 236 | command <- paste0(command, " --out ", temp.outdir, gene.ENSG) 237 | 238 | quiet( 239 | system(command, intern = TRUE) 240 | ) 241 | 242 | ########### 243 | # Process # 244 | ########### 245 | 246 | # Create a temporary ss file and add a p-value column 247 | ss$pval <- 2 * pnorm(abs(ss$Zscore), lower.tail = FALSE) 248 | temp.ssfile <- paste0(temp.outdir, gene.ENSG, "_ss.txt") 249 | write.table(ss, file = temp.ssfile, quote = FALSE, col.names = TRUE, row.names = FALSE) 250 | 251 | # Put together the command and execute 252 | command <- path.Plink 253 | command <- paste0(command, " --bfile ", temp.outdir, gene.ENSG) 254 | command <- paste0(command, " --clump ", temp.ssfile) 255 | command <- paste0(command, " --clump-field pval") 256 | command <- paste0(command, " --clump-kb 250") 257 | command <- paste0(command, " --clump-p1 1") 258 | command <- paste0(command, " --clump-r2 0.95") 259 | command <- paste0(command, " --clump-snp-field SNP") 260 | command <- paste0(command, " --out ", temp.outdir, gene.ENSG) 261 | 262 | quiet( 263 | system(command, intern = TRUE, ignore.stdout = TRUE, ignore.stderr = TRUE) 264 | ) 265 | 266 | clumped.snp <- fread(paste0(temp.outdir, gene.ENSG, ".clumped")) 267 | clumped.snp <- as.data.frame(clumped.snp) 268 | clumped.snp <- clumped.snp[, "SNP"] 269 | 270 | # Clean up and update 271 | system(paste0("rm ", temp.outdir, gene.ENSG, "*")) 272 | 273 | ss <- ss[ss$SNP %in% clumped.snp, ] 274 | } 275 | 276 | ############ 277 | # Quick QC # 278 | ############ 279 | 280 | # Skip this round of iteration if ss is empty to begin with 281 | if (nrow(ss) == 0) { 282 | file.connection <- file(paste0("quick-submit/error/", i, ".error0")) 283 | writeLines("Empty summary statistics to begin with!", file.connection) 284 | close(file.connection) 285 | file.remove(paste0("quick-submit/working/", i)) 286 | save(null.object, file=paste0("quick-submit/done/", i)) 287 | 288 | next 289 | } 290 | 291 | # Read the bim files of GTEx-8 and reference panel 292 | bim.8 <- as.data.frame(fread(paste0("qced_chr", chr, ".bim"))) 293 | bim.ref <- as.data.frame(fread(paste0("1000G.EUR.ALLSNP.QC.CHR", chr, ".bim"))) 294 | 295 | # Create new identifiers 296 | ss["ID"] <- paste0(chr, "_", ss$SNPPos) 297 | bim.8["ID"] <- paste0(bim.8$V1, "_", bim.8$V4) 298 | bim.ref["ID"] <- paste0(bim.ref$V1, "_", bim.ref$V4) 299 | 300 | # Only keep the SNPs that are simultaneously IN (reference panel, GTEx-8) 301 | ss <- ss[ss$ID %in% bim.ref$ID & ss$ID %in% bim.8$ID, ] 302 | 303 | # Sieve out problematic SNPs 304 | list.1 <- bim.8$ID[duplicated(bim.8$ID)] 305 | list.2 <- bim.8$ID[nchar(bim.8$V5) > 1 | nchar(bim.8$V6) > 1] 306 | list.3 <- bim.ref$ID[duplicated(bim.ref$ID)] 307 | list.4 <- bim.ref$ID[nchar(bim.ref$V5) > 1 | nchar(bim.ref$V6) > 1] 308 | list.5 <- ss$ID[duplicated(ss$ID)] 309 | problem <- ss$ID %in% list.1 | ss$ID %in% list.2 | ss$ID %in% list.3 | ss$ID %in% list.4 | ss$ID %in% list.5 310 | 311 | ss <- ss[!problem, ] 312 | 313 | rm(list.1) 314 | rm(list.2) 315 | rm(list.3) 316 | rm(list.4) 317 | rm(list.5) 318 | rm(problem) 319 | 320 | # Skip this round of iteration if ss has few row left 321 | if (nrow(ss) <= 1) { 322 | file.connection <- file(paste0("quick-submit/error/", i, ".error1")) 323 | writeLines("Empty summary statistics after QC!", file.connection) 324 | close(file.connection) 325 | file.remove(paste0("quick-submit/working/", i)) 326 | save(null.object, file=paste0("quick-submit/done/", i)) 327 | 328 | next 329 | } 330 | 331 | ######################################################## 332 | # Make sure that the gene of the hour is in response.8 # 333 | ######################################################## 334 | 335 | if (!(gene.proper %in% colnames(response.8))) { 336 | file.connection <- file(paste0("quick-submit/error/", i, ".error2")) 337 | writeLines("Gene is not in response.8 and we move on to the next gene!", file.connection) 338 | close(file.connection) 339 | file.remove(paste0("quick-submit/working/", i)) 340 | save(null.object, file=paste0("quick-submit/done/", i)) 341 | 342 | next 343 | } 344 | 345 | ################################################# 346 | # Preprocess genotype matrix of reference panel # 347 | ################################################# 348 | 349 | # Extract genotype information 350 | quiet( 351 | seq.ref <- BEDMatrix(paste0("sequence/1000G.EUR.ALLSNP.QC.CHR", chr), simple_names = TRUE) 352 | ) 353 | 354 | colnames(seq.ref) <- bim.ref$ID 355 | genotype.ref <- seq.ref[, ss$ID] 356 | rm(seq.ref) 357 | 358 | # Skip this round of iteration if genotype is empty 359 | if (ncol(genotype.ref) == 0) { 360 | file.connection <- file(paste0("quick-submit/error/", i, ".error3")) 361 | writeLines("Empty reference genotype matrix!", file.connection) 362 | close(file.connection) 363 | file.remove(paste0("quick-submit/working/", i)) 364 | save(null.object, file=paste0("quick-submit/done/", i)) 365 | 366 | next 367 | } 368 | 369 | # Compare to see if all (A1 and A2) pairs are well-aligned 370 | # Always use reference panel's A1/A2 as the standard order 371 | bim.temp <- subset(bim.ref, ID %in% ss$ID) 372 | ss.temp <- left_join(ss, bim.temp, by = "ID") 373 | problem <- !(ss.temp$a1 == ss.temp$V5) 374 | 375 | if (sum(problem) != 0) { 376 | # Flip the problematic pairs 377 | ss$Zscore[problem] <- -1 * ss$Zscore[problem] 378 | 379 | ss$a1 <- ss.temp$V5 380 | ss$a2 <- ss.temp$V6 381 | } 382 | 383 | rm(bim.temp) 384 | rm(ss.temp) 385 | rm(problem) 386 | 387 | # Patch up the NAs and centralize genotype matrix 388 | genotype.ref <- PatchUp(genotype.ref) 389 | genotype.ref <- Standardize(genotype.ref) 390 | 391 | ####################################### 392 | # Preprocess genotype matrix of GTEx8 # 393 | ####################################### 394 | 395 | # The big genotype matrix 396 | quiet( 397 | seq.8 <- BEDMatrix(paste0("qced_chr", chr), simple_names = TRUE) 398 | ) 399 | 400 | # Remove the "0_" part of subject IDs 401 | rownames(seq.8) <- gsub("0_", "", rownames(seq.8)) 402 | 403 | # Subset by list.8 and ID from ss 404 | colnames(seq.8) <- bim.8$ID 405 | genotype.8 <- seq.8[list.8, ss$ID] 406 | 407 | # Skip this round of iteration if genotype is empty 408 | if (ncol(genotype.8) == 0) { 409 | file.connection <- file(paste0("quick-submit/error/", i, ".error4")) 410 | writeLines("Empty genotype.8 matrix!", file.connection) 411 | close(file.connection) 412 | file.remove(paste0("quick-submit/working/", i)) 413 | save(null.object, file=paste0("quick-submit/done/", i)) 414 | 415 | next 416 | } 417 | 418 | # Compare to see if all (a1 and a2) are well-aligned 419 | bim.temp1 <- subset(bim.ref, ID %in% ss$ID) 420 | bim.temp2 <- subset(bim.8, ID %in% ss$ID) 421 | bim.temp3 <- left_join(bim.temp1, bim.temp2, by = "ID") 422 | problem <- !(bim.temp3$V5.x == bim.temp3$V5.y) 423 | 424 | if (sum(problem) != 0) { 425 | genotype.8[, problem] <- 2 - genotype.8[, problem] 426 | } 427 | 428 | rm(bim.temp1) 429 | rm(bim.temp2) 430 | rm(bim.temp3) 431 | rm(seq.8) 432 | rm(problem) 433 | 434 | ####################################################### 435 | # EXTREMELY ANNOYING ISSUE FOUND!!!!!!!!!!!!!!!!!!!!! # 436 | ####################################################### 437 | 438 | # Subject number is NOT the same for all chromosomes! 439 | genotype.train <- genotype.8[which(rownames(genotype.8) %in% list.train), ] 440 | genotype.train <- scale(PatchUp(genotype.train)) 441 | 442 | genotype.valid <- genotype.8[which(rownames(genotype.8) %in% list.valid), ] 443 | genotype.valid <- scale(PatchUp(genotype.valid)) 444 | 445 | genotype.train[is.na(genotype.train)] <- 0 446 | genotype.valid[is.na(genotype.valid)] <- 0 447 | 448 | ########################## 449 | # The original LD matrix # 450 | ########################## 451 | 452 | matrix.LD <- t(genotype.ref) %*% genotype.ref / nrow(genotype.ref) 453 | 454 | ######################### 455 | # Additional adjustment # 456 | ######################### 457 | 458 | do.adjust <- TRUE 459 | 460 | if (do.adjust) { 461 | # Some given parameters 462 | cutoff <- 0.001 463 | matrix.adjust <- matrix(0, nrow(ss), nrow(ss)) 464 | n.effective <- 11400 465 | n.size <- 183 466 | 467 | temp.cof <- (-2 * n.effective) / n.size 468 | 469 | # Read distance information and create an identifer 470 | distance <- as.data.frame(fread(paste0("distance-genetic/subset-omni/chr", chr, ".OMNI.interpolated_genetic_map"))) 471 | names(distance) <- c("SNP", "position", "distance") 472 | 473 | distance["ID"] <- paste0(chr, "_", distance$position) 474 | distance <- distance[!duplicated(distance$ID), ] 475 | 476 | # Merge and process 477 | ss.temp <- left_join(ss, distance, by = "ID") 478 | 479 | for (col in 1:(nrow(ss.temp) - 1)) { 480 | for (row in (col + 1):nrow(ss.temp)) { 481 | entry.temp <- exp(temp.cof * abs(ss.temp[col, "distance"] - ss.temp[row, "distance"])) 482 | entry.temp <- entry.temp * (entry.temp >= cutoff) 483 | matrix.adjust[row, col] <- entry.temp 484 | } 485 | } 486 | 487 | matrix.adjust <- t(matrix.adjust) + matrix.adjust + diag(nrow(ss.temp)) 488 | 489 | rm(distance) 490 | rm(ss.temp) 491 | rm(temp.cof) 492 | 493 | ################### 494 | # Final LD matrix # 495 | ################### 496 | 497 | # "*" stands for elementwise multiplication 498 | matrix.LD <- matrix.LD * matrix.adjust 499 | } 500 | 501 | ############ 502 | # r vector # 503 | ############ 504 | 505 | # Compute r vector 506 | r <- ss$Zscore / sqrt(ss$NrSamples - 1 + ss$Zscore ^2) 507 | 508 | # Get size 509 | size <- nrow(ss) 510 | 511 | ########################## 512 | # Iterate By parameter s # 513 | ########################## 514 | 515 | result <- list() 516 | 517 | for (k in 1:length(s.array)) { 518 | s <- s.array[k] 519 | 520 | ######################################### 521 | # Constructing the initial lambda array # 522 | ######################################### 523 | 524 | # Get the maximum for lambda 525 | z.temp <- numeric(size) 526 | for (m in 1:size) { 527 | z.temp[m] <- abs(r[m]) 528 | } 529 | 530 | lambda.max <- max(z.temp) 531 | lambda.min <- lambda.max * 1E-3 532 | lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 10000) 533 | rm(z.temp) 534 | 535 | ########################################### 536 | # Explore the possible minimum for lambda # 537 | ########################################### 538 | 539 | ###################################### 540 | # What do 0, 1, and 2 stand for? # 541 | # 0: Not tested # 542 | # 1: tested and no problem detected # 543 | # 2: tested and problem is detected # 544 | ###################################### 545 | 546 | problem <- numeric(10000) 547 | start <- 1 548 | end <- 10000 549 | 550 | max.iteration <- 500 551 | 552 | threshold <- 100 553 | alpha <- 0.5 554 | 555 | if (method == "SCAD") { 556 | gamma <- 3.7 557 | } else { 558 | gamma <- 3 559 | } 560 | 561 | dummy.detect <- TRUE 562 | 563 | while (dummy.detect) { 564 | # Stop if all lambdas are OK 565 | if (sum(problem == rep(1, 10000)) == 10000) { 566 | print("All lambdas check out!") 567 | break 568 | } 569 | 570 | start <- min(which(problem == 0)) 571 | end <- max(which(problem == 0)) 572 | 573 | try <- ceiling((start + end) /2) 574 | lambda <- lambda.array[try] 575 | 576 | if (method == "MCP") { 577 | problem[try] <- MCPDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try) 578 | } 579 | if (method == "LASSO") { 580 | problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold, try) 581 | } 582 | if (method == "ElNet") { 583 | problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold, try) 584 | } 585 | if (method == "MNet") { 586 | problem[try] <- MNetDetect(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold, try) 587 | } 588 | if (method == "SCAD") { 589 | problem[try] <- SCADDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try) 590 | } 591 | 592 | if (problem[try] == 1) { 593 | problem[1:try] <- 1 594 | } else { 595 | problem[try:10000] <- 2 596 | } 597 | 598 | if (problem[try] == 1 & problem[min(10000, try + 1)] == 2) { 599 | print(paste0("Problem detected for lambda=", lambda.array[try + 1], ". Finer interval has been saved!")) 600 | lambda.min <- lambda 601 | dummy.detect <- FALSE 602 | } 603 | 604 | if (problem[try] == 2 & problem[try - 1] == 1) { 605 | print(paste0("Problem detected for lambda=", lambda.array[try], ". Finer interval has been saved!")) 606 | lambda.min <- lambda.array[try - 1] 607 | dummy.detect <- FALSE 608 | } 609 | } 610 | 611 | # Update the lambda array 612 | lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 100) 613 | 614 | ####################################################################### 615 | # Optimizing with the chosen penalty (using the updated lambda array) # 616 | ####################################################################### 617 | 618 | ##################### 619 | # Iterate by lambda # 620 | ##################### 621 | 622 | beta <- t(matrix(0, nrow = 1, ncol = size)) 623 | 624 | max.iteration <- 300 625 | 626 | threshold <- 1e-5 627 | alpha <- 0.5 628 | 629 | if (method == "SCAD") { 630 | gamma <- 3.7 631 | } else { 632 | gamma <- 3 633 | } 634 | 635 | if (method == "MCP") { 636 | res <- MCP(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold) 637 | } 638 | if (method == "LASSO") { 639 | res <- ElNet(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold) 640 | } 641 | if (method == "ElNet") { 642 | res <- ElNet(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold) 643 | } 644 | if (method == "MNet") { 645 | res <- MNet(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold) 646 | } 647 | if (method == "SCAD") { 648 | res <- SCAD(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold) 649 | } 650 | 651 | for (b in 1:100) { 652 | result[[length(result) + 1]] <- list(res[, b], lambda.array[b]) 653 | } 654 | } 655 | 656 | ############################################################################################################################ 657 | # -----. osssso ---------..` # 658 | # `ssssso osssso `sssssssssssss+:` # 659 | # `ssssso `...` osssso `` `ssssssssssssssss/ `...` ``` `...` # 660 | # `ssssso `/osssssss+:` osssso:osssso/` `sssss/ `.:osssss+ `:+sssssss+:` +sssso.+sssso/` -+sssssss+- # 661 | # `ssssso /sssssssssssso- ossssssssssssss- `sssss/ `ssssss` :sssssssssssss- +ssssssssssssso .ossso/::ossso` # 662 | # `ssssso -sssss:``./sssss` osssso-``./sssss``sssss/ osssss..sssss:``./sssss. +sssss-``osssss` ossss/---:ssss+ # 663 | # .ssssso /ssss+ `sssss. ossss/ `sssss.`sssss/ `/ssssso :sssso sssss- +sssss /sssss``sssssssssssssso # 664 | # .:/oso/osssss/ .sssss+--:osssso osssss/--:osssso `ssssso+++ossssss+` `sssss+:-:+sssso` +sssss /sssss` +ssss/` `-` # 665 | # .ossssssssss+` .osssssssssss+` ossssosssssssso. `ssssssssssssss+- .osssssssssss+. +sssss /sssss` `/sssssssssso/` # 666 | # -/+osso+/. ./+oosoo+:` /++++.-/+oo+/- `++++++++//:-.` .:+oosoo+:. :+++++ :+++++` `:/oosoo+/-` # 667 | ############################################################################################################################ 668 | 669 | ######################### 670 | # Select optimal lambda # 671 | ######################### 672 | 673 | ############################# 674 | # Calculate goodness of fit # 675 | ############################# 676 | 677 | ############################################### 678 | # Structure of the list result/result.optimal # 679 | ############################################### 680 | # index: # 681 | # [[1]] beta vector # 682 | # [[2]] lambda # 683 | # [[3]] r square adjusted (to be added) # 684 | # [[4]] regression coefficient # 685 | # [[5]] p-value of r square # 686 | # [[6]] r-square projected by GTEx8 - GTEx7 # 687 | # [[7]] runtime for the gene # 688 | # [[8]] average sample size # 689 | # [[9]] iteration index # 690 | # [[10]] correlation # 691 | ############################################### 692 | 693 | # Get the true response vector 694 | response.true <- response.8[list.train, which(colnames(response.8) == gene.proper)] 695 | 696 | # Figure out if response.true is just a vector or multiple vectors and find the optimal 697 | if (class(response.true) == "numeric") { 698 | result.optimal <- FindOptimalResult(response.true) 699 | } else { 700 | max <- -1 701 | for (asd in 1:ncol(response.true)) { 702 | temp <- FindOptimalResult(response.true[, asd])[[3]] 703 | if (temp > max) { 704 | max <- temp 705 | n.col <- asd 706 | } 707 | } 708 | 709 | response.true <- response.true[, n.col] 710 | result.optimal <- FindOptimalResult(response.true) 711 | } 712 | 713 | # Compute the P-value of R square 714 | response.pred <- genotype.train %*% result.optimal[[1]] 715 | quiet( 716 | test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater") 717 | ) 718 | result.optimal[[5]] <- test$p.value 719 | 720 | # Little tweak to make beta vector an one-column matrix 721 | result.optimal[[1]] <- as.matrix(result.optimal[[1]]) 722 | 723 | ############################## 724 | # Validate model with GTex-8 # 725 | ############################## 726 | 727 | # Get the response vectors 728 | response.true <- response.8[list.valid, which(colnames(response.8) == gene.proper)] 729 | 730 | if (exists("n.col")) { 731 | response.true <- response.true[, n.col] 732 | rm(n.col) 733 | } 734 | 735 | response.pred <- genotype.valid %*% result.optimal[[1]] 736 | 737 | # Cross validation with subject that are NOT INCLUDED in GTEx7 738 | reg <- summary(lm(response.true ~ response.pred)) 739 | result.optimal[[6]] <- reg$adj.r.squared 740 | 741 | # Keep track of runtime 742 | time.end <- proc.time()[3] 743 | result.optimal[[7]] <- time.end - time.start 744 | 745 | # Keep track of average sample size and miscellaneous items 746 | result.optimal[[8]] <- mean(ss$NrSamples) 747 | result.optimal[[9]] <- i 748 | result.optimal[[10]] <- cor(response.true, response.pred) 749 | 750 | ######## 751 | # Save # 752 | ######## 753 | 754 | save <- result.optimal 755 | 756 | dir.create(paste0("output/output_", batch), showWarnings = FALSE) 757 | save.chr <- paste0("output/output_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData") 758 | 759 | save(save, file = save.chr) 760 | 761 | ######################## 762 | # For TWAS-fusion only # 763 | ######################## 764 | 765 | # cv.performance 766 | cv.performance <- matrix(nrow = 2, ncol = 1) 767 | colnames(cv.performance) <- method 768 | rownames(cv.performance) <- c("rsq", "pval") 769 | cv.performance[1, 1] <- reg$adj.r.squared 770 | quiet( 771 | test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater") 772 | ) 773 | cv.performance[2, 1] <- test$p.value 774 | 775 | # snps 776 | snps <- ss 777 | 778 | # wgt.matrix 779 | wgt.matrix <- result.optimal[[1]] 780 | colnames(wgt.matrix) <- method 781 | rownames(wgt.matrix) <- ss$SNP 782 | 783 | dir.create(paste0("output/output_twas_", batch), showWarnings = FALSE) 784 | save.twas.chr <- paste0("output/output_twas_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData") 785 | save(cv.performance, snps, wgt.matrix, file = save.twas.chr) 786 | 787 | ############### 788 | # Cleaning-up # 789 | ############### 790 | 791 | file.remove(paste0("quick-submit/working/", i)) 792 | save(null.object, file = paste0("quick-submit/done/", i)) 793 | } 794 | -------------------------------------------------------------------------------- /mainbody_cpp_rsid_precise.R: -------------------------------------------------------------------------------- 1 | ############################################# 2 | # Package, parameter, and working directory # 3 | ############################################# 4 | 5 | # Package 6 | library(BEDMatrix) 7 | library(compiler) 8 | library(data.table) 9 | library(ddpcr) 10 | library(dplyr) 11 | library(optparse) 12 | library(Rcpp) 13 | 14 | # Parameter 15 | s.array <- 0.1 * (1:9) 16 | 17 | ########### 18 | # Options # 19 | ########### 20 | 21 | # Options 22 | option_list <- list( 23 | make_option("--name_batch", type = "character", default = FALSE, action = "store", help = "Name of this batch" 24 | ), 25 | make_option("--method" , type = "character", default = FALSE, action = "store", help = "Method to use" 26 | ), 27 | make_option("--wd" , type = "character", default = FALSE, action = "store", help = "Working directory" 28 | ) 29 | ) 30 | 31 | opt <- parse_args(OptionParser(option_list = option_list)) 32 | 33 | # Pass the options to variables 34 | batch <- opt$name_batch 35 | method <- opt$method 36 | setwd(opt$wd) 37 | 38 | ####################### 39 | # Quick-submit gadget # 40 | ####################### 41 | 42 | # Randomly sleep for a little while 43 | Sys.sleep(runif(1, 0, 30)) 44 | 45 | # Setting up the folders 46 | null.object <- NULL 47 | 48 | dir.create("quick-submit", showWarnings = TRUE) 49 | if (!dir.exists("quick-submit/to-do")) { 50 | dir.create("quick-submit/to-do") 51 | dir.create("quick-submit/working") 52 | dir.create("quick-submit/done") 53 | dir.create("quick-submit/error") 54 | for (qwerty in 1:19227) { 55 | save(null.object, file = paste0("quick-submit/to-do/", qwerty)) 56 | } 57 | } 58 | 59 | ########################## 60 | # User-defined functions # 61 | ########################## 62 | 63 | # Cpp functions 64 | suppressMessages(sourceCpp("DetectAndSolve_debugged.cpp")) 65 | 66 | # FindOptimalResult 67 | FindOptimalResult <- function(response.true) { 68 | # Find the optimal out of 900 results 69 | index.optimal <- 0 70 | max <- -1 71 | 72 | for (t in 1:900) { 73 | if (norm(result[[t]][[1]], type = "2") <= 100) { 74 | # Compute the predicted response 75 | response.pred <- genotype.train %*% result[[t]][[1]] 76 | 77 | # Do the regression 78 | reg <- summary(lm(response.true ~ response.pred)) 79 | 80 | # Keep only the optimal result 81 | if (reg$adj.r.sq > max) { 82 | max <- reg$adj.r.sq 83 | result.optimal <- list(result[[t]][[1]], result[[t]][[2]], reg$adj.r.sq, reg$coef) 84 | t.optimal <- t 85 | } 86 | } 87 | } 88 | 89 | # Compute the P-value of R square 90 | quiet( 91 | test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater") 92 | ) 93 | result.optimal[[5]] <- test$p.value 94 | 95 | return(result.optimal) 96 | } 97 | 98 | FindOptimalResult <- cmpfun(FindOptimalResult) 99 | 100 | # PatchUp 101 | source("PatchUp.R") 102 | 103 | PatchUp <- cmpfun(PatchUp) 104 | 105 | # Standardize 106 | Standardize <- function(M) { 107 | # Centralize 108 | M <- M - matrix(rep(colMeans(M), times = nrow(M)), nrow = nrow(M) , ncol = ncol(M), byrow = T) 109 | 110 | # Standardize 111 | M <- sweep(M, 2, sqrt(apply(M, 2, crossprod) / nrow(M)), "/") 112 | 113 | return(M) 114 | } 115 | 116 | Standardize <- cmpfun(Standardize) 117 | 118 | # Translate 119 | list.translation <- readRDS("resource/list-lookup/gencode.v26.hg19.genes.rds") 120 | list.translation$gene_id <- substr(list.translation$gene_id, 1, 15) 121 | list.translation$gene_name <- as.character(list.translation$gene_name) 122 | 123 | list.supplementary <- readRDS("resource/list-lookup/list.supplementary.rds") 124 | 125 | Translate <- function(gene.ENSG) { 126 | temp.1 <- list.translation[which(list.translation$gene_id == gene.ENSG), ] 127 | 128 | if (nrow(temp.1) == 0) { 129 | temp.2 <- list.supplementary[which(list.supplementary$ensembl_gene_id == gene.ENSG), ] 130 | 131 | if (nrow(temp.2) == 0) { 132 | gene.proper <- gene.ENSG 133 | } else { 134 | gene.proper <- temp.2[1, 3] 135 | } 136 | } else { 137 | gene.proper <- temp.1[1, 12] 138 | } 139 | 140 | if (gene.proper == "") { 141 | gene.proper <- gene.ENSG 142 | } 143 | 144 | return(gene.proper) 145 | } 146 | 147 | Translate <- cmpfun(Translate) 148 | 149 | ############################## 150 | # Read GTEx-7's subject list # 151 | ############################## 152 | 153 | raw <- readRDS("Whole Blood_QCed_rpkm.rds") 154 | list.7 <- rownames(raw$qced.exp) 155 | 156 | rm(raw) 157 | 158 | ########################################################################## 159 | # Read GTEx-8's expression data (response.8.RData is already translated) # 160 | ########################################################################## 161 | 162 | load("response.8.RData") 163 | 164 | #################### 165 | # List of subjects # 166 | #################### 167 | 168 | # Split subjects in GTEx-8 into two groups 169 | list.8 <- rownames(response.8) 170 | 171 | list.train <- list.8[which(list.8 %in% list.7)] 172 | list.valid <- list.8[which(!(list.8 %in% list.7))] 173 | 174 | #################################### 175 | # There are 19,227 genes in total! # 176 | #################################### 177 | 178 | filenames <- dir("subset-hapmap3") 179 | 180 | # Quick-submit gadget 181 | while (TRUE) { 182 | if (length(dir("quick-submit/to-do")) == 0) { 183 | print("All jobs are done!") 184 | break 185 | } else { 186 | # Get the job 187 | to_do <- as.numeric(dir("quick-submit/to-do")) 188 | if (length(to_do) == 1) { 189 | i <- to_do[1] 190 | } else { 191 | i <- sample(to_do, 1) 192 | } 193 | # Ship the job to "working" folder 194 | save(null.object, file = paste0("quick-submit/working/", i)) 195 | file.remove(paste0("quick-submit/to-do/", i)) 196 | } 197 | 198 | # Start keeping track of runtime 199 | time.start <- proc.time()[3] 200 | 201 | ################# 202 | # Preprocess ss # 203 | ################# 204 | 205 | load(paste0("ss-eqtlgen/subset-hapmap3/", filenames[i])) 206 | 207 | # Get the name and chromosome of to-be-processed gene 208 | gene.ENSG <- ss$Gene[1] 209 | gene.proper <- Translate(gene.ENSG) 210 | chr <- ss$SNPChr[1] 211 | 212 | ########################## 213 | # Clumping and filtering # 214 | ########################## 215 | 216 | do.clumping <- FALSE 217 | 218 | # Obsolete 219 | if (do.clumping) { 220 | ########### 221 | # Extract # 222 | ########### 223 | 224 | # Create a temporary SNP list for Plink 225 | temp.outdir <- "/gpfs/home/zz17/current-project/temp/" 226 | temp.snpfile <- paste0(temp.outdir, gene.ENSG, ".txt") 227 | write.table(ss[, 1], file = temp.snpfile, quote = FALSE, col.names = FALSE, row.names = FALSE) 228 | 229 | # Put together the command and execute 230 | command <- path.Plink 231 | command <- paste0(command) 232 | command <- paste0(command, " --bfile /gpfs/home/zz17/resource/1000-genome/sequence/1000G.EUR.ALLSNP.QC.CHR", chr) 233 | command <- paste0(command, " --chr ", chr) 234 | command <- paste0(command, " --extract ", temp.snpfile) 235 | command <- paste0(command, " --make-bed") 236 | command <- paste0(command, " --out ", temp.outdir, gene.ENSG) 237 | 238 | quiet( 239 | system(command, intern = TRUE) 240 | ) 241 | 242 | ########### 243 | # Process # 244 | ########### 245 | 246 | # Create a temporary ss file and add a p-value column 247 | ss$pval <- 2 * pnorm(abs(ss$Zscore), lower.tail = FALSE) 248 | temp.ssfile <- paste0(temp.outdir, gene.ENSG, "_ss.txt") 249 | write.table(ss, file = temp.ssfile, quote = FALSE, col.names = TRUE, row.names = FALSE) 250 | 251 | # Put together the command and execute 252 | command <- path.Plink 253 | command <- paste0(command, " --bfile ", temp.outdir, gene.ENSG) 254 | command <- paste0(command, " --clump ", temp.ssfile) 255 | command <- paste0(command, " --clump-field pval") 256 | command <- paste0(command, " --clump-kb 250") 257 | command <- paste0(command, " --clump-p1 1") 258 | command <- paste0(command, " --clump-r2 0.95") 259 | command <- paste0(command, " --clump-snp-field SNP") 260 | command <- paste0(command, " --out ", temp.outdir, gene.ENSG) 261 | 262 | quiet( 263 | system(command, intern = TRUE, ignore.stdout = TRUE, ignore.stderr = TRUE) 264 | ) 265 | 266 | clumped.snp <- fread(paste0(temp.outdir, gene.ENSG, ".clumped")) 267 | clumped.snp <- as.data.frame(clumped.snp) 268 | clumped.snp <- clumped.snp[, "SNP"] 269 | 270 | # Clean up and update 271 | system(paste0("rm ", temp.outdir, gene.ENSG, "*")) 272 | 273 | ss <- ss[ss$SNP %in% clumped.snp, ] 274 | } 275 | 276 | ############ 277 | # Quick QC # 278 | ############ 279 | 280 | # Skip this round of iteration if ss is empty to begin with 281 | if (nrow(ss) == 0) { 282 | file.connection <- file(paste0("quick-submit/error/", i, ".error0")) 283 | writeLines("Empty summary statistics to begin with!", file.connection) 284 | close(file.connection) 285 | file.remove(paste0("quick-submit/working/", i)) 286 | save(null.object, file=paste0("quick-submit/done/", i)) 287 | 288 | next 289 | } 290 | 291 | # Read the bim files of GTEx-8 and reference panel 292 | bim.8 <- as.data.frame(fread(paste0("qced_chr", chr, ".bim"))) 293 | bim.ref <- as.data.frame(fread(paste0("1000G.EUR.ALLSNP.QC.CHR", chr, ".bim"))) 294 | 295 | # Only keep the SNPs that are simultaneously IN (reference panel, GTEx-8) 296 | ss <- ss[ss$SNP %in% bim.ref$V2 & ss$SNP %in% bim.8$V2, ] 297 | 298 | # Sieve out problematic SNPs 299 | list.1 <- bim.8$V2[duplicated(bim.8$V2)] 300 | list.2 <- bim.8$V2[nchar(bim.8$V5) > 1 | nchar(bim.8$V6) > 1] 301 | list.3 <- bim.ref$V2[duplicated(bim.ref$V2)] 302 | list.4 <- bim.ref$V2[nchar(bim.ref$V5) > 1 | nchar(bim.ref$V6) > 1] 303 | list.5 <- ss$SNP[duplicated(ss$SNP)] 304 | problem <- ss$SNP %in% list.1 | ss$SNP %in% list.2 | ss$SNP %in% list.3 | ss$SNP %in% list.4 | ss$SNP %in% list.5 305 | 306 | ss <- ss[!problem, ] 307 | 308 | rm(list.1) 309 | rm(list.2) 310 | rm(list.3) 311 | rm(list.4) 312 | rm(list.5) 313 | rm(problem) 314 | 315 | # Skip this round of iteration if ss has few row left 316 | if (nrow(ss) <= 1) { 317 | file.connection <- file(paste0("quick-submit/error/", i, ".error1")) 318 | writeLines("Empty summary statistics after QC!", file.connection) 319 | close(file.connection) 320 | file.remove(paste0("quick-submit/working/", i)) 321 | save(null.object, file=paste0("quick-submit/done/", i)) 322 | 323 | next 324 | } 325 | 326 | ######################################################## 327 | # Make sure that the gene of the hour is in response.8 # 328 | ######################################################## 329 | 330 | if (!(gene.proper %in% colnames(response.8))) { 331 | file.connection <- file(paste0("quick-submit/error/", i, ".error2")) 332 | writeLines("Gene is not in response.8 and we move on to the next gene!", file.connection) 333 | close(file.connection) 334 | file.remove(paste0("quick-submit/working/", i)) 335 | save(null.object, file=paste0("quick-submit/done/", i)) 336 | 337 | next 338 | } 339 | 340 | ################################################# 341 | # Preprocess genotype matrix of reference panel # 342 | ################################################# 343 | 344 | # Extract genotype information 345 | quiet( 346 | seq.ref <- BEDMatrix(paste0("1000G.EUR.ALLSNP.QC.CHR", chr), simple_names = TRUE) 347 | ) 348 | 349 | genotype.ref <- seq.ref[, ss$SNP] 350 | rm(seq.ref) 351 | 352 | # Skip this round of iteration if genotype is empty 353 | if (ncol(genotype.ref) == 0) { 354 | file.connection <- file(paste0("quick-submit/error/", i, ".error3")) 355 | writeLines("Empty reference genotype matrix!", file.connection) 356 | close(file.connection) 357 | file.remove(paste0("quick-submit/working/", i)) 358 | save(null.object, file=paste0("quick-submit/done/", i)) 359 | 360 | next 361 | } 362 | 363 | # Compare to see if all (A1 and A2) pairs are well-aligned 364 | # Always use reference panel's A1/A2 as the standard order 365 | names(bim.ref)[2] <- "SNP" 366 | bim.temp <- subset(bim.ref, SNP %in% ss$SNP) 367 | ss.temp <- left_join(ss, bim.temp, by = "SNP") 368 | problem <- !(ss.temp$a1 == ss.temp$V5) 369 | 370 | if (sum(problem) != 0) { 371 | # Flip the problematic pairs 372 | ss$Zscore[problem] <- -1 * ss$Zscore[problem] 373 | 374 | ss$a1 <- ss.temp$V5 375 | ss$a2 <- ss.temp$V6 376 | } 377 | 378 | rm(bim.temp) 379 | rm(ss.temp) 380 | rm(problem) 381 | 382 | # Patch up the NAs and centralize genotype matrix 383 | genotype.ref <- PatchUp(genotype.ref) 384 | genotype.ref <- Standardize(genotype.ref) 385 | 386 | ####################################### 387 | # Preprocess genotype matrix of GTEx8 # 388 | ####################################### 389 | 390 | # The big genotype matrix 391 | quiet( 392 | seq.8 <- BEDMatrix(paste0("qced_chr", chr), simple_names = TRUE) 393 | ) 394 | 395 | # Remove the "0_" part of subject IDs 396 | rownames(seq.8) <- gsub("0_", "", rownames(seq.8)) 397 | 398 | # Subset by list.8 and ID from ss 399 | genotype.8 <- seq.8[list.8, ss$SNP] 400 | 401 | # Skip this round of iteration if genotype is empty 402 | if (ncol(genotype.8) == 0) { 403 | file.connection <- file(paste0("quick-submit/error/", i, ".error4")) 404 | writeLines("Empty genotype.8 matrix!", file.connection) 405 | close(file.connection) 406 | file.remove(paste0("quick-submit/working/", i)) 407 | save(null.object, file=paste0("quick-submit/done/", i)) 408 | 409 | next 410 | } 411 | 412 | # Compare to see if all (a1 and a2) are well-aligned 413 | names(bim.ref)[2] <- "SNP" 414 | names(bim.8)[2] <- "SNP" 415 | bim.temp1 <- subset(bim.ref, SNP %in% ss$SNP) 416 | bim.temp2 <- subset(bim.8, SNP %in% ss$SNP) 417 | bim.temp3 <- left_join(bim.temp1, bim.temp2, by = "SNP") 418 | problem <- !(bim.temp3$V5.x == bim.temp3$V5.y) 419 | 420 | if (sum(problem) != 0) { 421 | genotype.8[, problem] <- 2 - genotype.8[, problem] 422 | } 423 | 424 | rm(bim.temp1) 425 | rm(bim.temp2) 426 | rm(bim.temp3) 427 | rm(seq.8) 428 | rm(problem) 429 | 430 | ####################################################### 431 | # EXTREMELY ANNOYING ISSUE FOUND!!!!!!!!!!!!!!!!!!!!! # 432 | ####################################################### 433 | 434 | # Subject number is NOT the same for all chromosomes! 435 | genotype.train <- genotype.8[which(rownames(genotype.8) %in% list.train), ] 436 | genotype.train <- scale(PatchUp(genotype.train)) 437 | 438 | genotype.valid <- genotype.8[which(rownames(genotype.8) %in% list.valid), ] 439 | genotype.valid <- scale(PatchUp(genotype.valid)) 440 | 441 | genotype.train[is.na(genotype.train)] <- 0 442 | genotype.valid[is.na(genotype.valid)] <- 0 443 | 444 | ########################## 445 | # The original LD matrix # 446 | ########################## 447 | 448 | matrix.LD <- t(genotype.ref) %*% genotype.ref / nrow(genotype.ref) 449 | 450 | ######################### 451 | # Additional adjustment # 452 | ######################### 453 | 454 | do.adjust <- TRUE 455 | 456 | if (do.adjust) { 457 | # Some given parameters 458 | cutoff <- 0.001 459 | matrix.adjust <- matrix(0, nrow(ss), nrow(ss)) 460 | n.effective <- 11400 461 | n.size <- 183 462 | 463 | temp.cof <- (-2 * n.effective) / n.size 464 | 465 | # Read distance information and create an identifer 466 | distance <- as.data.frame(fread(paste0("chr", chr, ".OMNI.interpolated_genetic_map"))) 467 | names(distance) <- c("SNP", "position", "distance") 468 | 469 | # Merge and process 470 | ss.temp <- left_join(ss, distance, by = "SNP") 471 | 472 | for (col in 1:(nrow(ss.temp) - 1)) { 473 | for (row in (col + 1):nrow(ss.temp)) { 474 | entry.temp <- exp(temp.cof * abs(ss.temp[col, "distance"] - ss.temp[row, "distance"])) 475 | entry.temp <- entry.temp * (entry.temp >= cutoff) 476 | matrix.adjust[row, col] <- entry.temp 477 | } 478 | } 479 | 480 | matrix.adjust <- t(matrix.adjust) + matrix.adjust + diag(nrow(ss.temp)) 481 | 482 | rm(distance) 483 | rm(ss.temp) 484 | rm(temp.cof) 485 | 486 | ################### 487 | # Final LD matrix # 488 | ################### 489 | 490 | # "*" stands for elementwise multiplication 491 | matrix.LD <- matrix.LD * matrix.adjust 492 | } 493 | 494 | ############ 495 | # r vector # 496 | ############ 497 | 498 | # Compute r vector 499 | r <- ss$Zscore / sqrt(ss$NrSamples - 1 + ss$Zscore ^2) 500 | 501 | # Get size 502 | size <- nrow(ss) 503 | 504 | ########################## 505 | # Iterate By parameter s # 506 | ########################## 507 | 508 | result <- list() 509 | 510 | for (k in 1:length(s.array)) { 511 | s <- s.array[k] 512 | 513 | ######################################### 514 | # Constructing the initial lambda array # 515 | ######################################### 516 | 517 | # Get the maximum for lambda 518 | z.temp <- numeric(size) 519 | for (m in 1:size) { 520 | z.temp[m] <- abs(r[m]) 521 | } 522 | 523 | lambda.max <- max(z.temp) 524 | lambda.min <- lambda.max * 1E-3 525 | lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 10000) 526 | rm(z.temp) 527 | 528 | ########################################### 529 | # Explore the possible minimum for lambda # 530 | ########################################### 531 | 532 | ###################################### 533 | # What do 0, 1, and 2 stand for? # 534 | # 0: Not tested # 535 | # 1: tested and no problem detected # 536 | # 2: tested and problem is detected # 537 | ###################################### 538 | 539 | problem <- numeric(10000) 540 | start <- 1 541 | end <- 10000 542 | 543 | max.iteration <- 300 544 | 545 | threshold <- 100 546 | alpha <- 0.5 547 | 548 | if (method == "SCAD") { 549 | gamma <- 3.7 550 | } else { 551 | gamma <- 3 552 | } 553 | 554 | dummy.detect <- TRUE 555 | 556 | while (dummy.detect) { 557 | # Stop if all lambdas are OK 558 | if (sum(problem == rep(1, 10000)) == 10000) { 559 | print("All lambdas check out!") 560 | break 561 | } 562 | 563 | start <- min(which(problem == 0)) 564 | end <- max(which(problem == 0)) 565 | 566 | try <- ceiling((start + end) /2) 567 | lambda <- lambda.array[try] 568 | 569 | if (method == "MCP") { 570 | problem[try] <- MCPDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try) 571 | } 572 | if (method == "LASSO") { 573 | problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold, try) 574 | } 575 | if (method == "ElNet") { 576 | problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold, try) 577 | } 578 | if (method == "MNet") { 579 | problem[try] <- MNetDetect(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold, try) 580 | } 581 | if (method == "SCAD") { 582 | problem[try] <- SCADDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try) 583 | } 584 | 585 | if (problem[try] == 1) { 586 | problem[1:try] <- 1 587 | } else { 588 | problem[try:10000] <- 2 589 | } 590 | 591 | if (problem[try] == 1 & problem[min(10000, try + 1)] == 2) { 592 | print(paste0("Problem detected for lambda=", lambda.array[try + 1], ". Finer interval has been saved!")) 593 | lambda.min <- lambda 594 | dummy.detect <- FALSE 595 | } 596 | 597 | if (problem[try] == 2 & problem[try - 1] == 1) { 598 | print(paste0("Problem detected for lambda=", lambda.array[try], ". Finer interval has been saved!")) 599 | lambda.min <- lambda.array[try - 1] 600 | dummy.detect <- FALSE 601 | } 602 | } 603 | 604 | # Update the lambda array 605 | lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 100) 606 | 607 | ####################################################################### 608 | # Optimizing with the chosen penalty (using the updated lambda array) # 609 | ####################################################################### 610 | 611 | ##################### 612 | # Iterate by lambda # 613 | ##################### 614 | 615 | beta <- t(matrix(0, nrow = 1, ncol = size)) 616 | 617 | max.iteration <- 300 618 | 619 | threshold <- 1e-6 620 | alpha <- 0.5 621 | 622 | if (method == "SCAD") { 623 | gamma <- 3.7 624 | } else { 625 | gamma <- 3 626 | } 627 | 628 | if (method == "MCP") { 629 | res <- MCP(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold) 630 | } 631 | if (method == "LASSO") { 632 | res <- ElNet(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold) 633 | } 634 | if (method == "ElNet") { 635 | res <- ElNet(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold) 636 | } 637 | if (method == "MNet") { 638 | res <- MNet(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold) 639 | } 640 | if (method == "SCAD") { 641 | res <- SCAD(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold) 642 | } 643 | 644 | for (b in 1:100) { 645 | result[[length(result) + 1]] <- list(res[, b], lambda.array[b]) 646 | } 647 | } 648 | 649 | ############################################################################################################################ 650 | # -----. osssso ---------..` # 651 | # `ssssso osssso `sssssssssssss+:` # 652 | # `ssssso `...` osssso `` `ssssssssssssssss/ `...` ``` `...` # 653 | # `ssssso `/osssssss+:` osssso:osssso/` `sssss/ `.:osssss+ `:+sssssss+:` +sssso.+sssso/` -+sssssss+- # 654 | # `ssssso /sssssssssssso- ossssssssssssss- `sssss/ `ssssss` :sssssssssssss- +ssssssssssssso .ossso/::ossso` # 655 | # `ssssso -sssss:``./sssss` osssso-``./sssss``sssss/ osssss..sssss:``./sssss. +sssss-``osssss` ossss/---:ssss+ # 656 | # .ssssso /ssss+ `sssss. ossss/ `sssss.`sssss/ `/ssssso :sssso sssss- +sssss /sssss``sssssssssssssso # 657 | # .:/oso/osssss/ .sssss+--:osssso osssss/--:osssso `ssssso+++ossssss+` `sssss+:-:+sssso` +sssss /sssss` +ssss/` `-` # 658 | # .ossssssssss+` .osssssssssss+` ossssosssssssso. `ssssssssssssss+- .osssssssssss+. +sssss /sssss` `/sssssssssso/` # 659 | # -/+osso+/. ./+oosoo+:` /++++.-/+oo+/- `++++++++//:-.` .:+oosoo+:. :+++++ :+++++` `:/oosoo+/-` # 660 | ############################################################################################################################ 661 | 662 | ######################### 663 | # Select optimal lambda # 664 | ######################### 665 | 666 | ############################# 667 | # Calculate goodness of fit # 668 | ############################# 669 | 670 | ############################################### 671 | # Structure of the list result/result.optimal # 672 | ############################################### 673 | # index: # 674 | # [[1]] beta vector # 675 | # [[2]] lambda # 676 | # [[3]] r square adjusted (to be added) # 677 | # [[4]] regression coefficient # 678 | # [[5]] p-value of r square # 679 | # [[6]] r-square projected by GTEx8 - GTEx7 # 680 | # [[7]] runtime for the gene # 681 | # [[8]] average sample size # 682 | # [[9]] iteration index # 683 | # [[10]] correlation # 684 | ############################################### 685 | 686 | # Get the true response vector 687 | response.true <- response.8[list.train, which(colnames(response.8) == gene.proper)] 688 | 689 | # Figure out if response.true is just a vector or multiple vectors and find the optimal 690 | if (class(response.true) == "numeric") { 691 | result.optimal <- FindOptimalResult(response.true) 692 | } else { 693 | max <- -1 694 | for (asd in 1:ncol(response.true)) { 695 | temp <- FindOptimalResult(response.true[, asd])[[3]] 696 | if (temp > max) { 697 | max <- temp 698 | n.col <- asd 699 | } 700 | } 701 | 702 | response.true <- response.true[, n.col] 703 | result.optimal <- FindOptimalResult(response.true) 704 | } 705 | 706 | # Compute the P-value of R square 707 | response.pred <- genotype.train %*% result.optimal[[1]] 708 | quiet( 709 | test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater") 710 | ) 711 | result.optimal[[5]] <- test$p.value 712 | 713 | # Little tweak to make beta vector an one-column matrix 714 | result.optimal[[1]] <- as.matrix(result.optimal[[1]]) 715 | 716 | ############################## 717 | # Validate model with GTex-8 # 718 | ############################## 719 | 720 | # Get the response vectors 721 | response.true <- response.8[list.valid, which(colnames(response.8) == gene.proper)] 722 | 723 | if (exists("n.col")) { 724 | response.true <- response.true[, n.col] 725 | rm(n.col) 726 | } 727 | 728 | response.pred <- genotype.valid %*% result.optimal[[1]] 729 | 730 | # Cross validation with subject that are NOT INCLUDED in GTEx7 731 | reg <- summary(lm(response.true ~ response.pred)) 732 | result.optimal[[6]] <- reg$adj.r.squared 733 | 734 | # Keep track of runtime 735 | time.end <- proc.time()[3] 736 | result.optimal[[7]] <- time.end - time.start 737 | 738 | # Keep track of average sample size and miscellaneous items 739 | result.optimal[[8]] <- mean(ss$NrSamples) 740 | result.optimal[[9]] <- i 741 | result.optimal[[10]] <- cor(response.true, response.pred) 742 | 743 | ######## 744 | # Save # 745 | ######## 746 | 747 | save <- result.optimal 748 | 749 | dir.create(paste0("output_", batch), showWarnings = FALSE) 750 | save.chr <- paste0("output_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData") 751 | 752 | save(save, file = save.chr) 753 | 754 | ######################## 755 | # For TWAS-fusion only # 756 | ######################## 757 | 758 | # cv.performance 759 | cv.performance <- matrix(nrow = 2, ncol = 1) 760 | colnames(cv.performance) <- method 761 | rownames(cv.performance) <- c("rsq", "pval") 762 | cv.performance[1, 1] <- reg$adj.r.squared 763 | quiet( 764 | test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater") 765 | ) 766 | cv.performance[2, 1] <- test$p.value 767 | 768 | # snps 769 | snps <- ss 770 | 771 | # wgt.matrix 772 | wgt.matrix <- result.optimal[[1]] 773 | colnames(wgt.matrix) <- method 774 | rownames(wgt.matrix) <- ss$SNP 775 | 776 | dir.create(paste0("output_twas_", batch), showWarnings = FALSE) 777 | save.twas.chr <- paste0("output_twas_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData") 778 | save(cv.performance, snps, wgt.matrix, file = save.twas.chr) 779 | 780 | ############### 781 | # Cleaning-up # 782 | ############### 783 | 784 | file.remove(paste0("quick-submit/working/", i)) 785 | save(null.object, file = paste0("quick-submit/done/", i)) 786 | } 787 | -------------------------------------------------------------------------------- /step2_pos.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | array <- Sys.getenv("SLURM_ARRAY_TASK_ID") 3 | id.job <- as.numeric(array) 4 | 5 | # Packages 6 | library(BEDMatrix) 7 | suppressMessages(library(data.table)) 8 | suppressMessages(library(ddpcr)) 9 | suppressMessages(library(dplyr)) 10 | library(optparse) 11 | 12 | # Options 13 | option_list <- list( 14 | make_option("--models", type = "character", default = FALSE, action = "store", help = "Path of the model folder" 15 | ), 16 | make_option("--path.ref", type = "character", default = FALSE, action = "store", help = "Path of the reference panel plus the prefixes" 17 | ), 18 | make_option("--path.ss", type = "character", default = FALSE, action = "store", help = "Path of the summary statistics" 19 | ), 20 | make_option("--trait" , type = "character", default = FALSE, action = "store", help = "Name of the summary statistics" 21 | ), 22 | make_option("--path.out", type = "character", default = FALSE, action = "store", help = "Path of the output file plus the prefixes" 23 | ), 24 | make_option("--parallel", type = "numeric", default = 100, action = "store", help = "The number of parallel instances" 25 | ) 26 | ) 27 | 28 | opt <- parse_args(OptionParser(option_list = option_list)) 29 | 30 | path.model <- opt$models 31 | path.ref <- opt$path.ref 32 | path.ss <- opt$path.ss 33 | trait <- opt$trait 34 | path.out <- opt$path.out 35 | pieces <- opt$parallel 36 | 37 | # User-defined functions 38 | # ACAT 39 | ACAT <- function(Pvals, Weights = NULL) { 40 | if (sum(is.na(Pvals)) > 0) { 41 | stop("Cannot have NAs in the p-values!") 42 | } 43 | if ((sum(Pvals < 0) + sum(Pvals > 1)) > 0){ 44 | stop("P-values must be between 0 and 1!") 45 | } 46 | is.zero <- (sum(Pvals == 0) >= 1) 47 | is.one <- (sum(Pvals == 1) >= 1) 48 | if (is.zero && is.one) { 49 | return(-1) 50 | } 51 | if (is.zero) { 52 | return(0) 53 | } 54 | if (is.one) { 55 | return(1) 56 | } 57 | 58 | if (is.null(Weights)) { 59 | Weights <- rep(1 / length(Pvals), length(Pvals)) 60 | } else if (length(Weights) != length(Pvals)) { 61 | stop("The length of weights should be the same as that of the p-values") 62 | } else if (sum(Weights < 0) > 0){ 63 | stop("All the weights must be positive!") 64 | } else { 65 | Weights <- Weights / sum(Weights) 66 | } 67 | 68 | is.small <- (Pvals < 1e-16) 69 | if (sum(is.small) == 0){ 70 | cct.stat <- sum(Weights * tan((0.5 - Pvals) * pi)) 71 | } else { 72 | cct.stat <- sum((Weights[is.small] / Pvals[is.small]) / pi) 73 | cct.stat <- cct.stat + sum(Weights[!is.small] * tan((0.5 - Pvals[!is.small]) * pi)) 74 | } 75 | 76 | if (cct.stat > 1e15){ 77 | pval <- (1 / cct.stat) / pi 78 | } else { 79 | pval <- pcauchy(cct.stat, lower.tail = F) 80 | } 81 | 82 | return(pval) 83 | } 84 | 85 | # PatchUp 86 | PatchUp <- function(M) { 87 | M <- apply(M, 2, function(x) { 88 | x[is.na(x)] <- mean(x, na.rm = TRUE) 89 | return(x) 90 | }) 91 | 92 | return(M) 93 | } 94 | 95 | # Object to store the output 96 | out <- data.frame( 97 | gene_symbol = character(), 98 | gene_id = character(), 99 | chromosome = numeric(), 100 | model_best = character(), 101 | r2_test = numeric(), 102 | p_SCAD = numeric(), 103 | p_ElNet = numeric(), 104 | p_LASSO = numeric(), 105 | p_MCP = numeric(), 106 | p_MNet = numeric(), 107 | z_SCAD = numeric(), 108 | z_ElNet = numeric(), 109 | z_LASSO = numeric(), 110 | z_MCP = numeric(), 111 | z_MNet = numeric(), 112 | p_ACAT = numeric(), 113 | gene_pos = numeric(), 114 | runtime = numeric(), 115 | stringsAsFactors = FALSE 116 | ) 117 | 118 | # Total number of jobs should be 16884 119 | files <- dir(path.model) 120 | 121 | size.pieces <- ceiling(length(files) / pieces) 122 | 123 | # Main iteration 124 | for (gene.index in (1 + (id.job - 1) * size.pieces):(min(length(files), id.job * size.pieces))) { 125 | print(gene.index) 126 | 127 | # Start tracking runtime 128 | time.start <- proc.time()[3] 129 | 130 | # The vector to store all the updates in this iteration 131 | update <- rep(NA, 18) 132 | 133 | # Load weight 134 | load(paste0(path.model, files[gene.index])) 135 | update[5] <- max(out.r2[1, ], na.rm = TRUE) 136 | if (update[5] <= 0.005) { 137 | next 138 | } 139 | 140 | # Get the chromosome 141 | chr <- snps$SNPChr[1] 142 | 143 | # Read the summary statistics file 144 | if (file.exists(paste0(path.ss, trait, "-", chr, ".sumstats"))) { 145 | ss <- paste0(path.ss, trait, "-", chr, ".sumstats") %>% fread() %>% as.data.frame() 146 | } else { 147 | next 148 | } 149 | names(ss) <- colnames(ss) %>% tolower() 150 | ss["ID"] <- paste0(ss$chr, "_", ss$pos) 151 | 152 | # Load the reference panel 153 | quiet( 154 | bim.ref <- as.data.frame(fread(paste0(path.ref, chr, ".bim"))) 155 | ) 156 | quiet( 157 | genotype.ref <- BEDMatrix(paste0(path.ref, chr), simple_names = TRUE) 158 | ) 159 | 160 | names(bim.ref)[c(2, 5, 6)] <- c("SNP" ,"a1", "a2") 161 | 162 | # Find the best model 163 | model.best <- which.max(out.r2[1, ]) 164 | 165 | # Create new identifier for reference panel 166 | bim.ref["ID"] <- paste0(bim.ref$V1, "_", bim.ref$V4) 167 | 168 | # Find the common snps in all three data sets 169 | list.common <- intersect(bim.ref$ID, snps$ID) %>% intersect(., ss$ID) 170 | 171 | # Skip if no common snps found 172 | if (length(list.common) == 0) { 173 | next 174 | } 175 | 176 | # Trim genotype.ref 177 | genotype.temp <- genotype.ref[, bim.ref$ID %in% list.common] 178 | bim.temp <- bim.ref[bim.ref$ID %in% list.common, ] 179 | 180 | # Fix the NAs in reference panel 181 | if (sum(is.na(genotype.temp)) != 0) { 182 | genotype.temp <- PatchUp(genotype.temp) 183 | } 184 | 185 | # Trim ss 186 | ss.temp <- ss[ss$ID %in% list.common, ] 187 | 188 | # Trim snps and wgt.matrix 189 | index.temp <- snps$ID %in% list.common 190 | 191 | snps <- snps[index.temp, ] 192 | out.weight <- out.weight[index.temp, ] 193 | if ("numeric" %in% class(out.weight)) { 194 | out.weight <- t(as.matrix(out.weight)) 195 | } 196 | 197 | rm(index.temp) 198 | 199 | # New 200 | # Re-arrange every data sets 201 | m.1 <- match(ss.temp$ID, bim.temp$ID) 202 | m.2 <- match(ss.temp$ID, snps$ID) 203 | 204 | bim.temp <- bim.temp[m.1, ] 205 | genotype.temp <- genotype.temp[, m.1] 206 | 207 | snps <- snps[m.2, ] 208 | out.weight <- out.weight[m.2, ] 209 | 210 | # Align the mismatched alleles 211 | problem.1 <- ss.temp$a1 != bim.temp$a1 212 | genotype.temp[, problem.1] <- 2 - genotype.temp[, problem.1] 213 | 214 | problem.2 <- ss.temp$a1 != snps$a1 215 | # flipped <- snps$SNP[problem.2] 216 | out.weight[problem.2, ] <- -1 * out.weight[problem.2, ] 217 | 218 | # # Old 219 | # # Re-arrange every data sets 220 | # m.1 <- match(bim.temp$ID, ss.temp$ID) 221 | # m.2 <- match(bim.temp$ID, snps$ID) 222 | 223 | # ss.temp <- ss.temp[m.1, ] 224 | # snps <- snps[m.2, ] 225 | # out.weight <- out.weight[m.2, ] 226 | 227 | # # Align the mismatched alleles 228 | # problem <- ss.temp$a1 != snps$a1 229 | # ss.temp$a1 <- snps$a1 230 | # ss.temp$a2 <- snps$a2 231 | # ss.temp$beta[problem] <- -1 * ss.temp$beta[problem] 232 | # ss.temp$z[problem] <- -1 * ss.temp$z[problem] 233 | 234 | # Compute LD matrix 235 | genotype.temp <- scale(genotype.temp) 236 | matrix.LD <- t(genotype.temp) %*% genotype.temp / (nrow(genotype.temp) - 1) 237 | 238 | # Catch: When there is only one row in wgt.matrix 239 | if ("numeric" %in% class(out.weight)) { 240 | out.weight <- out.weight %>% as.matrix() %>% t() %>% as.data.frame() 241 | } 242 | 243 | # Catch: Over-fitting 244 | for (qwe in 1:5) { 245 | if (max(abs(out.weight[, qwe])) >= 10) { 246 | out.weight[, qwe] <- 0 247 | } 248 | } 249 | 250 | # Iteration by method 251 | for (w in 1:ncol(out.weight)) { 252 | # Settings 253 | weights <- out.weight[, w] 254 | 255 | # Skip if weight is a zero vector 256 | if (sum(weights) == 0) { 257 | update[5 + w] <- NA 258 | next 259 | } 260 | 261 | # Keep the non-zero components of weights vector 262 | keep <- (weights != 0) 263 | print(sum(keep)) 264 | weights <- weights[keep] 265 | #print(sum(names(weights) %in% flipped)) 266 | 267 | # Compute TWAS z-score, r2, and p-value 268 | z.twas <- as.numeric(weights %*% ss.temp$z[keep]) 269 | r2.twas <- as.numeric(weights %*% matrix.LD[keep, keep] %*% weights) 270 | 271 | update[5 + w] <- 2 * (pnorm(abs(z.twas / sqrt(r2.twas)), lower.tail = F)) 272 | update[10 + w] <- z.twas 273 | } 274 | 275 | # ACAT 276 | check.na <- !is.na(update[6:10]) 277 | check.sign <- out.r2[1, ] > 0 278 | 279 | check.final <- check.na & check.sign 280 | 281 | if (sum(check.final) == 0) { 282 | update[16] <- NA 283 | } else { 284 | update[16] <- ACAT(update[6:10][check.final], out.r2[1, check.final] / sum(out.r2[1, check.final])) 285 | } 286 | 287 | # Stop tracking runtime 288 | time.end <- proc.time()[3] 289 | 290 | ################# 291 | # Output format # 292 | ################# 293 | # 1. Gene symbol 294 | # 2. ENSG id 295 | # 3. Chromosome 296 | # 4. Best model 297 | # 5. Best model's R^2 on testing data (Skipped) 298 | # 6-10. P-value of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet) 299 | # 11-15. Z-score of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet) 300 | # 16. P-value from ACAT 301 | # 17. Gene position 302 | # 18. Runtime 303 | 304 | # Update 305 | update[1] <- snps$GeneSymbol[1] 306 | update[2] <- snps$Gene[1] 307 | update[3] <- chr 308 | update[4] <- names(model.best) 309 | update[17] <- snps$GenePos[1] 310 | update[18] <- time.end - time.start 311 | 312 | out[nrow(out) + 1, ] <- update 313 | } 314 | 315 | # Write the result 316 | dir.create(paste0(path.out, trait)) 317 | write.table(out, file = paste0(path.out, trait, "/", trait, "-", id.job), row.names = FALSE, quote = FALSE) 318 | -------------------------------------------------------------------------------- /step2_rsid.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | array <- Sys.getenv("SLURM_ARRAY_TASK_ID") 3 | job.id <- as.numeric(array) 4 | 5 | # Packages 6 | library(BEDMatrix) 7 | suppressMessages(library(data.table)) 8 | suppressMessages(library(ddpcr)) 9 | suppressMessages(library(dplyr)) 10 | library(optparse) 11 | 12 | # Options 13 | option_list <- list( 14 | make_option("--models", type = "character", default = FALSE, action = "store", help = "Path of the model folder" 15 | ), 16 | make_option("--path.ref", type = "character", default = FALSE, action = "store", help = "Path of the reference panel plus the prefixes" 17 | ), 18 | make_option("--path.ss", type = "character", default = FALSE, action = "store", help = "Path of the summary statistics" 19 | ), 20 | make_option("--trait" , type = "character", default = FALSE, action = "store", help = "Name of the summary statistics" 21 | ), 22 | make_option("--path.out", type = "character", default = FALSE, action = "store", help = "Path of the output file plus the prefixes" 23 | ), 24 | make_option("--parallel", type = "numeric", default = 100, action = "store", help = "The number of parallel instances" 25 | ) 26 | ) 27 | 28 | opt <- parse_args(OptionParser(option_list = option_list)) 29 | 30 | path.model <- opt$models 31 | path.ref <- opt$path.ref 32 | path.ss <- opt$path.ss 33 | trait <- opt$trait 34 | path.out <- opt$path.out 35 | pieces <- opt$parallel 36 | 37 | # User-defined functions 38 | # ACAT 39 | ACAT <- function(Pvals, Weights = NULL) { 40 | if (sum(is.na(Pvals)) > 0) { 41 | stop("Cannot have NAs in the p-values!") 42 | } 43 | 44 | if ((sum(Pvals < 0) + sum(Pvals > 1)) > 0){ 45 | stop("P-values must be between 0 and 1!") 46 | } 47 | 48 | is.zero <- (sum(Pvals == 0) >= 1) 49 | is.one <- (sum(Pvals == 1) >= 1) 50 | if (is.zero && is.one) { 51 | return(-1) 52 | } 53 | 54 | if (is.zero) { 55 | return(0) 56 | } 57 | 58 | if (is.one) { 59 | return(1) 60 | } 61 | 62 | if (is.null(Weights)) { 63 | Weights <- rep(1 / length(Pvals), length(Pvals)) 64 | } else if (length(Weights) != length(Pvals)) { 65 | stop("The length of weights should be the same as that of the p-values") 66 | } else if (sum(Weights < 0) > 0){ 67 | stop("All the weights must be positive!") 68 | } else { 69 | Weights <- Weights / sum(Weights) 70 | } 71 | 72 | is.small <- (Pvals < 1e-16) 73 | if (sum(is.small) == 0){ 74 | cct.stat <- sum(Weights * tan((0.5 - Pvals) * pi)) 75 | } else { 76 | cct.stat <- sum((Weights[is.small] / Pvals[is.small]) / pi) 77 | cct.stat <- cct.stat + sum(Weights[!is.small] * tan((0.5 - Pvals[!is.small]) * pi)) 78 | } 79 | 80 | if (cct.stat > 1e15){ 81 | pval <- (1 / cct.stat) / pi 82 | } else { 83 | pval <- pcauchy(cct.stat, lower.tail = F) 84 | } 85 | 86 | return(pval) 87 | } 88 | 89 | # PatchUp 90 | PatchUp <- function(M) { 91 | M <- apply(M, 2, function(x) { 92 | x[is.na(x)] <- mean(x, na.rm = TRUE) 93 | return(x) 94 | }) 95 | 96 | return(M) 97 | } 98 | 99 | # Object to store the output 100 | out <- data.frame( 101 | gene_symbol = character(), 102 | gene_id = character(), 103 | chromosome = numeric(), 104 | model_best = character(), 105 | r2_test = numeric(), 106 | p_SCAD = numeric(), 107 | p_ElNet = numeric(), 108 | p_LASSO = numeric(), 109 | p_MCP = numeric(), 110 | p_MNet = numeric(), 111 | z_SCAD = numeric(), 112 | z_ElNet = numeric(), 113 | z_LASSO = numeric(), 114 | z_MCP = numeric(), 115 | z_MNet = numeric(), 116 | p_ACAT = numeric(), 117 | gene_pos = numeric(), 118 | runtime = numeric(), 119 | stringsAsFactors = FALSE 120 | ) 121 | 122 | # Total number of jobs = 16864 123 | files <- dir(path.model) 124 | 125 | size.pieces <- ceiling(length(files) / pieces) 126 | 127 | # Main iteration 128 | for (gene.index in (1 + (id.job - 1) * size.pieces):(min(length(files), id.job * size.pieces))) { 129 | print(gene.index) 130 | 131 | # Start tracking runtime 132 | time.start <- proc.time()[3] 133 | 134 | # The vector to store all the updates in this iteration 135 | update <- rep(NA, 18) 136 | 137 | # Load weight 138 | load(paste0(path.model, files[gene.index])) 139 | update[5] <- max(out.r2[1, ], na.rm = TRUE) 140 | if (update[5] <= 0.005) { 141 | next 142 | } 143 | 144 | # Get the chromosome 145 | chr <- snps$SNPChr[1] 146 | 147 | # Read the summary statistics file 148 | if (file.exists(paste0(path.ss, trait, "-", chr, ".sumstats"))) { 149 | ss <- paste0(path.ss, trait, "-", chr, ".sumstats" ) %>% fread() %>% as.data.frame() 150 | } else { 151 | next 152 | } 153 | names(ss) <- colnames(ss) %>% tolower() 154 | 155 | # Load the reference panel 156 | quiet( 157 | bim.ref <- as.data.frame(fread(paste0(path.ref, chr, ".bim"))) 158 | ) 159 | quiet( 160 | genotype.ref <- BEDMatrix(paste0(path.ref, chr), simple_names = TRUE) 161 | ) 162 | 163 | names(bim.ref)[c(2, 5, 6)] <- c("SNP" ,"a1", "a2") 164 | 165 | # Find the best model 166 | model.best <- which.max(out.r2[1, ]) 167 | 168 | # Find the common snps in all three data sets 169 | list.common <- intersect(intersect(bim.ref$SNP, snps$SNP), ss$snp) 170 | 171 | # Skip if no common snps found 172 | if (length(list.common) == 0) { 173 | next 174 | } 175 | 176 | # Trim genotype.ref 177 | genotype.temp <- genotype.ref[, bim.ref$SNP %in% list.common] 178 | bim.temp <- bim.ref[bim.ref$SNP %in% list.common, ] 179 | 180 | # Fix the NAs in reference panel and scale 181 | if (sum(is.na(genotype.temp)) != 0) { 182 | genotype.temp <- PatchUp(genotype.temp) 183 | } 184 | 185 | genotype.temp <- scale(genotype.temp) 186 | 187 | # Trim ss 188 | ss.temp <- ss[ss$snp %in% list.common, ] 189 | 190 | # Trim snps and wgt.matrix 191 | index.temp <- snps$SNP %in% list.common 192 | 193 | snps <- snps[index.temp, ] 194 | out.weight <- out.weight[index.temp, ] 195 | if ("numeric" %in% class(out.weight)) { 196 | out.weight <- t(as.matrix(out.weight)) 197 | } 198 | 199 | rm(index.temp) 200 | 201 | # Re-arrange every data sets 202 | m.1 <- match(bim.temp$SNP, ss.temp$snp) 203 | m.2 <- match(bim.temp$SNP, snps$SNP) 204 | 205 | ss.temp <- ss.temp[m.1, ] 206 | snps <- snps[m.2, ] 207 | out.weight <- out.weight[m.2, ] 208 | 209 | # Align the mismatched alleles 210 | problem <- ss.temp$a1 != snps$a1 211 | ss.temp$a1 <- snps$a1 212 | ss.temp$a2 <- snps$a2 213 | ss.temp$beta[problem] <- -1 * ss.temp$beta[problem] 214 | ss.temp$z[problem] <- -1 * ss.temp$z[problem] 215 | 216 | # Compute LD matrix 217 | matrix.LD <- t(genotype.temp) %*% genotype.temp / (nrow(genotype.temp) - 1) 218 | 219 | # Catch: When there is only one row in wgt.matrix 220 | if ("numeric" %in% class(out.weight)) { 221 | out.weight <- out.weight %>% as.matrix() %>% t() %>% as.data.frame() 222 | } 223 | 224 | # Catch: Over-fitting 225 | for (qwe in 1:5) { 226 | if (max(abs(out.weight[, qwe])) >= 10) { 227 | out.weight[, qwe] <- 0 228 | } 229 | } 230 | 231 | # Iteration by method 232 | for (w in 1:ncol(out.weight)) { 233 | # Settings 234 | weights <- out.weight[, w] 235 | 236 | # Skip if weight is a zero vector 237 | if (sum(weights) == 0) { 238 | update[5 + w] <- NA 239 | next 240 | } 241 | 242 | # Keep the non-zero components of weights vector 243 | keep <- (weights != 0) 244 | weights <- weights[keep] 245 | 246 | # Compute TWAS z-score, r2, and p-value 247 | z.twas <- as.numeric(weights %*% ss.temp$z[keep]) 248 | r2.twas <- as.numeric(weights %*% matrix.LD[keep, keep] %*% weights) 249 | 250 | update[5 + w] <- 2 * (pnorm(abs(z.twas / sqrt(r2.twas)), lower.tail = F)) 251 | update[10 + w] <- z.twas 252 | } 253 | 254 | # ACAT 255 | check.na <- !is.na(update[6:10]) 256 | check.sign <- out.r2[1, ] > 0 257 | 258 | check.final <- check.na & check.sign 259 | 260 | if (sum(check.final) == 0) { 261 | update[16] <- NA 262 | } else { 263 | update[16] <- ACAT(update[6:10][check.final], out.r2[1, check.final] / sum(out.r2[1, check.final])) 264 | } 265 | 266 | # Stop tracking runtime 267 | time.end <- proc.time()[3] 268 | 269 | ################# 270 | # Output format # 271 | ################# 272 | # 1. Gene symbol 273 | # 2. ENSG id 274 | # 3. Chromosome 275 | # 4. Best model 276 | # 5. Best model's R^2 on testing data (Skipped) 277 | # 6-10. P-value of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet) 278 | # 11-15. Z-score of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet) 279 | # 16. P-value from ACAT 280 | # 17. Gene position 281 | # 18. Runtime 282 | 283 | # Update 284 | update[1] <- snps$GeneSymbol[1] 285 | update[2] <- snps$Gene[1] 286 | update[3] <- chr 287 | update[4] <- names(model.best) 288 | update[17] <- snps$GenePos[1] 289 | update[18] <- time.end - time.start 290 | 291 | out[nrow(out) + 1, ] <- update 292 | } 293 | 294 | # Write the result 295 | dir.create(paste0(path.out, trait)) 296 | write.table(out, file = paste0(path.out, trait, "/", trait, "-", id.job), row.names = FALSE, quote = FALSE) 297 | --------------------------------------------------------------------------------