├── APSS.R
├── CITATION.cff
├── DetectAndSolve_debugged.cpp
├── README.md
├── illustration.bmp
├── mainbody_cpp_pos_precise.R
├── mainbody_cpp_rsid_precise.R
├── step2_pos.R
└── step2_rsid.R


/APSS.R:
--------------------------------------------------------------------------------
  1 | APSS <- function(directory.working, filename, auto = FALSE, do.return = FALSE, BIG = 2) {
  2 |     # Packages
  3 |     suppressMessages(library(data.table))
  4 |     suppressMessages(library(magrittr))
  5 |     suppressMessages(library(R.utils))
  6 |     suppressMessages(library(stringr))
  7 | 
  8 |     # Length of separator
  9 |     length.separator <- 120
 10 |     separator <- rep("=", length.separator) %>% paste0(., collapse = "")
 11 | 
 12 |     # Set working directory
 13 |     setwd(directory.working)
 14 | 
 15 |     # Read the input file
 16 |     check.BIG <- FALSE
 17 |     if (file.size(filename) >= BIG * 1024 * 1024 * 1024) {
 18 |         check.BIG <- TRUE
 19 |         cat(paste0("Input dataset is VERY VERY VERY BIG. We have to be selective :-<<<"), sep = "\n")
 20 |         cat(separator, sep = "\n")
 21 |         raw.try <- fread(filename, showProgress = FALSE, nrows = 100) %>% as.data.frame()
 22 | 
 23 |         # Print the head of the input dataset
 24 |         cat("Following are the first few lines of the dataset:", sep = "\n")
 25 |         cat(separator, sep = "\n")
 26 |         print(head(raw.try))
 27 |         cat(separator, sep = "\n")
 28 | 
 29 |         # Ask user to pick some columns 
 30 |         cat("Press ENTER to skip a column! Write something if you want to keep a column. ", sep = "\n")
 31 |         cat(separator, sep = "\n")
 32 |         keep <- character(ncol(raw.try))
 33 |         for (index in 1:ncol(raw.try)) {
 34 |             keep[index] <- readline(prompt = paste0("Keep ", colnames(raw.try)[index], "? "))
 35 |         }
 36 |         cat(separator, sep = "\n")
 37 |         keep <- which(keep != "")
 38 | 
 39 |         rm(raw.try)
 40 | 
 41 |         raw <- fread(filename, showProgress = FALSE, select = keep, fill = TRUE) %>% as.data.frame()
 42 |     } else {
 43 |         raw <- fread(filename, showProgress = FALSE, fill = TRUE) %>% as.data.frame()
 44 |     }
 45 | 
 46 |     if (check.BIG) {
 47 |         cat(paste0(nrow(raw), " lines read from the selected raw summary statistics file!"), sep = "\n")
 48 |     } else {
 49 |         cat(paste0(nrow(raw), " lines read from the raw summary statistics file!"), sep = "\n")
 50 |     }
 51 |     cat(separator, sep = "\n")
 52 | 
 53 |     # Get the header
 54 |     header.inner <- colnames(raw)
 55 | 
 56 |     # Auto mode or not?
 57 |     if ("V1" %in% header.inner | "V2" %in% header.inner | "V3" %in% header.inner & auto) {
 58 |         cat("I don't see any header! Switch to interactive mode!", sep = "\n")
 59 |         cat(separator, sep = "\n")
 60 |     }
 61 | 
 62 |     # Start interacting
 63 |     if (auto == FALSE) {
 64 |         # Print the head of the input data set
 65 |         cat("Following are the first few lines of the dataset:", sep = "\n")
 66 |         cat(separator, sep = "\n")
 67 |         print(head(raw))
 68 |         cat(separator, sep = "\n")
 69 | 
 70 |         # Ask user for reasonable column names
 71 |         cat("Feel free to input whatever column names you want as long as they are unique! Press ENTER to skip a column!", sep = "\n")
 72 |         for (index in 1:ncol(raw)) {
 73 |             header.inner[index] <- readline(prompt = paste0("What would be a proper column name for ", colnames(raw)[index], "? "))
 74 |         }
 75 |         cat(separator, sep = "\n")
 76 |     }
 77 | 
 78 |     # Make a copy of the desired header
 79 |     header.outer <- header.inner
 80 | 
 81 |     ######################
 82 |     # Process the header #
 83 |     ######################
 84 | 
 85 |     # Initialize the header
 86 |     header.inner <- tolower(header.inner)
 87 | 
 88 |     # SNP
 89 |     try.snp <- c("snp", "markername", "snpid", "rs", "rsid", "rs_number", "snps")
 90 |     header.inner[header.inner %in% try.snp] <- "SNP"
 91 | 
 92 |     # A1
 93 |     try.a1 <- c("a1", "allele1", "allele_1", "effect_allele", "reference_allele", "inc_allele", "ea", "ref", "a1lele1", "al1ele1")
 94 |     header.inner[header.inner %in% try.a1] <- "A1"
 95 | 
 96 |     # A2
 97 |     try.a2 <- c("a2", "allele2", "allele_2", "other_allele", "non_effect_allele", "dec_allele", "nea", "alt", "a0")
 98 |     header.inner[header.inner %in% try.a2] <- "A2"
 99 | 
100 |     # Z-score
101 |     try.z <- c("zscore", "z-score", "gc_zscore", "z")
102 |     header.inner[header.inner %in% try.z] <- "Z"
103 | 
104 |     # P
105 |     try.p <- c("pvalue", "p_value", "pval", "p_val", "gc_pvalue", "p")
106 |     header.inner[header.inner %in% try.p] <- "P"
107 | 
108 |     # Beta
109 |     try.beta <- c("b", "beta", "effects", "effect")
110 |     header.inner[header.inner %in% try.beta] <- "BETA"
111 | 
112 |     # Odds ratio
113 |     try.or <- c("or")
114 |     header.inner[header.inner %in% try.or] <- "ODDS_RATIO"
115 | 
116 |     # Log odds
117 |     try.logodds <- c("log_odds", "logor", "log_or")
118 |     header.inner[header.inner %in% try.logodds] <- "LOG_ODDS"
119 | 
120 |     # MAF
121 |     try.maf <- c("eaf", "frq", "maf", "frq_u", "f_u", "freq")
122 |     header.inner[header.inner %in% try.maf] <- "MAF"
123 | 
124 |     # INFO
125 |     try.info <- c("info", "info_score")
126 |     header.inner[header.inner %in% try.info] <- "INFO"
127 | 
128 |     # Chromosome
129 |     try.chromosome <- c("chrom", "ch", "chr", "chromosome")
130 |     header.inner[header.inner %in% try.chromosome] <- "CHROMOSOME"
131 | 
132 |     # Position
133 |     try.position <- c("pos", "posit", "position", "bp", "bpos")
134 |     header.inner[header.inner %in% try.position] <- "POSITION"
135 | 
136 |     # Standard error
137 |     try.se <- c("se", "sebeta", "beta_se")
138 |     header.inner[header.inner %in% try.se] <- "SE"
139 | 
140 |     # CHR plus POSITION
141 |     try.chrpos <- c("chr:pos", "chr_pos", "chr-pos", "chrpos")
142 |     header.inner[header.inner %in% try.chrpos] <- "CHR_POS"
143 | 
144 |     # Samplesize
145 |     try.samplesize <- c("n", "samplesize", "num_samples", "sample")
146 |     header.inner[header.inner %in% try.samplesize] <- "SAMPLESIZE"
147 | 
148 |     # Update the header
149 |     colnames(raw) <- header.inner
150 | 
151 |     # Drop some rows
152 |     n.start <- ncol(raw)
153 | 
154 |     raw <- raw[, which(colnames(raw) != "")]
155 |     header.outer <- header.outer[header.outer != ""]
156 |     header.inner <- header.inner[header.inner != ""]
157 | 
158 |     n.end <- ncol(raw)
159 |     cat(paste0(n.start - n.end, " columns were dropped from the input dataset!"), sep = "\n")
160 |     cat(separator, sep = "\n")
161 | 
162 |     # Double-check the class of each column and coerce if needed
163 |     list.coerce <- c("Z", "P", "BETA", "ODDS_RATIO", "LOG_ODDS", "MAF", "INFO", "SE")
164 | 
165 |     options(warn = -1)
166 |     if ("POSITION" %in% header.inner) {
167 |         if (class(raw$POSITION) == "character") {
168 |             class(raw$POSITION) <- "integer"
169 |             cat(paste0("Column POSITION has wrong class and has been coerced to integer."), sep = "\n")
170 |             cat(separator, sep = "\n")
171 |         }
172 |     }
173 | 
174 |     for (i in 1:length(header.inner)) {
175 |         if (header.inner[i] %in% list.coerce) {
176 |             if (class(raw[, header.inner[i]]) != "numeric") {
177 |                 class(raw[, header.inner[i]]) <- "numeric"
178 |                 cat(paste0("Column ", header.inner[i], " has wrong class and has been coerced to numeric."), sep = "\n")
179 |                 cat(separator, sep = "\n")
180 |             }
181 |         }
182 |     }
183 |     options(warn = 0)
184 | 
185 |     # Drop rows with missing values
186 |     do.missing <- readline(prompt = "Drop rows with missing values? Y/N ")
187 | 
188 |     if (do.missing == "Y") {
189 |         n.start <- nrow(raw)
190 | 
191 |         raw <- raw[complete.cases(raw), ]
192 | 
193 |         n.end <- nrow(raw)
194 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
195 | 
196 |         if ((n.start - n.end) / n.start >= 0.8) {
197 |             cat(paste0(round((n.start - n.end) / n.start * 100, digits = 2), "% of rows were removed!"), sep = "\n")
198 |             cat("Please check the dataset manually because I am sensing something fishy.")
199 |             Sys.sleep(5)
200 |             .Internal(.invokeRestart(list(NULL, NULL), NULL))
201 |         }
202 |         cat(separator, sep = "\n")
203 |     }
204 | 
205 |     # Split CHR_POS
206 |     if ("CHR_POS" %in% header.inner) {
207 |         do.split <- readline(prompt = paste0("Do you want me to split the ID column? (This could take a while.) Y/N ")) %>% toupper()
208 |         cat(separator, sep = "\n")
209 |         if (do.split == "Y") {
210 |             split.temp <- str_split(raw$CHR_POS, ":") %>% unlist()
211 |             if (sum(grepl("chr", raw$CHR_POS)) != 0) {
212 |                 split.temp[2 * (1:nrow(raw)) - 1] <- gsub("chr", "", split.temp[2 * (1:nrow(raw)) - 1])
213 |             }
214 | 
215 |             split.temp <- as.numeric(split.temp)
216 |             split.temp <- matrix(split.temp, nrow = nrow(raw), ncol = 2, byrow = TRUE) %>% as.data.frame()
217 |             names(split.temp) <- c("V1", "V2")
218 | 
219 |             header.inner <- c(header.inner, "CHROMOSOME", "POSITION")
220 |             raw["CHROMOSOME"] <- split.temp[, 1]
221 |             raw["POSITION"]   <- split.temp[, 2]
222 | 
223 |             # Print the head of split.temp
224 |             cat("Following are the first few lines of the splitted column:", sep = "\n")
225 |             cat(separator, sep = "\n")
226 |             print(head(split.temp))
227 |             cat(separator, sep = "\n")
228 | 
229 |             # Ask user for reasonable column names
230 |             cat("Feel free to input whatever column names you want as long as they are unique! Press ENTER to skip a column!", sep = "\n")
231 |             for (index in 1:ncol(split.temp)) {
232 |                 header.outer[length(header.outer) + 1] <- readline(prompt = paste0("What would be a proper column name for ", colnames(split.temp)[index], "? "))
233 |             }
234 |             cat(separator, sep = "\n")
235 |         }
236 |     }
237 | 
238 |     # Problematic rsid
239 |     test.1 <- grepl(":", raw$SNP)
240 |     test.2 <- grepl("_", raw$SNP)
241 |     test.3 <- grepl("chr", raw$SNP)
242 |     if (sum(test.1 & test.3) == nrow(raw)) {
243 |         cat("I noticed a pattern (chrX:position) in SNPs' IDs!", sep = "\n")
244 |         do.split <- readline(prompt = paste0("Do you want me to split the ID column? (This could take a while.) Y/N "))
245 |         cat(separator, sep = "\n")
246 |         if (do.split == "Y") {
247 |             split.temp <- str_split(raw$SNP, ":") %>% unlist()
248 |             split.temp[2 * (1:nrow(raw)) - 1] <- gsub("chr", "", split.temp[2 * (1:nrow(raw)) - 1])
249 | 
250 |             split.temp <- as.numeric(split.temp)
251 |             split.temp <- matrix(split.temp, nrow = nrow(raw), ncol = 2, byrow = TRUE) %>% as.data.frame()
252 |             names(split.temp) <- c("V1", "V2")
253 | 
254 |             header.inner <- c(header.inner, "CHROMOSOME", "POSITION")
255 |             raw["CHROMOSOME"] <- split.temp[, 1]
256 |             raw["POSITION"]   <- split.temp[, 2]
257 | 
258 |             # Print the head of split.temp
259 |             cat("Following are the first few lines of the splitted column:", sep = "\n")
260 |             cat(separator, sep = "\n")
261 |             print(head(split.temp))
262 |             cat(separator, sep = "\n")
263 | 
264 |             # Ask user for reasonable column names
265 |             cat("Feel free to input whatever column names you want as long as they are unique! Press ENTER to skip a column!", sep = "\n")
266 |             for (index in 1:ncol(split.temp)) {
267 |                 header.outer[length(header.outer) + 1] <- readline(prompt = paste0("What would be a proper column name for ", colnames(split.temp)[index], "? "))
268 |             }
269 |             cat(separator, sep = "\n")
270 |         }
271 |     }
272 | 
273 |     if (sum(test.1) != 0) {
274 |         cat(paste0(sum(test.1), " SNPs have ':' in their IDs. Please be careful!"), sep = "\n")
275 |         cat(separator, sep = "\n")
276 |     }
277 | 
278 |     if (sum(test.2) != 0) {
279 |         cat(paste0(sum(test.2), " SNPs have '_' in their IDs. Please be careful!"), sep = "\n")
280 |         cat(separator, sep = "\n")
281 |     }
282 | 
283 |     # Compare raw sumstats with the "list" (if provided)
284 |     do.hapmap3 <- readline(prompt = paste0("Subset the dataset by HapMap3? Y/N "))
285 |     cat(separator, sep = "\n")
286 | 
287 |     if (do.hapmap3 == "Y") {
288 |         if (!file.exists("w_hm3.snplist.bz2")) {
289 |             cat("I can't find HapMap3 list file and will try to download one from the Internet.", sep = "\n")
290 |             cat(separator, sep = "\n")
291 |             download.file("https://data.broadinstitute.org/alkesgroup/LDSCORE/w_hm3.snplist.bz2", destfile = "w_hm3.snplist.bz2", quiet = TRUE)
292 |         }
293 |         list.hapmap3 <- as.data.frame(fread("w_hm3.snplist.bz2"))
294 | 
295 |         n.start <- nrow(raw)
296 | 
297 |         raw <- raw[raw$SNP %in% list.hapmap3$SNP, ]
298 | 
299 |         n.end <- nrow(raw)
300 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
301 |         cat(separator, sep = "\n")
302 |     }
303 | 
304 |     # Drop rows with MAF<0.01
305 |     if ("MAF" %in% header.inner) {
306 |         do.maf <- readline(prompt = paste0("Drop rows with MAF<=0.01? Y/N "))
307 |         if (do.maf == "Y") {
308 |         n.start <- nrow(raw)
309 | 
310 |         raw <- raw[raw$MAF >= 0.01,]
311 | 
312 |         n.end <- nrow(raw)
313 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
314 |         cat(separator, sep = "\n")
315 |         } else {
316 |         cat(separator, sep = "\n")
317 |         }
318 |     }
319 | 
320 |     # Drop rows with INFO<0.9
321 |     if ("INFO" %in% header.inner) {
322 |         do.info <- readline(prompt = paste0("Drop rows with INFO <= 0.9? Y/N "))
323 |         if (do.info == "Y") {
324 |         n.start <- nrow(raw)
325 | 
326 |         raw <- raw[raw$INFO >= 0.9,]
327 | 
328 |         n.end <- nrow(raw)
329 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
330 |         cat(separator, sep = "\n")
331 |         } else {
332 |         cat(separator, sep = "\n")
333 |         }
334 |     }
335 | 
336 |     # Drop rows not on chromosome 1-22
337 |     if ("CHROMOSOME" %in% header.inner) {
338 |         raw$CHROMOSOME <- as.character(raw$CHROMOSOME)
339 |         if (length(union(unique(raw$CHROMOSOME), as.character(1:22))) != 22) {
340 |             do.dropXY <- readline(prompt = paste0("Drop SNPs on chromosome X and Y? Y/N "))
341 |             if (do.dropXY == "Y") {
342 |                 n.start <- nrow(raw)
343 |                 raw <- raw[raw$CHROMOSOME %in% as.character(1:22), ]
344 |                 n.end <- nrow(raw)
345 |                 cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
346 |                 cat(separator, sep = "\n")
347 |             }
348 |         }
349 |     }
350 | 
351 |     # Drop rows for being on the MHC region
352 |     if ("CHROMOSOME" %in% header.inner & "POSITION" %in% header.inner) {
353 |         do.MHC <- readline(prompt = paste0("Drop rows on MHC region? Y/N "))
354 |         if (do.MHC == "Y") {
355 |             n.start <- nrow(raw)
356 |             raw$CHROMOSOME <- as.character(raw$CHROMOSOME)
357 |             raw <- raw[!(raw$CHROMOSOME == "6" & raw$POSITION >= 26000000 & raw$POSITION <= 34000000),]
358 |             n.end <- nrow(raw)
359 |             cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
360 |             cat(separator, sep = "\n")
361 |         } else {
362 |             cat(separator, sep = "\n")
363 |         }
364 |     }
365 | 
366 |     # Transform P into usual range (0,1)
367 |     if ("P" %in% header.inner) {
368 |         if (max(raw$P) > 1 & min(raw$P) < 0) {
369 |             do.transform <- readline(prompt = paste0("Transform P into usual range (0,1)? Y/N "))
370 |             if (do.transform == "Y") {
371 |                 raw <- transform(raw, P = 10 ^ (-P))
372 |                 cat(separator, sep = "\n")
373 |             } else {
374 |                 cat(separator, sep = "\n")
375 |             }
376 |         }
377 |     }
378 | 
379 |     # Process A1 and A2
380 |     if ("A1" %in% header.inner & "A2" %in% header.inner) {
381 |         # MAKE A1 AND A2 CAPS
382 |         raw$A1 <- toupper(raw$A1)
383 |         raw$A2 <- toupper(raw$A2)
384 | 
385 |         # Erroneous A1/A2
386 |         do.error <- readline(prompt = paste0("Drop rows with erroneous A1/A2 ? Y/N "))
387 |         if (do.error == "Y") {
388 |         n.start <- nrow(raw)
389 | 
390 |         raw <- raw[raw$A1 == 'A' | raw$A1 == 'C' | raw$A1 == 'T' | raw$A1 == 'G', ]
391 |         raw <- raw[raw$A2 == 'A' | raw$A2 == 'C' | raw$A2 == 'T' | raw$A2 == 'G', ]
392 | 
393 |         n.end <- nrow(raw)
394 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
395 |         cat(separator, sep = "\n")
396 |         } else {
397 |         cat(separator, sep = "\n")
398 |         }
399 | 
400 |         # Ambigious SNPs
401 |         do.ambi <- readline(prompt = paste0("Drop ambigious SNPs? Y/N "))
402 |         if (do.ambi == "Y") {
403 |         n.start <- nrow(raw)
404 | 
405 |         raw <- raw[!((raw$A1 == 'A' & raw$A2 == 'T') | (raw$A1 == 'T' & raw$A2 == 'A') | (raw$A1 == 'C' & raw$A2 == 'G') | (raw$A1 == 'G' & raw$A2 == 'C')),]
406 | 
407 |         n.end <- nrow(raw)
408 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
409 |         cat(separator, sep = "\n")
410 |         } else {
411 |         cat(separator, sep = "\n")
412 |         }
413 |     } else {
414 |         cat("WARNING: There might be some issue with your settings or the data itself! Because I didn't find any A1/A2 columns!", sep = "\n")
415 |     }
416 | 
417 |     # Missing z-score?
418 |     calculate.z <- FALSE
419 |     if (!("Z" %in% header.inner)) {
420 |         do.z <- readline(prompt = paste0("No z-score column is found. Do you want me to create one? Y/N "))
421 |         if (do.z == "Y") {
422 |             if ("BETA" %in% header.inner & "SE" %in% header.inner) {
423 |                 raw["Z"] <- raw$BETA / raw$SE
424 |                 calculate.z <- TRUE
425 |             } else if ("ODDS_RATIO" %in% header.inner & "SE" %in% header.inner) {
426 |                 raw["Z"] <- log(raw$ODDS_RATIO) / raw$SE
427 |                 calculate.z <- TRUE
428 |             } else if ("LOG_ODDS" %in% header.inner & "SE" %in% header.inner) {
429 |                 raw["Z"] <- raw$LOG_ODDS / raw$SE
430 |                 calculate.z <- TRUE
431 |             } else if ("BETA" %in% header.inner & "P" %in% header.inner) {
432 |                 raw["Z"] <- sign(raw$BETA) * abs(qnorm(raw$P / 2))
433 |                 calculate.z <- TRUE
434 |             } else if ("ODDS_RATIO" %in% header.inner & "P" %in% header.inner) {
435 |                 raw["Z"] <- sign(log(raw$ODDS_RATIO)) * abs(qnorm(raw$P / 2))
436 |                 calculate.z <- TRUE
437 |             } else if ("LOG_ODDS" %in% header.inner & "P" %in% header.inner) {
438 |                 raw["Z"] <- sign(raw$ODDS_RATIO) * abs(qnorm(raw$P / 2))
439 |                 calculate.z <- TRUE
440 |             } else {
441 |                 cat("I can't calculate z-score based on the information I have. SAD FACE EMOJI.", sep = "\n")
442 |             }
443 | 
444 |             if (sum(is.na(raw$Z)) != 0) {
445 |                 n.start <- nrow(raw)
446 |                 raw <- raw[!is.na(raw$Z),]
447 |                 n.end <- nrow(raw)
448 |                 cat(paste0(n.start - n.end, " rows removed for having invalid z-score!"), sep = "\n")
449 |             }
450 |         }
451 |         cat(separator, sep = "\n")
452 |     }
453 | 
454 |     # Calculate chi^2 and sieve out observations with chi^2 larger than 80
455 |     if ("P" %in% header.inner) {
456 |         do.chi2 <- readline(prompt = paste0("Sieve out observations with chi^2 larger than 80? Y/N "))
457 |         if (do.chi2 == "Y") {
458 |         n.start <- nrow(raw)
459 | 
460 |         raw <- raw[qchisq(raw$P, 1, lower.tail = FALSE) <= 80,]
461 | 
462 |         n.end <- nrow(raw)
463 |         cat(paste0(n.start - n.end, " rows removed!"), sep = "\n")
464 |         cat(separator, sep = "\n")
465 |         } else {
466 |         cat(separator, sep = "\n")
467 |         }
468 |     }
469 | 
470 |     # Update the header
471 |     if (calculate.z) {
472 |         header.inner <- c(header.inner, "Z")
473 |         header.outer <- c(header.outer, "Z")
474 |     }
475 |     colnames(raw) <- header.outer
476 | 
477 |     # Display output
478 |     cat("Now take a final look at the output!", sep = "\n")
479 |     print(head(raw))
480 | 
481 |     cat(separator, sep = "\n")
482 | 
483 |     # Output
484 |     do.output <- readline(prompt = "Do you want to write the processed summary statistics to your hard drive? Y/N ") %>% toupper()
485 |     if (do.output == "Y") {
486 |         char.write <- readline(prompt = "Please input your desired filename (with extension and if no input is given, a default name will be assigned):")
487 |         if (char.write == "") {
488 |             char.write <- paste0(filename, ".sumstats")
489 |         }
490 |         fwrite(raw, file = char.write, sep = " ", row.names = FALSE, quote = FALSE)
491 | 
492 |         CHROMOSOME <- raw[, header.inner == "CHROMOSOME"]
493 |         for (i in unique(CHROMOSOME)) {
494 |             fwrite(raw[CHROMOSOME == i, ], file = paste0(filename, "-", i, ".sumstats"), sep = " ", row.names = FALSE, quote = FALSE)
495 |         }
496 |     }
497 | 
498 |     # Return?
499 |     if (do.return) {
500 |         return(raw)
501 |     }
502 | }
503 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Zhang"
 5 |   given-names: "Zichen"
 6 | - family-names: "Wu"
 7 |   given-names: "Chong"
 8 | title: "SUMMIT"
 9 | version: 1.0.2
10 | doi: 10.5281/zenodo.7034435
11 | date-released: 2022-08-28
12 | url: "https://github.com/ChongWuLab/SUMMIT/"
13 | 


--------------------------------------------------------------------------------
/DetectAndSolve_debugged.cpp:
--------------------------------------------------------------------------------
  1 | #include <armadillo>
  2 | #include <RcppArmadillo.h>
  3 | #include <Rcpp.h>
  4 | #include <vector>
  5 | 
  6 | using namespace arma;
  7 | using namespace Rcpp;
  8 | 
  9 | // [[Rcpp::depends(RcppArmadillo)]]
 10 | // [[Rcpp::export]]
 11 | 
 12 | int MCPDetect(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold, int index_lambda)
 13 | {
 14 |     const int n_var = r.size();
 15 |     double lambda_used, z, dlx;
 16 | 
 17 |     arma::mat temp(1, 1);
 18 | 
 19 |     arma::vec beta;
 20 |     beta.zeros(n_var);
 21 | 
 22 |     arma::vec beta_current;
 23 |     beta_current.zeros(n_var);
 24 | 
 25 |     int problem = 1;
 26 | 
 27 |     index_lambda = index_lambda - 1;
 28 |     lambda_used = lambda(index_lambda);
 29 | 
 30 |     for (int j = 0; j < max_iteration; j++)
 31 |     {
 32 |         for (int q = 0; q < n_var; q++)
 33 |         {
 34 |             temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
 35 |             z = temp(0, 0);
 36 |             if (beta_current(q) <= gamma * lambda_used)
 37 |             {
 38 |                 if (z > lambda_used)
 39 |                 {
 40 |                     beta_current(q) = (z - lambda_used) / (1 + s - 1 / gamma);
 41 |                 }
 42 |                 else if (z < -lambda_used)
 43 |                 {
 44 |                     beta_current(q) = (z + lambda_used) / (1 + s - 1 / gamma);
 45 |                 }
 46 |                 else
 47 |                 {
 48 |                     beta_current(q) = 0.0;
 49 |                 }
 50 |             }
 51 |             else
 52 |             {
 53 |                 beta_current(q) = z / (1 + s);
 54 |             }
 55 |         }
 56 | 
 57 |         dlx = arma::norm(beta_current, 2);
 58 | 
 59 |         if (dlx > threshold)
 60 |         {
 61 |             problem = 2;
 62 |             break;
 63 |         }
 64 | 
 65 |         beta = beta_current;
 66 |     }
 67 | 
 68 |     return (problem);
 69 | }
 70 | 
 71 | // [[Rcpp::depends(RcppArmadillo)]]
 72 | // [[Rcpp::export]]
 73 | 
 74 | arma::mat MCP(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold)
 75 | {
 76 |     const int n_var = r.size();
 77 |     const int n_lambda = lambda.size();
 78 |     double lambda_used, z, dlx, dlx2;
 79 | 
 80 |     arma::mat res(n_var, n_lambda);
 81 |     res.fill(0);
 82 | 
 83 |     arma::mat temp(1, 1);
 84 | 
 85 |     arma::vec beta;
 86 |     beta.zeros(n_var);
 87 | 
 88 |     arma::vec beta_current;
 89 |     beta_current.zeros(n_var);
 90 | 
 91 |     int problem = 0;
 92 | 
 93 |     for (int i = 0; i < n_lambda; i++)
 94 |     {
 95 |         beta_current = beta;
 96 |         lambda_used = lambda(i);
 97 |         for (int j = 0; j < max_iteration; j++)
 98 |         {
 99 |             for (int q = 0; q < n_var; q++)
100 |             {
101 |                 temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
102 |                 z = temp(0, 0);
103 |                 if (beta_current(q) <= gamma * lambda_used)
104 |                 {
105 |                     if (z > lambda_used)
106 |                     {
107 |                         beta_current(q) = (z - lambda_used) / (1 + s - 1 / gamma);
108 |                     }
109 |                     else if (z < -lambda_used)
110 |                     {
111 |                         beta_current(q) = (z + lambda_used) / (1 + s - 1 / gamma);
112 |                     }
113 |                     else
114 |                     {
115 |                         beta_current(q) = 0.0;
116 |                     }
117 |                 }
118 |                 else
119 |                 {
120 |                     beta_current(q) = z / (1 + s);
121 |                 }
122 |             }
123 | 
124 |             arma::vec betaDiff = abs(beta_current - beta);
125 |             dlx = betaDiff.max();
126 |             dlx2 = arma::norm(beta_current, 2);
127 | 
128 |             if (dlx < threshold)
129 |             {
130 |                 break;
131 |             }
132 | 
133 |             if (dlx2 > 100.0)
134 |             {
135 |                 problem = 1;
136 |                 break;
137 |             }
138 | 
139 |             beta = beta_current;
140 |         }
141 | 
142 |         if (problem == 1)
143 |         {
144 |             break;
145 |         }
146 | 
147 |         res.col(i) = beta_current;
148 |     }
149 | 
150 |     return (res);
151 | }
152 | 
153 | // [[Rcpp::depends(RcppArmadillo)]]
154 | // [[Rcpp::export]]
155 | int ElNetDetect(const arma::vec &r, const arma::mat &LD, const arma::vec lambda, const double s, const double alpha, const int max_iteration, const double threshold, int index_lambda)
156 | {
157 |     const int n_var = r.size();
158 |     double z, lambda_used, dlx;
159 | 
160 |     arma::mat temp(1, 1);
161 | 
162 |     arma::vec beta;
163 |     beta.zeros(n_var);
164 | 
165 |     arma::vec beta_current;
166 |     beta_current.zeros(n_var);
167 | 
168 |     int problem = 1;
169 | 
170 |     beta_current = beta;
171 |     index_lambda = index_lambda - 1;
172 |     lambda_used = lambda(index_lambda);
173 | 
174 |     for (int j = 0; j < max_iteration; j++)
175 |     {
176 |         for (int q = 0; q < n_var; q++)
177 |         {
178 |             temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
179 |             z = temp(0, 0);
180 |             if (z > alpha * lambda_used)
181 |             {
182 |                 beta_current(q) = (z - alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha));
183 |             }
184 |             else if (z < -alpha * lambda_used)
185 |             {
186 |                 beta_current(q) = (z + alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha));
187 |             }
188 |             else
189 |             {
190 |                 beta_current(q) = 0.0;
191 |             }
192 |         }
193 | 
194 |         dlx = arma::norm(beta_current, 2);
195 | 
196 |         if (dlx > threshold)
197 |         {
198 |             problem = 2;
199 |             break;
200 |         }
201 | 
202 |         beta = beta_current;
203 |     }
204 | 
205 |     return (problem);
206 | }
207 | 
208 | // [[Rcpp::depends(RcppArmadillo)]]
209 | // [[Rcpp::export]]
210 | arma::mat ElNet(const arma::vec &r, const arma::mat &LD, const arma::vec lambda, const double s, const double alpha, const int max_iteration, const double threshold)
211 | {
212 |     const int n_var = r.size();
213 |     const int n_lambda = lambda.size();
214 |     double z, lambda_used, dlx, dlx2;
215 | 
216 |     arma::mat res(n_var, n_lambda);
217 |     res.fill(0);
218 | 
219 |     arma::mat temp(1, 1);
220 | 
221 |     arma::vec beta;
222 |     beta.zeros(n_var);
223 | 
224 |     arma::vec beta_current;
225 |     beta_current.zeros(n_var);
226 | 
227 |     int problem = 0;
228 | 
229 |     for (int i = 0; i < n_lambda; i++)
230 |     {
231 |         beta_current = beta;
232 |         lambda_used = lambda(i);
233 | 
234 |         for (int j = 0; j < max_iteration; j++)
235 |         {
236 |             for (int q = 0; q < n_var; q++)
237 |             {
238 |                 temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
239 |                 z = temp(0, 0);
240 |                 if (z > alpha * lambda_used)
241 |                 {
242 |                     beta_current(q) = (z - alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha));
243 |                 }
244 |                 else if (z < -alpha * lambda_used)
245 |                 {
246 |                     beta_current(q) = (z + alpha * lambda_used) / (1 + s + 2 * lambda_used * (1 - alpha));
247 |                 }
248 |                 else
249 |                 {
250 |                     beta_current(q) = 0.0;
251 |                 }
252 |             }
253 | 
254 |             arma::vec betaDiff = abs(beta_current - beta);
255 |             dlx = betaDiff.max();
256 |             dlx2 = arma::norm(beta_current, 2);
257 | 
258 |             if (dlx < threshold)
259 |             {
260 |                 break;
261 |             }
262 | 
263 |             if (dlx2 > 100.0)
264 |             {
265 |                 problem = 1;
266 |                 break;
267 |             }
268 | 
269 |             beta = beta_current;
270 |         }
271 | 
272 |         if (problem == 1)
273 |         {
274 |             break;
275 |         }
276 | 
277 |         res.col(i) = beta_current;
278 |     }
279 | 
280 |     return (res);
281 | }
282 | 
283 | // [[Rcpp::depends(RcppArmadillo)]]
284 | // [[Rcpp::export]]
285 | int MNetDetect(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double alpha, const double gamma, const int max_iteration, const double threshold, int index_lambda)
286 | {
287 |     const int n_var = r.size();
288 |     double z, lambda_used, dlx, lambda_1, lambda_2;
289 | 
290 |     arma::mat temp(1, 1);
291 | 
292 |     arma::vec beta;
293 |     beta.zeros(n_var);
294 | 
295 |     arma::vec beta_current;
296 |     beta_current.zeros(n_var);
297 | 
298 |     int problem = 1;
299 | 
300 |     beta_current = beta;
301 |     index_lambda = index_lambda - 1;
302 |     lambda_used = lambda(index_lambda);
303 |     lambda_1 = lambda_used * alpha;
304 |     lambda_2 = lambda_used * (1 - alpha);
305 | 
306 |     for (int j = 0; j < max_iteration; j++)
307 |     {
308 |         for (int q = 0; q < n_var; q++)
309 |         {
310 |             temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
311 |             z = temp(0, 0);
312 |             if (abs(z) <= gamma * lambda_1 * (1 + lambda_2))
313 |             {
314 |                 if (z > lambda_1)
315 |                 {
316 |                     beta_current(q) = (z - lambda_1) / (1 + s + lambda_2 - 1 / gamma);
317 |                 }
318 |                 else if (z < -lambda_1)
319 |                 {
320 |                     beta_current(q) = (z + lambda_1) / (1 + s + lambda_2 - 1 / gamma);
321 |                 }
322 |                 else
323 |                 {
324 |                     beta_current(q) = 0.0;
325 |                 }
326 |             }
327 |             else
328 |             {
329 |                 beta_current(q) = z / (1 + s + lambda_2);
330 |             }
331 |         }
332 | 
333 |         dlx = arma::norm(beta_current, 2);
334 | 
335 |         if (dlx > threshold)
336 |         {
337 |             problem = 2;
338 |             break;
339 |         }
340 | 
341 |         beta = beta_current;
342 |     }
343 | 
344 |     return (problem);
345 | }
346 | 
347 | // [[Rcpp::depends(RcppArmadillo)]]
348 | // [[Rcpp::export]]
349 | arma::mat MNet(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double alpha, const double gamma, const int max_iteration, const double threshold)
350 | {
351 |     const int n_var = r.size();
352 |     const int n_lambda = lambda.size();
353 |     double z, lambda_used, dlx, dlx2, lambda_1, lambda_2;
354 | 
355 |     arma::mat res(n_var, n_lambda);
356 |     res.fill(0);
357 | 
358 |     arma::mat temp(1, 1);
359 | 
360 |     arma::vec beta;
361 |     beta.zeros(n_var);
362 | 
363 |     arma::vec beta_current;
364 |     beta_current.zeros(n_var);
365 | 
366 |     int problem = 0;
367 | 
368 |     for (int i = 0; i < n_lambda; i++)
369 |     {
370 |         beta_current = beta;
371 |         lambda_used = lambda(i);
372 |         lambda_1 = lambda_used * alpha;
373 |         lambda_2 = lambda_used * (1 - alpha);
374 |         for (int j = 0; j < max_iteration; j++)
375 |         {
376 | 
377 |             for (int q = 0; q < n_var; q++)
378 |             {
379 |                 temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
380 |                 z = temp(0, 0);
381 |                 if (abs(z) <= gamma * lambda_1 * (1 + lambda_2))
382 |                 {
383 |                     if (z > lambda_1)
384 |                     {
385 |                         beta_current(q) = (z - lambda_1) / (1 + s + lambda_2 - 1 / gamma);
386 |                     }
387 |                     else if (z < -lambda_1)
388 |                     {
389 |                         beta_current(q) = (z + lambda_1) / (1 + s + lambda_2 - 1 / gamma);
390 |                     }
391 |                     else
392 |                     {
393 |                         beta_current(q) = 0.0;
394 |                     }
395 |                 }
396 |                 else
397 |                 {
398 |                     beta_current(q) = z / (1 + s + lambda_2);
399 |                 }
400 |             }
401 | 
402 |             arma::vec betaDiff = abs(beta_current - beta);
403 |             dlx = betaDiff.max();
404 | 
405 |             dlx2 = arma::norm(beta_current, 2);
406 | 
407 |             if (dlx < threshold)
408 |             {
409 |                 break;
410 |             }
411 | 
412 |             if (dlx2 > 100.0)
413 |             {
414 |                 problem = 1;
415 |                 break;
416 |             }
417 | 
418 |             beta = beta_current;
419 |         }
420 | 
421 |         if (problem == 1)
422 |         {
423 |             break;
424 |         }
425 | 
426 |         res.col(i) = beta_current;
427 |     }
428 | 
429 |     return (res);
430 | }
431 | 
432 | // [[Rcpp::depends(RcppArmadillo)]]
433 | // [[Rcpp::export]]
434 | 
435 | int SCADDetect(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold, int index_lambda)
436 | {
437 |     const int n_var = r.size();
438 |     double lambda_used, z, dlx;
439 | 
440 |     arma::mat temp(1, 1);
441 | 
442 |     arma::vec beta;
443 |     beta.zeros(n_var);
444 | 
445 |     arma::vec beta_current;
446 |     beta_current.zeros(n_var);
447 | 
448 |     int problem = 1;
449 | 
450 |     beta_current = beta;
451 |     index_lambda = index_lambda - 1;
452 |     lambda_used = lambda(index_lambda);
453 | 
454 |     for (int j = 0; j < max_iteration; j++)
455 |     {
456 |         for (int q = 0; q < n_var; q++)
457 |         {
458 |             temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
459 |             z = temp(0, 0);
460 |             if (beta_current(q) <= gamma * lambda_used)
461 |             {
462 |                 if (beta_current(q) <= lambda_used)
463 |                 {
464 |                     if (z > lambda_used)
465 |                     {
466 |                         beta_current(q) = (z - lambda_used) / (1 + s);
467 |                     }
468 |                     else if (z < -lambda_used)
469 |                     {
470 |                         beta_current(q) = (z + lambda_used) / (1 + s);
471 |                     }
472 |                     else
473 |                     {
474 |                         beta_current(q) = 0.0;
475 |                     }
476 |                 }
477 |                 else
478 |                 {
479 |                     if (z > gamma / (gamma - 1) * lambda_used)
480 |                     {
481 |                         beta_current(q) = (z - gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1));
482 |                     }
483 |                     else if (z < -gamma / (gamma - 1) * lambda_used)
484 |                     {
485 |                         beta_current(q) = (z + gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1));
486 |                     }
487 |                     else
488 |                     {
489 |                         beta_current(q) = 0.0;
490 |                     }
491 |                 }
492 |             }
493 |             else
494 |             {
495 |                 beta_current(q) = z / (1 + s);
496 |             }
497 |         }
498 | 
499 |         dlx = arma::norm(beta_current, 2);
500 | 
501 |         if (dlx > threshold)
502 |         {
503 |             problem = 2;
504 |             break;
505 |         }
506 | 
507 |         beta = beta_current;
508 |     }
509 | 
510 |     return (problem);
511 | }
512 | 
513 | // [[Rcpp::depends(RcppArmadillo)]]
514 | // [[Rcpp::export]]
515 | 
516 | arma::mat SCAD(const arma::vec &r, const arma::mat &LD, arma::vec lambda, const double s, const double gamma, const int max_iteration, const double threshold)
517 | {
518 |     const int n_var = r.size();
519 |     const int n_lambda = lambda.size();
520 |     double lambda_used, z, dlx, dlx2;
521 | 
522 |     arma::mat res(n_var, n_lambda);
523 |     res.fill(0);
524 | 
525 |     arma::mat temp(1, 1);
526 | 
527 |     arma::vec beta;
528 |     beta.zeros(n_var);
529 | 
530 |     arma::vec beta_current;
531 |     beta_current.zeros(n_var);
532 | 
533 |     int problem = 0;
534 | 
535 |     for (int i = 0; i < n_lambda; i++)
536 |     {
537 |         beta_current = beta;
538 |         lambda_used = lambda(i);
539 |         for (int j = 0; j < max_iteration; j++)
540 |         {
541 |             for (int q = 0; q < n_var; q++)
542 |             {
543 |                 temp = r(q) - LD.col(q).t() * beta_current + LD(q, q) * beta_current(q);
544 |                 z = temp(0, 0);
545 |                 if (beta_current(q) <= gamma * lambda_used)
546 |                 {
547 |                     if (beta_current(q) <= lambda_used)
548 |                     {
549 |                         if (z > lambda_used)
550 |                         {
551 |                             beta_current(q) = (z - lambda_used) / (1 + s);
552 |                         }
553 |                         else if (z < -lambda_used)
554 |                         {
555 |                             beta_current(q) = (z + lambda_used) / (1 + s);
556 |                         }
557 |                         else
558 |                         {
559 |                             beta_current(q) = 0.0;
560 |                         }
561 |                     }
562 |                     else
563 |                     {
564 |                         if (z > gamma / (gamma - 1) * lambda_used)
565 |                         {
566 |                             beta_current(q) = (z - gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1));
567 |                         }
568 |                         else if (z < -gamma / (gamma - 1) * lambda_used)
569 |                         {
570 |                             beta_current(q) = (z + gamma / (gamma - 1) * lambda_used) / (1 + s - 1 / (gamma - 1));
571 |                         }
572 |                         else
573 |                         {
574 |                             beta_current(q) = 0.0;
575 |                         }
576 |                     }
577 |                 }
578 |                 else
579 |                 {
580 |                     beta_current(q) = z / (1 + s);
581 |                 }
582 |             }
583 | 
584 |             arma::vec betaDiff = abs(beta_current - beta);
585 |             dlx = betaDiff.max();
586 |             dlx2 = arma::norm(beta_current, 2);
587 | 
588 |             if (dlx < threshold)
589 |             {
590 |                 break;
591 |             }
592 | 
593 |             if (dlx2 > 100.0)
594 |             {
595 |                 problem = 1;
596 |                 break;
597 |             }
598 | 
599 |             beta = beta_current;
600 |         }
601 | 
602 |         if (problem == 1)
603 |         {
604 |             break;
605 |         }
606 | 
607 |         res.col(i) = beta_current;
608 |     }
609 | 
610 |     return (res);
611 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SUMMIT
  2 | 
  3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.7034435.svg)](https://doi.org/10.5281/zenodo.7034435)
  4 | 
  5 | **Summary-level Unified Method for Modeling Integrated Transcriptome (SUMMIT)**, is a novel framework designed to improve the expression prediction model accuracy and the power of sequential TWAS by leveraging a very large expression quantitative trait loci (eQTL) summary-level dataset. One benefit of SUMMIT is that it can deal with genes with moderate-low expression heritability, which have been largely ignored by conventional TWAS.
  6 | 
  7 | ### SUMMIT's workflow.
  8 | 
  9 | <p align="center">
 10 | <img src="illustration.bmp" alt="workflow" width="700"/>
 11 | </p>
 12 | 
 13 | Please cite the following manuscript for using SUMMIT and gene expression models built by SUMMIT:
 14 | 
 15 | > Zhang, Z., Bae, Y. E., Bradley, J. R., Wu, L., & Wu, C. (2021). SUMMIT: An integrative approach for better transcriptomic data imputation improves causal gene identification. medRxiv. Under Review.
 16 | 
 17 | 
 18 | ### Replication:
 19 | For a complete replication of the results in our manuscript, please see our tutorials at osf.io (https://doi.org/10.17605/OSF.IO/BS3QU).
 20 | 
 21 | ## Outline
 22 | 
 23 | 1. [Training imputation model](#TRAIN)
 24 | 2. [Association test](#TWAS)
 25 | 
 26 | ## <a name="TRAIN"></a>Training models
 27 | 
 28 | ```mainbody_cpp_pos_precise.R/mainbody_cpp_rsid_precise.R``` are SUMMIT's main functions to train imputation models. It has only 3 input arguments ```--name_batch``` ,```--method```, and ```wd```. ```--name_batch``` is the desired name for this batch of output and ```--method``` is designated penalized regression method that you can choose from (LASSO, ElNet, SCAD, MCP, MNet). ```--wd``` is the path of working directory.
 29 | 
 30 | ### Data preparation
 31 | 
 32 | The Mainbody_cpp_xxx_precise functions require a very specific set of processed data to work properly. Unfortunately, we cannot share all the data with you due to privacy and confidentiality concern. Here, we provide a list of the data that we used. If you need further assistance on how the data were organized/processed, please reach out to us.
 33 | 
 34 | | Datasets referenced | Usage | How to obtain |
 35 | | ----- | ----- | ---- |
 36 | | gencode.v26.hg19.genes.rds | A look-up list for translating Ensembl IDs into gene names | downloaded from https://www.gencodegenes.org/human/release_26.html |
 37 | | Whole Blood_QCed_rpkm.rds | GTEx-7's subject's gene expression levels | downloaded from https://gtexportal.org and processed |
 38 | | response.8.RData | GTEx-8's subject's gene expression levels | downloaded from https://gtexportal.org and processed |
 39 | | subset-hapmap3 | eQTLGen summary statistics subsetted by HapMap3 | downloaded from https://www.eqtlgen.org/cis-eqtls.html with standard QC, then split into smaller files by gene name |
 40 | | seq.ref | Genotype matrix of reference panel (1000 Genomes Project) | downloaded from https://www.internationalgenome.org/data |
 41 | | seq.8 | Genotype matrix of GTEx-8 subjects | downloaded from https://gtexportal.org and process |
 42 | | chrX.OMNI.interpolated_genetic_map | Genetic distance of reference panel (1000 Genomes Project) | downloaded from https://github.com/joepickrell/1000-genomes-genetic-maps |
 43 | 
 44 | ### Data alignment
 45 | 
 46 | SUMMIT offers two approaches to align the reference panel with the eQTL summary-level data. If you wish to align your data by rsID, use mainbody_cpp_rsid_precise.R. If you wish to align your data by chromosome+position, use mainbody_cpp_pos_precise.R.
 47 | 
 48 | ### Example run
 49 | 
 50 | After we prepared the data, we can train imputation models via the following command.
 51 | 
 52 | ```
 53 | Rscript mainbody_cpp_rsid.R \
 54 | --name_batch SCAD_1e-6_rsid \
 55 | --method SCAD \
 56 | --wd MY-WORK-DIR \
 57 | ```
 58 | 
 59 | ### Built-in parallel computing
 60 | 
 61 | Both ```mainbody_cpp_rsid_precise.R``` and ```mainbody_cpp_pos_precise.R``` contain a snippet that guarantees mutual exclusion for every subjob. Simply put, you can run ```mainbody_cpp_rsid_precise.R``` and ```mainbody_cpp_pos_precise.R``` using many parallel instances as you want and it will figure out if there is unfinished job on its own.
 62 | 
 63 | ## <a name="TWAS"></a>Association test
 64 | 
 65 | ### Gene expression prediction models built by SUMMIT
 66 | 
 67 | We have uploaded our pre-calculated expression imputation models (Tissue: whole blood) to osf.io (https://doi.org/10.17605/OSF.IO/7MXSA).
 68 | 
 69 | The osf.io repository contains two zip files. ``SUMMIT-weight-pos.zip`` contains models that use chromosome plus position (CHR + POS) to match the SNPs in our models to the SNPs in GWAS summary data; ``SUMMIT-weight-rsid.zip`` uses rsID to match.
 70 | 
 71 | ### Pre-process your summary statistics using APSS
 72 | 
 73 | APSS is an interactive R function that can easily process GWAS summary statistics and shape GWAS summary statistics into any desired format.
 74 | 
 75 | APSS has 3 principal input arguments.
 76 | 
 77 | ```directory.working``` is the working directory.
 78 | 
 79 | ```filename``` is the name of the summary statistics file to be processed.
 80 | 
 81 | ```BIG``` is the number of GBs and default is 2. If ```BIG``` is set as 2, then for any summary statistics file bigger than 2GB, APSS will do an exploratory read first. By doing so, APSS could significantly shorten the runtime and handle summary statistics files bigger than 10GB. 
 82 | 
 83 | ### The must-have columns
 84 | 
 85 | You can use any summary statistics files with reasonable quality control just as long they contain specific columns for SUMMIT to work with.
 86 | 
 87 | The must-have columns are ```A1, A2, Z, CHR```.
 88 | 
 89 | If you are using the rsID-aligned models, you must also include ```SNP``` column.
 90 | 
 91 | If you are using the position-aligned models, you must also include ```POS``` column.
 92 | 
 93 | ### The flags
 94 | 
 95 | ```step2_pos.R``` is SUMMIT's function for association test using position-aligned models.
 96 | 
 97 | ```step2_rsid.R``` is SUMMIT's function for association test using rsID-aligned models.
 98 | 
 99 | For both functions, the input arguments are:
100 | 
101 | ```--models``` is the path of the pre-calculated models (Please make sure that the folder contains only model files).
102 | 
103 | ```--path.ref``` is the path of the reference panel used plus the prefixes of reference panel files.
104 | 
105 | ```--path.ss``` is the path of the reference panel (with "/" at the end).
106 | 
107 | ```--trait``` is the path plus name of the input summary statistics files.
108 | 
109 | ```--path.out``` is the path of the output folder (with "/" at the end).
110 | 
111 | ```--parallel``` is the number of parallel instances
112 | 
113 | ### Parallelization
114 | 
115 | Unlike other TWAS methods (e.g., PrediXcan), SUMMIT can be a bit more time-consuming. Because SUMMIT does not use a designated correlation matrix (LD matrix), the SUMMIT pipeline would spend more time matching models, summary statistics, and reference panel. In addition, each SUMMIT's pre-calculated model file contains models from 5 different types of penalized regression, hence more computation time is needed.
116 | 
117 | However, with proper parallelization, a complete association study usually takes less than 10 minutes. For example, if you decided to split the association tests into 20 smaller subjobs, an index (from 1 to 20) would need to be explicitly passed on to R from the global environment and the ```--parallel``` flag should be set to 20. Depending on your computing environment, you
118 | may need to manually modify line 2 in ```step2_pos.R/step2_rsid.R```. 
119 | 
120 | ### Example run
121 | 
122 | ```
123 | Rscript step2_pos.R \
124 | --models my-model-folder-pos \
125 | --path.ref my-reference-panel/1000G.EUR.ALLSNP.QC.CHR \
126 | --path.ss my-ss-folder/ \
127 | --trait my-trait-1 \
128 | --path.out my-output-folder \
129 | --parallel 100
130 | ```
131 | 
132 | ```
133 | Rscript step2_rsid.R \
134 | --models my-model-folder-rsid \
135 | --path.ref my-reference-panel/1000G.EUR.ALLSNP.QC.CHR \
136 | --path.ss my-ss-folder/ \
137 | --trait my-trait-2 \
138 | --path.out my-output-folder \
139 | --parallel 50
140 | ```
141 | 
142 | ### Output format
143 | 
144 | | Column number   | Column name | Description |
145 | | ----- | ----- | ---- |
146 | | 1 | gene_symbol | Gene name  |
147 | | 2 | gene_id | Ensembl ID  |
148 | | 3 | chromosome | Chromosome |
149 | | 4 | model_best | Best model |
150 | | 5 | r2_test | Best model's $R^2$ on testing data  |
151 | | 6-10 | p_ElNet | p-value of TWAS (Method is after the underscore) |
152 | | 11-15 | z_ElNet | Z-score of TWAS (Method is after the underscore) |
153 | | 16 | p_ACAT | The combined ACAT p-value |
154 | | 17 | gene_pos | Gene position  |
155 | | 18 | runtime | Runtime |
156 | 
157 | ### Disclaimer
158 | 
159 | The built gene expression prediction models and software are provided “as is”, without warranty of any kind, express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose and noninfringement. in no event shall the authors or copyright holders be liable for any claim, damages or other liability, whether in an action of contract, tort or otherwise, arising from, out of or in connection with the models or the use or other dealings in the models.
160 | 
161 | ### License
162 | 
163 | Maintainer: [Zichen Zhang] (zz17@fsu.edu)
164 | 
165 | [MIT](http://opensource.org/licenses/MIT)
166 | 
167 | Copyright (c) 2013-present, Zichen Zhang (zz17@fsu.edu), Chong Wu (cwu18@mdanderson.org)
168 | 


--------------------------------------------------------------------------------
/illustration.bmp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChongWuLab/SUMMIT/9b11e3d1f4ca00f0dd35a1f777720a616f365528/illustration.bmp


--------------------------------------------------------------------------------
/mainbody_cpp_pos_precise.R:
--------------------------------------------------------------------------------
  1 | #############################################
  2 | # Package, parameter, and working directory #
  3 | #############################################
  4 | 
  5 | # Package
  6 | library(BEDMatrix)
  7 | library(compiler)
  8 | library(data.table)
  9 | library(ddpcr)
 10 | library(dplyr)
 11 | library(optparse)
 12 | library(Rcpp)
 13 | 
 14 | # Parameter
 15 | s.array <- 0.1 * (1:9)
 16 | 
 17 | ###########
 18 | # Options #
 19 | ###########
 20 | 
 21 | # Options
 22 | option_list <- list(
 23 |     make_option("--name_batch", type = "character", default = FALSE, action = "store", help = "Name of this batch"
 24 |     ),
 25 |     make_option("--method"    , type = "character", default = FALSE, action = "store", help = "Method to use"
 26 |     ),
 27 |     make_option("--wd"        , type = "character", default = FALSE, action = "store", help = "Working directory"
 28 |     )  
 29 | )
 30 | 
 31 | opt <- parse_args(OptionParser(option_list = option_list))
 32 | 
 33 | # Pass the options to variables
 34 | batch  <- opt$name_batch
 35 | method <- opt$method
 36 | setwd(opt$wd)
 37 | 
 38 | #######################
 39 | # Quick-submit gadget #
 40 | #######################
 41 | 
 42 | # Randomly sleep for a little while
 43 | Sys.sleep(runif(1, 0, 30))
 44 | 
 45 | # Setting up the folders
 46 | null.object <- NULL
 47 | 
 48 | dir.create("quick-submit", showWarnings = TRUE) 
 49 | if (!dir.exists("quick-submit/to-do")) {
 50 |     dir.create("quick-submit/to-do")
 51 |     dir.create("quick-submit/working")
 52 |     dir.create("quick-submit/done")
 53 |     dir.create("quick-submit/error")
 54 |     for (qwerty in 1:19227) {
 55 |         save(null.object, file = paste0("quick-submit/to-do/", qwerty))
 56 |     }
 57 | }
 58 | 
 59 | ##########################
 60 | # User-defined functions #
 61 | ##########################
 62 | 
 63 | # Cpp functions
 64 | suppressMessages(sourceCpp("DetectAndSolve_debugged.cpp"))
 65 | 
 66 | # FindOptimalResult
 67 | FindOptimalResult <- function(response.true) {
 68 |     # Find the optimal out of 900 results
 69 |     index.optimal <- 0
 70 |     max           <- -1
 71 | 
 72 |     for (t in 1:900) {
 73 |         if (norm(result[[t]][[1]], type = "2") <= 100) {
 74 |             # Compute the predicted response
 75 |             response.pred <- genotype.train %*% result[[t]][[1]]
 76 | 
 77 |             # Do the regression
 78 |             reg <- summary(lm(response.true ~ response.pred))
 79 | 
 80 |             # Keep only the optimal result
 81 |             if (reg$adj.r.sq > max) {
 82 |                 max            <- reg$adj.r.sq
 83 |                 result.optimal <- list(result[[t]][[1]], result[[t]][[2]], reg$adj.r.sq, reg$coef)
 84 |                 t.optimal      <- t
 85 |             }
 86 |         }
 87 |     }
 88 | 
 89 |     # Compute the P-value of R square
 90 |     quiet(
 91 |         test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater")
 92 |     )
 93 |     result.optimal[[5]] <- test$p.value
 94 | 
 95 |     return(result.optimal)
 96 | }
 97 | 
 98 | FindOptimalResult <- cmpfun(FindOptimalResult)
 99 | 
100 | # PatchUp
101 | source("PatchUp.R")
102 | 
103 | PatchUp <- cmpfun(PatchUp)
104 | 
105 | # Standardize
106 | Standardize <- function(M) {
107 |     # Centralize
108 |     M <- M - matrix(rep(colMeans(M), times = nrow(M)), nrow = nrow(M) , ncol = ncol(M), byrow = T)
109 | 
110 |     # Standardize
111 |     M <- sweep(M, 2, sqrt(apply(M, 2, crossprod) / nrow(M)), "/")
112 | 
113 |     return(M)
114 | }
115 | 
116 | Standardize <- cmpfun(Standardize)
117 | 
118 | # Translate
119 | list.translation           <- readRDS("gencode.v26.hg19.genes.rds")
120 | list.translation$gene_id   <- substr(list.translation$gene_id, 1, 15)
121 | list.translation$gene_name <- as.character(list.translation$gene_name)
122 | 
123 | list.supplementary <- readRDS("list.supplementary.rds")
124 | 
125 | Translate <- function(gene.ENSG) {
126 |     temp.1 <- list.translation[which(list.translation$gene_id == gene.ENSG), ]
127 | 
128 |     if (nrow(temp.1) == 0) {
129 |         temp.2 <- list.supplementary[which(list.supplementary$ensembl_gene_id == gene.ENSG), ]
130 | 
131 |         if (nrow(temp.2) == 0) {
132 |             gene.proper <- gene.ENSG
133 |         } else {
134 |             gene.proper <- temp.2[1, 3]
135 |         }
136 |     } else {
137 |         gene.proper <- temp.1[1, 12]
138 |     }
139 | 
140 |     if (gene.proper == "") {
141 |         gene.proper <- gene.ENSG
142 |     }
143 | 
144 |     return(gene.proper)
145 | }
146 | 
147 | Translate <- cmpfun(Translate)
148 | 
149 | ##############################
150 | # Read GTEx-7's subject list #
151 | ##############################
152 | 
153 | raw    <- readRDS("Whole Blood_QCed_rpkm.rds")
154 | list.7 <- rownames(raw$qced.exp)
155 | 
156 | rm(raw)
157 | 
158 | ##########################################################################
159 | # Read GTEx-8's expression data (response.8.RData is already translated) #
160 | ##########################################################################
161 | 
162 | load("response.8.RData")
163 | 
164 | ####################
165 | # List of subjects #
166 | ####################
167 | 
168 | # Split subjects in GTEx-8 into two groups
169 | list.8 <- rownames(response.8)
170 | 
171 | list.train <- list.8[which(list.8 %in% list.7)]
172 | list.valid <- list.8[which(!(list.8 %in% list.7))]
173 | 
174 | ####################################
175 | # There are 19,227 genes in total! #
176 | ####################################
177 | 
178 | filenames <- dir("subset-hapmap3")
179 | 
180 | # Quick-submit gadget
181 | while (TRUE) {
182 |     if (length(dir("quick-submit/to-do")) == 0) {
183 |         print("All jobs are done!")
184 |         break
185 |     } else {
186 |         # Get the job
187 |         to_do <- as.numeric(dir("quick-submit/to-do"))
188 |         if (length(to_do) == 1) {
189 |             i <- to_do[1]
190 |         } else {
191 |             i <- sample(to_do, 1)
192 |         }
193 |         # Ship the job to "working" folder
194 |         save(null.object, file = paste0("quick-submit/working/", i))
195 |         file.remove(paste0("quick-submit/to-do/", i))
196 |     }
197 | 
198 |     # Start keeping track of runtime
199 |     time.start <- proc.time()[3]
200 | 
201 |     #################
202 |     # Preprocess ss #
203 |     #################
204 | 
205 |     load(paste0("subset-hapmap3/", filenames[i]))
206 | 
207 |     # Get the name and chromosome of to-be-processed gene
208 |     gene.ENSG   <- ss$Gene[1]
209 |     gene.proper <- Translate(gene.ENSG)
210 |     chr         <- ss$SNPChr[1]
211 | 
212 |     ##########################
213 |     # Clumping and filtering #
214 |     ##########################
215 | 
216 |     do.clumping <- FALSE
217 | 
218 |     # Obsolete
219 |     if (do.clumping) {
220 |         ###########
221 |         # Extract #
222 |         ###########
223 | 
224 |         # Create a temporary SNP list for Plink
225 |         temp.outdir  <- "/gpfs/home/zz17/current-project/temp/"
226 |         temp.snpfile <- paste0(temp.outdir, gene.ENSG, ".txt")
227 |         write.table(ss[, 1], file = temp.snpfile, quote = FALSE, col.names = FALSE, row.names = FALSE)
228 | 
229 |         # Put together the command and execute
230 |         command <- path.Plink
231 |         command <- paste0(command)
232 |         command <- paste0(command, " --bfile /gpfs/home/zz17/resource/1000-genome/sequence/1000G.EUR.ALLSNP.QC.CHR", chr)
233 |         command <- paste0(command, " --chr ", chr)
234 |         command <- paste0(command, " --extract ", temp.snpfile)
235 |         command <- paste0(command, " --make-bed")
236 |         command <- paste0(command, " --out ", temp.outdir, gene.ENSG)
237 | 
238 |         quiet(
239 |             system(command, intern = TRUE)
240 |         )
241 | 
242 |         ###########
243 |         # Process #
244 |         ###########
245 | 
246 |         # Create a temporary ss file and add a p-value column
247 |         ss$pval     <- 2 * pnorm(abs(ss$Zscore), lower.tail = FALSE)
248 |         temp.ssfile <- paste0(temp.outdir, gene.ENSG, "_ss.txt")
249 |         write.table(ss, file = temp.ssfile, quote = FALSE, col.names = TRUE, row.names = FALSE)
250 | 
251 |         # Put together the command and execute
252 |         command <- path.Plink
253 |         command <- paste0(command, " --bfile ", temp.outdir, gene.ENSG)
254 |         command <- paste0(command, " --clump ", temp.ssfile)
255 |         command <- paste0(command, " --clump-field pval")
256 |         command <- paste0(command, " --clump-kb 250")
257 |         command <- paste0(command, " --clump-p1 1")
258 |         command <- paste0(command, " --clump-r2 0.95")
259 |         command <- paste0(command, " --clump-snp-field SNP")
260 |         command <- paste0(command, " --out ", temp.outdir, gene.ENSG)
261 | 
262 |         quiet(
263 |             system(command, intern = TRUE, ignore.stdout = TRUE, ignore.stderr = TRUE)
264 |         )
265 | 
266 |         clumped.snp <- fread(paste0(temp.outdir, gene.ENSG, ".clumped"))
267 |         clumped.snp <- as.data.frame(clumped.snp)
268 |         clumped.snp <- clumped.snp[, "SNP"]
269 | 
270 |         # Clean up and update
271 |         system(paste0("rm ", temp.outdir, gene.ENSG, "*"))
272 | 
273 |         ss <- ss[ss$SNP %in% clumped.snp, ]
274 |     }
275 | 
276 |     ############
277 |     # Quick QC #
278 |     ############
279 | 
280 |     # Skip this round of iteration if ss is empty to begin with
281 |     if (nrow(ss) == 0) {
282 |         file.connection <- file(paste0("quick-submit/error/", i, ".error0"))
283 |         writeLines("Empty summary statistics to begin with!", file.connection)
284 |         close(file.connection)
285 |         file.remove(paste0("quick-submit/working/", i))
286 |         save(null.object, file=paste0("quick-submit/done/", i))
287 | 
288 |         next
289 |     }
290 | 
291 |     # Read the bim files of GTEx-8 and reference panel
292 |     bim.8   <- as.data.frame(fread(paste0("qced_chr", chr, ".bim")))
293 |     bim.ref <- as.data.frame(fread(paste0("1000G.EUR.ALLSNP.QC.CHR", chr, ".bim")))
294 | 
295 |     # Create new identifiers
296 |     ss["ID"]      <- paste0(chr, "_", ss$SNPPos)
297 |     bim.8["ID"]   <- paste0(bim.8$V1, "_", bim.8$V4)
298 |     bim.ref["ID"] <- paste0(bim.ref$V1, "_", bim.ref$V4)
299 | 
300 |     # Only keep the SNPs that are simultaneously IN (reference panel, GTEx-8)
301 |     ss <- ss[ss$ID %in% bim.ref$ID & ss$ID %in% bim.8$ID, ]
302 | 
303 |     # Sieve out problematic SNPs
304 |     list.1  <- bim.8$ID[duplicated(bim.8$ID)]
305 |     list.2  <- bim.8$ID[nchar(bim.8$V5) > 1 | nchar(bim.8$V6) > 1]
306 |     list.3  <- bim.ref$ID[duplicated(bim.ref$ID)]
307 |     list.4  <- bim.ref$ID[nchar(bim.ref$V5) > 1 | nchar(bim.ref$V6) > 1]
308 |     list.5  <- ss$ID[duplicated(ss$ID)]
309 |     problem <- ss$ID %in% list.1 | ss$ID %in% list.2 | ss$ID %in% list.3 | ss$ID %in% list.4 | ss$ID %in% list.5
310 | 
311 |     ss <- ss[!problem, ]
312 | 
313 |     rm(list.1)
314 |     rm(list.2)
315 |     rm(list.3)
316 |     rm(list.4)
317 |     rm(list.5)
318 |     rm(problem)
319 | 
320 |     # Skip this round of iteration if ss has few row left
321 |     if (nrow(ss) <= 1) {
322 |         file.connection <- file(paste0("quick-submit/error/", i, ".error1"))
323 |         writeLines("Empty summary statistics after QC!", file.connection)
324 |         close(file.connection)
325 |         file.remove(paste0("quick-submit/working/", i))
326 |         save(null.object, file=paste0("quick-submit/done/", i))
327 | 
328 |         next
329 |     }
330 | 
331 |     ########################################################
332 |     # Make sure that the gene of the hour is in response.8 #
333 |     ########################################################
334 | 
335 |     if (!(gene.proper %in% colnames(response.8))) {
336 |         file.connection <- file(paste0("quick-submit/error/", i, ".error2"))
337 |         writeLines("Gene is not in response.8 and we move on to the next gene!", file.connection)
338 |         close(file.connection)
339 |         file.remove(paste0("quick-submit/working/", i))
340 |         save(null.object, file=paste0("quick-submit/done/", i))
341 | 
342 |         next
343 |     }
344 | 
345 |     #################################################
346 |     # Preprocess genotype matrix of reference panel #
347 |     #################################################
348 | 
349 |     # Extract genotype information
350 |     quiet(
351 |         seq.ref <- BEDMatrix(paste0("sequence/1000G.EUR.ALLSNP.QC.CHR", chr), simple_names = TRUE)
352 |     )
353 | 
354 |     colnames(seq.ref) <- bim.ref$ID
355 |     genotype.ref      <- seq.ref[, ss$ID]
356 |     rm(seq.ref)
357 | 
358 |     # Skip this round of iteration if genotype is empty
359 |     if (ncol(genotype.ref) == 0) {
360 |         file.connection <- file(paste0("quick-submit/error/", i, ".error3"))
361 |         writeLines("Empty reference genotype matrix!", file.connection)
362 |         close(file.connection)
363 |         file.remove(paste0("quick-submit/working/", i))
364 |         save(null.object, file=paste0("quick-submit/done/", i))
365 | 
366 |         next
367 |     }
368 | 
369 |     # Compare to see if all (A1 and A2) pairs are well-aligned
370 |     # Always use reference panel's A1/A2 as the standard order 
371 |     bim.temp <- subset(bim.ref, ID %in% ss$ID)
372 |     ss.temp  <- left_join(ss, bim.temp, by = "ID")
373 |     problem  <- !(ss.temp$a1 == ss.temp$V5)
374 | 
375 |     if (sum(problem) != 0) {
376 |         # Flip the problematic pairs
377 |         ss$Zscore[problem] <- -1 * ss$Zscore[problem]
378 | 
379 |         ss$a1 <- ss.temp$V5
380 |         ss$a2 <- ss.temp$V6
381 |     }
382 | 
383 |     rm(bim.temp)
384 |     rm(ss.temp)
385 |     rm(problem)
386 | 
387 |     # Patch up the NAs and centralize genotype matrix
388 |     genotype.ref <- PatchUp(genotype.ref)
389 |     genotype.ref <- Standardize(genotype.ref)
390 | 
391 |     #######################################
392 |     # Preprocess genotype matrix of GTEx8 #
393 |     #######################################
394 | 
395 |     # The big genotype matrix
396 |     quiet(
397 |         seq.8 <- BEDMatrix(paste0("qced_chr", chr), simple_names = TRUE)
398 |     )
399 | 
400 |     # Remove the "0_" part of subject IDs
401 |     rownames(seq.8) <- gsub("0_", "", rownames(seq.8))
402 | 
403 |     # Subset by list.8 and ID from ss
404 |     colnames(seq.8) <- bim.8$ID
405 |     genotype.8 <- seq.8[list.8, ss$ID]
406 | 
407 |     # Skip this round of iteration if genotype is empty
408 |     if (ncol(genotype.8) == 0) {
409 |         file.connection <- file(paste0("quick-submit/error/", i, ".error4"))
410 |         writeLines("Empty genotype.8 matrix!", file.connection)
411 |         close(file.connection)
412 |         file.remove(paste0("quick-submit/working/", i))
413 |         save(null.object, file=paste0("quick-submit/done/", i))
414 | 
415 |         next
416 |     }
417 | 
418 |     # Compare to see if all (a1 and a2) are well-aligned
419 |     bim.temp1 <- subset(bim.ref, ID %in% ss$ID)
420 |     bim.temp2 <- subset(bim.8, ID %in% ss$ID)
421 |     bim.temp3 <- left_join(bim.temp1, bim.temp2, by = "ID")
422 |     problem   <- !(bim.temp3$V5.x == bim.temp3$V5.y)
423 | 
424 |     if (sum(problem) != 0) {
425 |         genotype.8[, problem] <- 2 - genotype.8[, problem]
426 |     }
427 | 
428 |     rm(bim.temp1)
429 |     rm(bim.temp2)
430 |     rm(bim.temp3)
431 |     rm(seq.8)
432 |     rm(problem)
433 | 
434 |     #######################################################
435 |     # EXTREMELY ANNOYING ISSUE FOUND!!!!!!!!!!!!!!!!!!!!! #
436 |     #######################################################
437 | 
438 |     # Subject number is NOT the same for all chromosomes!
439 |     genotype.train <- genotype.8[which(rownames(genotype.8) %in% list.train), ]
440 |     genotype.train <- scale(PatchUp(genotype.train))
441 | 
442 |     genotype.valid <- genotype.8[which(rownames(genotype.8) %in% list.valid), ]
443 |     genotype.valid <- scale(PatchUp(genotype.valid))
444 | 
445 |     genotype.train[is.na(genotype.train)] <- 0
446 |     genotype.valid[is.na(genotype.valid)] <- 0
447 | 
448 |     ##########################
449 |     # The original LD matrix #
450 |     ##########################
451 | 
452 |     matrix.LD <- t(genotype.ref) %*% genotype.ref / nrow(genotype.ref)
453 | 
454 |     #########################
455 |     # Additional adjustment #
456 |     #########################
457 | 
458 |     do.adjust <- TRUE
459 | 
460 |     if (do.adjust) {
461 |         # Some given parameters
462 |         cutoff        <- 0.001
463 |         matrix.adjust <- matrix(0, nrow(ss), nrow(ss))
464 |         n.effective   <- 11400
465 |         n.size        <- 183
466 | 
467 |         temp.cof <- (-2 * n.effective) / n.size
468 | 
469 |         # Read distance information and create an identifer
470 |         distance <- as.data.frame(fread(paste0("distance-genetic/subset-omni/chr", chr, ".OMNI.interpolated_genetic_map")))
471 |         names(distance) <- c("SNP", "position", "distance")
472 | 
473 |         distance["ID"] <- paste0(chr, "_", distance$position)
474 |         distance <- distance[!duplicated(distance$ID), ]
475 | 
476 |         # Merge and process
477 |         ss.temp <- left_join(ss, distance, by = "ID")
478 | 
479 |         for (col in 1:(nrow(ss.temp) - 1)) {
480 |             for (row in (col + 1):nrow(ss.temp)) {
481 |                 entry.temp <- exp(temp.cof * abs(ss.temp[col, "distance"] - ss.temp[row, "distance"]))
482 |                 entry.temp <- entry.temp * (entry.temp >= cutoff)
483 |                 matrix.adjust[row, col] <- entry.temp
484 |             }
485 |         }
486 | 
487 |         matrix.adjust <- t(matrix.adjust) + matrix.adjust + diag(nrow(ss.temp))
488 | 
489 |         rm(distance)
490 |         rm(ss.temp)
491 |         rm(temp.cof)
492 | 
493 |         ###################
494 |         # Final LD matrix #
495 |         ###################
496 | 
497 |         # "*" stands for elementwise multiplication
498 |         matrix.LD <- matrix.LD * matrix.adjust
499 |     }
500 | 
501 |     ############
502 |     # r vector #
503 |     ############
504 | 
505 |     # Compute r vector
506 |     r <- ss$Zscore / sqrt(ss$NrSamples - 1 + ss$Zscore ^2)
507 | 
508 |     # Get size
509 |     size <- nrow(ss)
510 | 
511 |     ##########################
512 |     # Iterate By parameter s #
513 |     ##########################
514 | 
515 |     result <- list()
516 | 
517 |     for (k in 1:length(s.array)) {
518 |         s <- s.array[k]
519 | 
520 |         #########################################
521 |         # Constructing the initial lambda array #
522 |         #########################################
523 | 
524 |         # Get the maximum for lambda
525 |         z.temp <- numeric(size)
526 |         for (m in 1:size) {
527 |             z.temp[m] <- abs(r[m])
528 |         }
529 | 
530 |         lambda.max   <- max(z.temp)
531 |         lambda.min   <- lambda.max * 1E-3
532 |         lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 10000)
533 |         rm(z.temp)
534 | 
535 |         ###########################################
536 |         # Explore the possible minimum for lambda #
537 |         ###########################################
538 | 
539 |         ######################################
540 |         # What do 0, 1, and 2 stand for?     #
541 |         # 0: Not tested                      #
542 |         # 1: tested and no problem detected  #
543 |         # 2: tested and problem is detected  #
544 |         ######################################
545 | 
546 |         problem <- numeric(10000)
547 |         start   <- 1
548 |         end     <- 10000
549 | 
550 |         max.iteration <- 500
551 | 
552 |         threshold <- 100
553 |         alpha     <- 0.5
554 | 
555 |         if (method == "SCAD") {
556 |             gamma <- 3.7
557 |         } else {
558 |             gamma <- 3
559 |         }
560 | 
561 |         dummy.detect <- TRUE
562 | 
563 |         while (dummy.detect) {
564 |             # Stop if all lambdas are OK
565 |             if (sum(problem == rep(1, 10000)) == 10000) {
566 |                 print("All lambdas check out!")
567 |                 break
568 |             }
569 | 
570 |             start <- min(which(problem == 0))
571 |             end   <- max(which(problem == 0))
572 | 
573 |             try    <- ceiling((start + end) /2)
574 |             lambda <- lambda.array[try]
575 | 
576 |             if (method == "MCP") {
577 |                 problem[try] <- MCPDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try)
578 |             }
579 |             if (method == "LASSO") {
580 |                 problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold, try)
581 |             }
582 |             if (method == "ElNet") {
583 |                 problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold, try)
584 |             }
585 |             if (method == "MNet") {
586 |                 problem[try] <- MNetDetect(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold, try)
587 |             }
588 |             if (method == "SCAD") {
589 |                 problem[try] <- SCADDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try)
590 |             }
591 | 
592 |             if (problem[try] == 1) {
593 |                 problem[1:try] <- 1
594 |             } else {
595 |                 problem[try:10000] <- 2
596 |             }
597 | 
598 |             if (problem[try] == 1 & problem[min(10000, try + 1)] == 2) {
599 |                 print(paste0("Problem detected for lambda=", lambda.array[try + 1], ". Finer interval has been saved!"))
600 |                 lambda.min   <- lambda
601 |                 dummy.detect <- FALSE
602 |             }
603 | 
604 |             if (problem[try] == 2 & problem[try - 1] == 1) {
605 |                 print(paste0("Problem detected for lambda=", lambda.array[try], ". Finer interval has been saved!"))
606 |                 lambda.min   <- lambda.array[try - 1]
607 |                 dummy.detect <- FALSE
608 |             }
609 |         }
610 | 
611 |         # Update the lambda array
612 |         lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 100)
613 | 
614 |         #######################################################################
615 |         # Optimizing with the chosen penalty (using the updated lambda array) #
616 |         #######################################################################
617 | 
618 |         #####################
619 |         # Iterate by lambda #
620 |         #####################
621 | 
622 |         beta <- t(matrix(0, nrow = 1, ncol = size))
623 | 
624 |         max.iteration <- 300
625 | 
626 |         threshold <- 1e-5
627 |         alpha     <- 0.5
628 | 
629 |         if (method == "SCAD") {
630 |             gamma <- 3.7
631 |         } else {
632 |             gamma <- 3
633 |         }
634 | 
635 |         if (method == "MCP") {
636 |             res <- MCP(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold)
637 |         }
638 |         if (method == "LASSO") {
639 |             res <- ElNet(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold)
640 |         }
641 |         if (method == "ElNet") {
642 |             res <- ElNet(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold)
643 |         }
644 |         if (method == "MNet") {
645 |             res <- MNet(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold)
646 |         }
647 |         if (method == "SCAD") {
648 |             res <- SCAD(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold)
649 |         }
650 | 
651 |         for (b in 1:100) {
652 |             result[[length(result) + 1]] <- list(res[, b], lambda.array[b])
653 |         }
654 |     }
655 | 
656 |     ############################################################################################################################
657 |     #         -----.                   osssso            ---------..`                                                          #
658 |     #        `ssssso                   osssso           `sssssssssssss+:`                                                      #
659 |     #        `ssssso       `...`       osssso   ``      `ssssssssssssssss/        `...`                ```          `...`      #
660 |     #        `ssssso   `/osssssss+:`   osssso:osssso/`  `sssss/  `.:osssss+   `:+sssssss+:`   +sssso.+sssso/`    -+sssssss+-   #
661 |     #        `ssssso  /sssssssssssso-  ossssssssssssss- `sssss/     `ssssss` :sssssssssssss-  +ssssssssssssso  .ossso/::ossso` #
662 |     #        `ssssso -sssss:``./sssss` osssso-``./sssss``sssss/      osssss..sssss:``./sssss. +sssss-``osssss` ossss/---:ssss+ #
663 |     #        .ssssso /ssss+    `sssss. ossss/    `sssss.`sssss/    `/ssssso :sssso     sssss- +sssss   /sssss``sssssssssssssso #
664 |     # .:/oso/osssss/ .sssss+--:osssso  osssss/--:osssso `ssssso+++ossssss+` `sssss+:-:+sssso` +sssss   /sssss` +ssss/`  `-`    #
665 |     # .ossssssssss+`  .osssssssssss+`  ossssosssssssso. `ssssssssssssss+-    .osssssssssss+.  +sssss   /sssss` `/sssssssssso/` #
666 |     #   -/+osso+/.      ./+oosoo+:`    /++++.-/+oo+/-   `++++++++//:-.`        .:+oosoo+:.    :+++++   :+++++`   `:/oosoo+/-`  #
667 |     ############################################################################################################################
668 | 
669 |     #########################
670 |     # Select optimal lambda #
671 |     #########################
672 | 
673 |     #############################
674 |     # Calculate goodness of fit #
675 |     #############################
676 | 
677 |     ###############################################
678 |     # Structure of the list result/result.optimal #
679 |     ###############################################
680 |     # index:                                      #
681 |     # [[1]]  beta vector                          #
682 |     # [[2]]  lambda                               #
683 |     # [[3]]  r square adjusted (to be added)      #
684 |     # [[4]]  regression coefficient               #
685 |     # [[5]]  p-value of r square                  #
686 |     # [[6]]  r-square projected by GTEx8 - GTEx7  #
687 |     # [[7]]  runtime for the gene                 #
688 |     # [[8]]  average sample size                  #
689 |     # [[9]]  iteration index                      #
690 |     # [[10]] correlation                          #
691 |     ###############################################
692 | 
693 |     # Get the true response vector
694 |     response.true <- response.8[list.train, which(colnames(response.8) == gene.proper)]
695 | 
696 |     # Figure out if response.true is just a vector or multiple vectors and find the optimal
697 |     if (class(response.true) == "numeric") {
698 |         result.optimal <- FindOptimalResult(response.true)
699 |     } else {
700 |         max <- -1
701 |         for (asd in 1:ncol(response.true)) {
702 |             temp <- FindOptimalResult(response.true[, asd])[[3]]
703 |             if (temp > max) {
704 |                 max <- temp
705 |                 n.col <- asd
706 |             }
707 |         }
708 | 
709 |         response.true  <- response.true[, n.col]
710 |         result.optimal <- FindOptimalResult(response.true)
711 |     }
712 | 
713 |     # Compute the P-value of R square
714 |     response.pred <- genotype.train %*% result.optimal[[1]]
715 |     quiet(
716 |         test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater")
717 |     )
718 |     result.optimal[[5]] <- test$p.value
719 | 
720 |     # Little tweak to make beta vector an one-column matrix
721 |     result.optimal[[1]] <- as.matrix(result.optimal[[1]])
722 | 
723 |     ##############################
724 |     # Validate model with GTex-8 #
725 |     ##############################
726 | 
727 |     # Get the response vectors
728 |     response.true <- response.8[list.valid, which(colnames(response.8) == gene.proper)]
729 | 
730 |     if (exists("n.col")) {
731 |         response.true <- response.true[, n.col]
732 |         rm(n.col)
733 |     }
734 | 
735 |     response.pred <- genotype.valid %*% result.optimal[[1]]
736 | 
737 |     # Cross validation with subject that are NOT INCLUDED in GTEx7
738 |     reg                 <- summary(lm(response.true ~ response.pred))
739 |     result.optimal[[6]] <- reg$adj.r.squared
740 | 
741 |     # Keep track of runtime
742 |     time.end <- proc.time()[3]
743 |     result.optimal[[7]] <- time.end - time.start
744 | 
745 |     # Keep track of average sample size and miscellaneous items
746 |     result.optimal[[8]]  <- mean(ss$NrSamples)
747 |     result.optimal[[9]]  <- i
748 |     result.optimal[[10]] <- cor(response.true, response.pred)
749 | 
750 |     ########
751 |     # Save #
752 |     ########
753 | 
754 |     save <- result.optimal
755 | 
756 |     dir.create(paste0("output/output_", batch), showWarnings = FALSE)
757 |     save.chr <- paste0("output/output_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData")
758 | 
759 |     save(save, file = save.chr)
760 | 
761 |     ########################
762 |     # For TWAS-fusion only #
763 |     ########################
764 | 
765 |     # cv.performance
766 |     cv.performance           <- matrix(nrow = 2, ncol = 1)
767 |     colnames(cv.performance) <- method
768 |     rownames(cv.performance) <- c("rsq", "pval")
769 |     cv.performance[1, 1]     <- reg$adj.r.squared
770 |     quiet(
771 |         test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater")
772 |     )
773 |     cv.performance[2, 1]     <- test$p.value
774 | 
775 |     # snps
776 |     snps <- ss
777 | 
778 |     # wgt.matrix
779 |     wgt.matrix           <- result.optimal[[1]]
780 |     colnames(wgt.matrix) <- method
781 |     rownames(wgt.matrix) <- ss$SNP
782 | 
783 |     dir.create(paste0("output/output_twas_", batch), showWarnings = FALSE)
784 |     save.twas.chr <- paste0("output/output_twas_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData")
785 |     save(cv.performance, snps, wgt.matrix, file = save.twas.chr)
786 | 
787 |     ###############
788 |     # Cleaning-up #
789 |     ###############
790 | 
791 |     file.remove(paste0("quick-submit/working/", i))
792 |     save(null.object, file = paste0("quick-submit/done/", i))
793 | }
794 | 


--------------------------------------------------------------------------------
/mainbody_cpp_rsid_precise.R:
--------------------------------------------------------------------------------
  1 | #############################################
  2 | # Package, parameter, and working directory #
  3 | #############################################
  4 | 
  5 | # Package
  6 | library(BEDMatrix)
  7 | library(compiler)
  8 | library(data.table)
  9 | library(ddpcr)
 10 | library(dplyr)
 11 | library(optparse)
 12 | library(Rcpp)
 13 | 
 14 | # Parameter
 15 | s.array <- 0.1 * (1:9)
 16 | 
 17 | ###########
 18 | # Options #
 19 | ###########
 20 | 
 21 | # Options
 22 | option_list <- list(
 23 |     make_option("--name_batch", type = "character", default = FALSE, action = "store", help = "Name of this batch"
 24 |     ),
 25 |     make_option("--method"    , type = "character", default = FALSE, action = "store", help = "Method to use"
 26 |     ),
 27 |     make_option("--wd"        , type = "character", default = FALSE, action = "store", help = "Working directory"
 28 |     )  
 29 | )
 30 | 
 31 | opt <- parse_args(OptionParser(option_list = option_list))
 32 | 
 33 | # Pass the options to variables
 34 | batch  <- opt$name_batch
 35 | method <- opt$method
 36 | setwd(opt$wd)
 37 | 
 38 | #######################
 39 | # Quick-submit gadget #
 40 | #######################
 41 | 
 42 | # Randomly sleep for a little while
 43 | Sys.sleep(runif(1, 0, 30))
 44 | 
 45 | # Setting up the folders
 46 | null.object <- NULL
 47 | 
 48 | dir.create("quick-submit", showWarnings = TRUE) 
 49 | if (!dir.exists("quick-submit/to-do")) {
 50 |     dir.create("quick-submit/to-do")
 51 |     dir.create("quick-submit/working")
 52 |     dir.create("quick-submit/done")
 53 |     dir.create("quick-submit/error")
 54 |     for (qwerty in 1:19227) {
 55 |         save(null.object, file = paste0("quick-submit/to-do/", qwerty))
 56 |     }
 57 | }
 58 | 
 59 | ##########################
 60 | # User-defined functions #
 61 | ##########################
 62 | 
 63 | # Cpp functions
 64 | suppressMessages(sourceCpp("DetectAndSolve_debugged.cpp"))
 65 | 
 66 | # FindOptimalResult
 67 | FindOptimalResult <- function(response.true) {
 68 |     # Find the optimal out of 900 results
 69 |     index.optimal <- 0
 70 |     max           <- -1
 71 | 
 72 |     for (t in 1:900) {
 73 |         if (norm(result[[t]][[1]], type = "2") <= 100) {
 74 |             # Compute the predicted response
 75 |             response.pred <- genotype.train %*% result[[t]][[1]]
 76 | 
 77 |             # Do the regression
 78 |             reg <- summary(lm(response.true ~ response.pred))
 79 | 
 80 |             # Keep only the optimal result
 81 |             if (reg$adj.r.sq > max) {
 82 |                 max            <- reg$adj.r.sq
 83 |                 result.optimal <- list(result[[t]][[1]], result[[t]][[2]], reg$adj.r.sq, reg$coef)
 84 |                 t.optimal      <- t
 85 |             }
 86 |         }
 87 |     }
 88 | 
 89 |     # Compute the P-value of R square
 90 |     quiet(
 91 |         test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater")
 92 |     )
 93 |     result.optimal[[5]] <- test$p.value
 94 | 
 95 |     return(result.optimal)
 96 | }
 97 | 
 98 | FindOptimalResult <- cmpfun(FindOptimalResult)
 99 | 
100 | # PatchUp
101 | source("PatchUp.R")
102 | 
103 | PatchUp <- cmpfun(PatchUp)
104 | 
105 | # Standardize
106 | Standardize <- function(M) {
107 |     # Centralize
108 |     M <- M - matrix(rep(colMeans(M), times = nrow(M)), nrow = nrow(M) , ncol = ncol(M), byrow = T)
109 | 
110 |     # Standardize
111 |     M <- sweep(M, 2, sqrt(apply(M, 2, crossprod) / nrow(M)), "/")
112 | 
113 |     return(M)
114 | }
115 | 
116 | Standardize <- cmpfun(Standardize)
117 | 
118 | # Translate
119 | list.translation           <- readRDS("resource/list-lookup/gencode.v26.hg19.genes.rds")
120 | list.translation$gene_id   <- substr(list.translation$gene_id, 1, 15)
121 | list.translation$gene_name <- as.character(list.translation$gene_name)
122 | 
123 | list.supplementary <- readRDS("resource/list-lookup/list.supplementary.rds")
124 | 
125 | Translate <- function(gene.ENSG) {
126 |     temp.1 <- list.translation[which(list.translation$gene_id == gene.ENSG), ]
127 | 
128 |     if (nrow(temp.1) == 0) {
129 |         temp.2 <- list.supplementary[which(list.supplementary$ensembl_gene_id == gene.ENSG), ]
130 | 
131 |         if (nrow(temp.2) == 0) {
132 |             gene.proper <- gene.ENSG
133 |         } else {
134 |             gene.proper <- temp.2[1, 3]
135 |         }
136 |     } else {
137 |         gene.proper <- temp.1[1, 12]
138 |     }
139 | 
140 |     if (gene.proper == "") {
141 |         gene.proper <- gene.ENSG
142 |     }
143 | 
144 |     return(gene.proper)
145 | }
146 | 
147 | Translate <- cmpfun(Translate)
148 | 
149 | ##############################
150 | # Read GTEx-7's subject list #
151 | ##############################
152 | 
153 | raw    <- readRDS("Whole Blood_QCed_rpkm.rds")
154 | list.7 <- rownames(raw$qced.exp)
155 | 
156 | rm(raw)
157 | 
158 | ##########################################################################
159 | # Read GTEx-8's expression data (response.8.RData is already translated) #
160 | ##########################################################################
161 | 
162 | load("response.8.RData")
163 | 
164 | ####################
165 | # List of subjects #
166 | ####################
167 | 
168 | # Split subjects in GTEx-8 into two groups
169 | list.8 <- rownames(response.8)
170 | 
171 | list.train <- list.8[which(list.8 %in% list.7)]
172 | list.valid <- list.8[which(!(list.8 %in% list.7))]
173 | 
174 | ####################################
175 | # There are 19,227 genes in total! #
176 | ####################################
177 | 
178 | filenames <- dir("subset-hapmap3")
179 | 
180 | # Quick-submit gadget
181 | while (TRUE) {
182 |     if (length(dir("quick-submit/to-do")) == 0) {
183 |         print("All jobs are done!")
184 |         break
185 |     } else {
186 |         # Get the job
187 |         to_do <- as.numeric(dir("quick-submit/to-do"))
188 |         if (length(to_do) == 1) {
189 |             i <- to_do[1]
190 |         } else {
191 |             i <- sample(to_do, 1)
192 |         }
193 |         # Ship the job to "working" folder
194 |         save(null.object, file = paste0("quick-submit/working/", i))
195 |         file.remove(paste0("quick-submit/to-do/", i))
196 |     }
197 | 
198 |     # Start keeping track of runtime
199 |     time.start <- proc.time()[3]
200 | 
201 |     #################
202 |     # Preprocess ss #
203 |     #################
204 | 
205 |     load(paste0("ss-eqtlgen/subset-hapmap3/", filenames[i]))
206 | 
207 |     # Get the name and chromosome of to-be-processed gene
208 |     gene.ENSG   <- ss$Gene[1]
209 |     gene.proper <- Translate(gene.ENSG)
210 |     chr         <- ss$SNPChr[1]
211 | 
212 |     ##########################
213 |     # Clumping and filtering #
214 |     ##########################
215 | 
216 |     do.clumping <- FALSE
217 | 
218 |     # Obsolete
219 |     if (do.clumping) {
220 |         ###########
221 |         # Extract #
222 |         ###########
223 | 
224 |         # Create a temporary SNP list for Plink
225 |         temp.outdir  <- "/gpfs/home/zz17/current-project/temp/"
226 |         temp.snpfile <- paste0(temp.outdir, gene.ENSG, ".txt")
227 |         write.table(ss[, 1], file = temp.snpfile, quote = FALSE, col.names = FALSE, row.names = FALSE)
228 | 
229 |         # Put together the command and execute
230 |         command <- path.Plink
231 |         command <- paste0(command)
232 |         command <- paste0(command, " --bfile /gpfs/home/zz17/resource/1000-genome/sequence/1000G.EUR.ALLSNP.QC.CHR", chr)
233 |         command <- paste0(command, " --chr ", chr)
234 |         command <- paste0(command, " --extract ", temp.snpfile)
235 |         command <- paste0(command, " --make-bed")
236 |         command <- paste0(command, " --out ", temp.outdir, gene.ENSG)
237 | 
238 |         quiet(
239 |             system(command, intern = TRUE)
240 |         )
241 | 
242 |         ###########
243 |         # Process #
244 |         ###########
245 | 
246 |         # Create a temporary ss file and add a p-value column
247 |         ss$pval     <- 2 * pnorm(abs(ss$Zscore), lower.tail = FALSE)
248 |         temp.ssfile <- paste0(temp.outdir, gene.ENSG, "_ss.txt")
249 |         write.table(ss, file = temp.ssfile, quote = FALSE, col.names = TRUE, row.names = FALSE)
250 | 
251 |         # Put together the command and execute
252 |         command <- path.Plink
253 |         command <- paste0(command, " --bfile ", temp.outdir, gene.ENSG)
254 |         command <- paste0(command, " --clump ", temp.ssfile)
255 |         command <- paste0(command, " --clump-field pval")
256 |         command <- paste0(command, " --clump-kb 250")
257 |         command <- paste0(command, " --clump-p1 1")
258 |         command <- paste0(command, " --clump-r2 0.95")
259 |         command <- paste0(command, " --clump-snp-field SNP")
260 |         command <- paste0(command, " --out ", temp.outdir, gene.ENSG)
261 | 
262 |         quiet(
263 |             system(command, intern = TRUE, ignore.stdout = TRUE, ignore.stderr = TRUE)
264 |         )
265 | 
266 |         clumped.snp <- fread(paste0(temp.outdir, gene.ENSG, ".clumped"))
267 |         clumped.snp <- as.data.frame(clumped.snp)
268 |         clumped.snp <- clumped.snp[, "SNP"]
269 | 
270 |         # Clean up and update
271 |         system(paste0("rm ", temp.outdir, gene.ENSG, "*"))
272 | 
273 |         ss <- ss[ss$SNP %in% clumped.snp, ]
274 |     }
275 | 
276 |     ############
277 |     # Quick QC #
278 |     ############
279 | 
280 |     # Skip this round of iteration if ss is empty to begin with
281 |     if (nrow(ss) == 0) {
282 |         file.connection <- file(paste0("quick-submit/error/", i, ".error0"))
283 |         writeLines("Empty summary statistics to begin with!", file.connection)
284 |         close(file.connection)
285 |         file.remove(paste0("quick-submit/working/", i))
286 |         save(null.object, file=paste0("quick-submit/done/", i))
287 | 
288 |         next
289 |     }
290 | 
291 |     # Read the bim files of GTEx-8 and reference panel
292 |     bim.8   <- as.data.frame(fread(paste0("qced_chr", chr, ".bim")))
293 |     bim.ref <- as.data.frame(fread(paste0("1000G.EUR.ALLSNP.QC.CHR", chr, ".bim")))
294 | 
295 |     # Only keep the SNPs that are simultaneously IN (reference panel, GTEx-8)
296 |     ss <- ss[ss$SNP %in% bim.ref$V2 & ss$SNP %in% bim.8$V2, ]
297 | 
298 |     # Sieve out problematic SNPs
299 |     list.1  <- bim.8$V2[duplicated(bim.8$V2)]
300 |     list.2  <- bim.8$V2[nchar(bim.8$V5) > 1 | nchar(bim.8$V6) > 1]
301 |     list.3  <- bim.ref$V2[duplicated(bim.ref$V2)]
302 |     list.4  <- bim.ref$V2[nchar(bim.ref$V5) > 1 | nchar(bim.ref$V6) > 1]
303 |     list.5  <- ss$SNP[duplicated(ss$SNP)]
304 |     problem <- ss$SNP %in% list.1 | ss$SNP %in% list.2 | ss$SNP %in% list.3 | ss$SNP %in% list.4 | ss$SNP %in% list.5
305 | 
306 |     ss <- ss[!problem, ]
307 | 
308 |     rm(list.1)
309 |     rm(list.2)
310 |     rm(list.3)
311 |     rm(list.4)
312 |     rm(list.5)
313 |     rm(problem)
314 | 
315 |     # Skip this round of iteration if ss has few row left
316 |     if (nrow(ss) <= 1) {
317 |         file.connection <- file(paste0("quick-submit/error/", i, ".error1"))
318 |         writeLines("Empty summary statistics after QC!", file.connection)
319 |         close(file.connection)
320 |         file.remove(paste0("quick-submit/working/", i))
321 |         save(null.object, file=paste0("quick-submit/done/", i))
322 | 
323 |         next
324 |     }
325 | 
326 |     ########################################################
327 |     # Make sure that the gene of the hour is in response.8 #
328 |     ########################################################
329 | 
330 |     if (!(gene.proper %in% colnames(response.8))) {
331 |         file.connection <- file(paste0("quick-submit/error/", i, ".error2"))
332 |         writeLines("Gene is not in response.8 and we move on to the next gene!", file.connection)
333 |         close(file.connection)
334 |         file.remove(paste0("quick-submit/working/", i))
335 |         save(null.object, file=paste0("quick-submit/done/", i))
336 | 
337 |         next
338 |     }
339 | 
340 |     #################################################
341 |     # Preprocess genotype matrix of reference panel #
342 |     #################################################
343 | 
344 |     # Extract genotype information
345 |     quiet(
346 |         seq.ref <- BEDMatrix(paste0("1000G.EUR.ALLSNP.QC.CHR", chr), simple_names = TRUE)
347 |     )
348 | 
349 |     genotype.ref      <- seq.ref[, ss$SNP]
350 |     rm(seq.ref)
351 | 
352 |     # Skip this round of iteration if genotype is empty
353 |     if (ncol(genotype.ref) == 0) {
354 |         file.connection <- file(paste0("quick-submit/error/", i, ".error3"))
355 |         writeLines("Empty reference genotype matrix!", file.connection)
356 |         close(file.connection)
357 |         file.remove(paste0("quick-submit/working/", i))
358 |         save(null.object, file=paste0("quick-submit/done/", i))
359 | 
360 |         next
361 |     }
362 | 
363 |     # Compare to see if all (A1 and A2) pairs are well-aligned
364 |     # Always use reference panel's A1/A2 as the standard order
365 |     names(bim.ref)[2] <- "SNP"
366 |     bim.temp <- subset(bim.ref, SNP %in% ss$SNP)
367 |     ss.temp  <- left_join(ss, bim.temp, by = "SNP")
368 |     problem  <- !(ss.temp$a1 == ss.temp$V5)
369 | 
370 |     if (sum(problem) != 0) {
371 |         # Flip the problematic pairs
372 |         ss$Zscore[problem] <- -1 * ss$Zscore[problem]
373 | 
374 |         ss$a1 <- ss.temp$V5
375 |         ss$a2 <- ss.temp$V6
376 |     }
377 | 
378 |     rm(bim.temp)
379 |     rm(ss.temp)
380 |     rm(problem)
381 | 
382 |     # Patch up the NAs and centralize genotype matrix
383 |     genotype.ref <- PatchUp(genotype.ref)
384 |     genotype.ref <- Standardize(genotype.ref)
385 | 
386 |     #######################################
387 |     # Preprocess genotype matrix of GTEx8 #
388 |     #######################################
389 | 
390 |     # The big genotype matrix
391 |     quiet(
392 |         seq.8 <- BEDMatrix(paste0("qced_chr", chr), simple_names = TRUE)
393 |     )
394 | 
395 |     # Remove the "0_" part of subject IDs
396 |     rownames(seq.8) <- gsub("0_", "", rownames(seq.8))
397 | 
398 |     # Subset by list.8 and ID from ss
399 |     genotype.8 <- seq.8[list.8, ss$SNP]
400 | 
401 |     # Skip this round of iteration if genotype is empty
402 |     if (ncol(genotype.8) == 0) {
403 |         file.connection <- file(paste0("quick-submit/error/", i, ".error4"))
404 |         writeLines("Empty genotype.8 matrix!", file.connection)
405 |         close(file.connection)
406 |         file.remove(paste0("quick-submit/working/", i))
407 |         save(null.object, file=paste0("quick-submit/done/", i))
408 | 
409 |         next
410 |     }
411 | 
412 |     # Compare to see if all (a1 and a2) are well-aligned
413 |     names(bim.ref)[2] <- "SNP"
414 |     names(bim.8)[2]   <- "SNP"
415 |     bim.temp1 <- subset(bim.ref, SNP %in% ss$SNP)
416 |     bim.temp2 <- subset(bim.8, SNP %in% ss$SNP)
417 |     bim.temp3 <- left_join(bim.temp1, bim.temp2, by = "SNP")
418 |     problem   <- !(bim.temp3$V5.x == bim.temp3$V5.y)
419 | 
420 |     if (sum(problem) != 0) {
421 |         genotype.8[, problem] <- 2 - genotype.8[, problem]
422 |     }
423 | 
424 |     rm(bim.temp1)
425 |     rm(bim.temp2)
426 |     rm(bim.temp3)
427 |     rm(seq.8)
428 |     rm(problem)
429 | 
430 |     #######################################################
431 |     # EXTREMELY ANNOYING ISSUE FOUND!!!!!!!!!!!!!!!!!!!!! #
432 |     #######################################################
433 | 
434 |     # Subject number is NOT the same for all chromosomes!
435 |     genotype.train <- genotype.8[which(rownames(genotype.8) %in% list.train), ]
436 |     genotype.train <- scale(PatchUp(genotype.train))
437 | 
438 |     genotype.valid <- genotype.8[which(rownames(genotype.8) %in% list.valid), ]
439 |     genotype.valid <- scale(PatchUp(genotype.valid))
440 | 
441 |     genotype.train[is.na(genotype.train)] <- 0
442 |     genotype.valid[is.na(genotype.valid)] <- 0
443 | 
444 |     ##########################
445 |     # The original LD matrix #
446 |     ##########################
447 | 
448 |     matrix.LD <- t(genotype.ref) %*% genotype.ref / nrow(genotype.ref)
449 | 
450 |     #########################
451 |     # Additional adjustment #
452 |     #########################
453 | 
454 |     do.adjust <- TRUE
455 | 
456 |     if (do.adjust) {
457 |         # Some given parameters
458 |         cutoff        <- 0.001
459 |         matrix.adjust <- matrix(0, nrow(ss), nrow(ss))
460 |         n.effective   <- 11400
461 |         n.size        <- 183
462 | 
463 |         temp.cof <- (-2 * n.effective) / n.size
464 | 
465 |         # Read distance information and create an identifer
466 |         distance <- as.data.frame(fread(paste0("chr", chr, ".OMNI.interpolated_genetic_map")))
467 |         names(distance) <- c("SNP", "position", "distance")
468 | 
469 |         # Merge and process
470 |         ss.temp <- left_join(ss, distance, by = "SNP")
471 | 
472 |         for (col in 1:(nrow(ss.temp) - 1)) {
473 |             for (row in (col + 1):nrow(ss.temp)) {
474 |                 entry.temp <- exp(temp.cof * abs(ss.temp[col, "distance"] - ss.temp[row, "distance"]))
475 |                 entry.temp <- entry.temp * (entry.temp >= cutoff)
476 |                 matrix.adjust[row, col] <- entry.temp
477 |             }
478 |         }
479 | 
480 |         matrix.adjust <- t(matrix.adjust) + matrix.adjust + diag(nrow(ss.temp))
481 | 
482 |         rm(distance)
483 |         rm(ss.temp)
484 |         rm(temp.cof)
485 | 
486 |         ###################
487 |         # Final LD matrix #
488 |         ###################
489 | 
490 |         # "*" stands for elementwise multiplication
491 |         matrix.LD <- matrix.LD * matrix.adjust
492 |     }
493 | 
494 |     ############
495 |     # r vector #
496 |     ############
497 | 
498 |     # Compute r vector
499 |     r <- ss$Zscore / sqrt(ss$NrSamples - 1 + ss$Zscore ^2)
500 | 
501 |     # Get size
502 |     size <- nrow(ss)
503 | 
504 |     ##########################
505 |     # Iterate By parameter s #
506 |     ##########################
507 | 
508 |     result <- list()
509 | 
510 |     for (k in 1:length(s.array)) {
511 |         s <- s.array[k]
512 | 
513 |         #########################################
514 |         # Constructing the initial lambda array #
515 |         #########################################
516 | 
517 |         # Get the maximum for lambda
518 |         z.temp <- numeric(size)
519 |         for (m in 1:size) {
520 |             z.temp[m] <- abs(r[m])
521 |         }
522 | 
523 |         lambda.max   <- max(z.temp)
524 |         lambda.min   <- lambda.max * 1E-3
525 |         lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 10000)
526 |         rm(z.temp)
527 | 
528 |         ###########################################
529 |         # Explore the possible minimum for lambda #
530 |         ###########################################
531 | 
532 |         ######################################
533 |         # What do 0, 1, and 2 stand for?     #
534 |         # 0: Not tested                      #
535 |         # 1: tested and no problem detected  #
536 |         # 2: tested and problem is detected  #
537 |         ######################################
538 | 
539 |         problem <- numeric(10000)
540 |         start   <- 1
541 |         end     <- 10000
542 | 
543 |         max.iteration <- 300
544 | 
545 |         threshold <- 100
546 |         alpha     <- 0.5
547 | 
548 |         if (method == "SCAD") {
549 |             gamma <- 3.7
550 |         } else {
551 |             gamma <- 3
552 |         }
553 | 
554 |         dummy.detect <- TRUE
555 | 
556 |         while (dummy.detect) {
557 |             # Stop if all lambdas are OK
558 |             if (sum(problem == rep(1, 10000)) == 10000) {
559 |                 print("All lambdas check out!")
560 |                 break
561 |             }
562 | 
563 |             start <- min(which(problem == 0))
564 |             end   <- max(which(problem == 0))
565 | 
566 |             try    <- ceiling((start + end) /2)
567 |             lambda <- lambda.array[try]
568 | 
569 |             if (method == "MCP") {
570 |                 problem[try] <- MCPDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try)
571 |             }
572 |             if (method == "LASSO") {
573 |                 problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold, try)
574 |             }
575 |             if (method == "ElNet") {
576 |                 problem[try] <- ElNetDetect(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold, try)
577 |             }
578 |             if (method == "MNet") {
579 |                 problem[try] <- MNetDetect(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold, try)
580 |             }
581 |             if (method == "SCAD") {
582 |                 problem[try] <- SCADDetect(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold, try)
583 |             }
584 | 
585 |             if (problem[try] == 1) {
586 |                 problem[1:try] <- 1
587 |             } else {
588 |                 problem[try:10000] <- 2
589 |             }
590 | 
591 |             if (problem[try] == 1 & problem[min(10000, try + 1)] == 2) {
592 |                 print(paste0("Problem detected for lambda=", lambda.array[try + 1], ". Finer interval has been saved!"))
593 |                 lambda.min   <- lambda
594 |                 dummy.detect <- FALSE
595 |             }
596 | 
597 |             if (problem[try] == 2 & problem[try - 1] == 1) {
598 |                 print(paste0("Problem detected for lambda=", lambda.array[try], ". Finer interval has been saved!"))
599 |                 lambda.min   <- lambda.array[try - 1]
600 |                 dummy.detect <- FALSE
601 |             }
602 |         }
603 | 
604 |         # Update the lambda array
605 |         lambda.array <- exp(1) ^ seq(log(lambda.max), log(lambda.min), length = 100)
606 | 
607 |         #######################################################################
608 |         # Optimizing with the chosen penalty (using the updated lambda array) #
609 |         #######################################################################
610 | 
611 |         #####################
612 |         # Iterate by lambda #
613 |         #####################
614 | 
615 |         beta <- t(matrix(0, nrow = 1, ncol = size))
616 | 
617 |         max.iteration <- 300
618 | 
619 |         threshold <- 1e-6
620 |         alpha     <- 0.5
621 | 
622 |         if (method == "SCAD") {
623 |             gamma <- 3.7
624 |         } else {
625 |             gamma <- 3
626 |         }
627 | 
628 |         if (method == "MCP") {
629 |             res <- MCP(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold)
630 |         }
631 |         if (method == "LASSO") {
632 |             res <- ElNet(r, matrix.LD, lambda.array, s, 1, max.iteration, threshold)
633 |         }
634 |         if (method == "ElNet") {
635 |             res <- ElNet(r, matrix.LD, lambda.array, s, 0.5, max.iteration, threshold)
636 |         }
637 |         if (method == "MNet") {
638 |             res <- MNet(r, matrix.LD, lambda.array, s, alpha, gamma, max.iteration, threshold)
639 |         }
640 |         if (method == "SCAD") {
641 |             res <- SCAD(r, matrix.LD, lambda.array, s, gamma, max.iteration, threshold)
642 |         }
643 | 
644 |         for (b in 1:100) {
645 |             result[[length(result) + 1]] <- list(res[, b], lambda.array[b])
646 |         }
647 |     }
648 | 
649 |     ############################################################################################################################
650 |     #         -----.                   osssso            ---------..`                                                          #
651 |     #        `ssssso                   osssso           `sssssssssssss+:`                                                      #
652 |     #        `ssssso       `...`       osssso   ``      `ssssssssssssssss/        `...`                ```          `...`      #
653 |     #        `ssssso   `/osssssss+:`   osssso:osssso/`  `sssss/  `.:osssss+   `:+sssssss+:`   +sssso.+sssso/`    -+sssssss+-   #
654 |     #        `ssssso  /sssssssssssso-  ossssssssssssss- `sssss/     `ssssss` :sssssssssssss-  +ssssssssssssso  .ossso/::ossso` #
655 |     #        `ssssso -sssss:``./sssss` osssso-``./sssss``sssss/      osssss..sssss:``./sssss. +sssss-``osssss` ossss/---:ssss+ #
656 |     #        .ssssso /ssss+    `sssss. ossss/    `sssss.`sssss/    `/ssssso :sssso     sssss- +sssss   /sssss``sssssssssssssso #
657 |     # .:/oso/osssss/ .sssss+--:osssso  osssss/--:osssso `ssssso+++ossssss+` `sssss+:-:+sssso` +sssss   /sssss` +ssss/`  `-`    #
658 |     # .ossssssssss+`  .osssssssssss+`  ossssosssssssso. `ssssssssssssss+-    .osssssssssss+.  +sssss   /sssss` `/sssssssssso/` #
659 |     #   -/+osso+/.      ./+oosoo+:`    /++++.-/+oo+/-   `++++++++//:-.`        .:+oosoo+:.    :+++++   :+++++`   `:/oosoo+/-`  #
660 |     ############################################################################################################################
661 | 
662 |     #########################
663 |     # Select optimal lambda #
664 |     #########################
665 | 
666 |     #############################
667 |     # Calculate goodness of fit #
668 |     #############################
669 | 
670 |     ###############################################
671 |     # Structure of the list result/result.optimal #
672 |     ###############################################
673 |     # index:                                      #
674 |     # [[1]]  beta vector                          #
675 |     # [[2]]  lambda                               #
676 |     # [[3]]  r square adjusted (to be added)      #
677 |     # [[4]]  regression coefficient               #
678 |     # [[5]]  p-value of r square                  #
679 |     # [[6]]  r-square projected by GTEx8 - GTEx7  #
680 |     # [[7]]  runtime for the gene                 #
681 |     # [[8]]  average sample size                  #
682 |     # [[9]]  iteration index                      #
683 |     # [[10]] correlation                          #
684 |     ###############################################
685 | 
686 |     # Get the true response vector
687 |     response.true <- response.8[list.train, which(colnames(response.8) == gene.proper)]
688 | 
689 |     # Figure out if response.true is just a vector or multiple vectors and find the optimal
690 |     if (class(response.true) == "numeric") {
691 |         result.optimal <- FindOptimalResult(response.true)
692 |     } else {
693 |         max <- -1
694 |         for (asd in 1:ncol(response.true)) {
695 |             temp <- FindOptimalResult(response.true[, asd])[[3]]
696 |             if (temp > max) {
697 |                 max <- temp
698 |                 n.col <- asd
699 |             }
700 |         }
701 | 
702 |         response.true  <- response.true[, n.col]
703 |         result.optimal <- FindOptimalResult(response.true)
704 |     }
705 | 
706 |     # Compute the P-value of R square
707 |     response.pred <- genotype.train %*% result.optimal[[1]]
708 |     quiet(
709 |         test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater")
710 |     )
711 |     result.optimal[[5]] <- test$p.value
712 | 
713 |     # Little tweak to make beta vector an one-column matrix
714 |     result.optimal[[1]] <- as.matrix(result.optimal[[1]])
715 | 
716 |     ##############################
717 |     # Validate model with GTex-8 #
718 |     ##############################
719 | 
720 |     # Get the response vectors
721 |     response.true <- response.8[list.valid, which(colnames(response.8) == gene.proper)]
722 | 
723 |     if (exists("n.col")) {
724 |         response.true <- response.true[, n.col]
725 |         rm(n.col)
726 |     }
727 | 
728 |     response.pred <- genotype.valid %*% result.optimal[[1]]
729 | 
730 |     # Cross validation with subject that are NOT INCLUDED in GTEx7
731 |     reg                 <- summary(lm(response.true ~ response.pred))
732 |     result.optimal[[6]] <- reg$adj.r.squared
733 | 
734 |     # Keep track of runtime
735 |     time.end <- proc.time()[3]
736 |     result.optimal[[7]] <- time.end - time.start
737 | 
738 |     # Keep track of average sample size and miscellaneous items
739 |     result.optimal[[8]]  <- mean(ss$NrSamples)
740 |     result.optimal[[9]]  <- i
741 |     result.optimal[[10]] <- cor(response.true, response.pred)
742 | 
743 |     ########
744 |     # Save #
745 |     ########
746 | 
747 |     save <- result.optimal
748 | 
749 |     dir.create(paste0("output_", batch), showWarnings = FALSE)
750 |     save.chr <- paste0("output_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData")
751 | 
752 |     save(save, file = save.chr)
753 | 
754 |     ########################
755 |     # For TWAS-fusion only #
756 |     ########################
757 | 
758 |     # cv.performance
759 |     cv.performance           <- matrix(nrow = 2, ncol = 1)
760 |     colnames(cv.performance) <- method
761 |     rownames(cv.performance) <- c("rsq", "pval")
762 |     cv.performance[1, 1]     <- reg$adj.r.squared
763 |     quiet(
764 |         test <- cor.test(as.vector(response.true), as.vector(response.pred), method = "pearson", alternative = "greater")
765 |     )
766 |     cv.performance[2, 1]     <- test$p.value
767 | 
768 |     # snps
769 |     snps <- ss
770 | 
771 |     # wgt.matrix
772 |     wgt.matrix           <- result.optimal[[1]]
773 |     colnames(wgt.matrix) <- method
774 |     rownames(wgt.matrix) <- ss$SNP
775 | 
776 |     dir.create(paste0("output_twas_", batch), showWarnings = FALSE)
777 |     save.twas.chr <- paste0("output_twas_", batch, "/Whole_Blood.", gene.ENSG, ".wgt.RData")
778 |     save(cv.performance, snps, wgt.matrix, file = save.twas.chr)
779 | 
780 |     ###############
781 |     # Cleaning-up #
782 |     ###############
783 | 
784 |     file.remove(paste0("quick-submit/working/", i))
785 |     save(null.object, file = paste0("quick-submit/done/", i))
786 | }
787 | 


--------------------------------------------------------------------------------
/step2_pos.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | array  <- Sys.getenv("SLURM_ARRAY_TASK_ID")
  3 | id.job <- as.numeric(array)
  4 | 
  5 | # Packages
  6 | library(BEDMatrix)
  7 | suppressMessages(library(data.table))
  8 | suppressMessages(library(ddpcr))
  9 | suppressMessages(library(dplyr))
 10 | library(optparse)
 11 | 
 12 | # Options
 13 | option_list <- list(
 14 |     make_option("--models",   type = "character", default = FALSE, action = "store", help = "Path of the model folder"
 15 |     ),
 16 |     make_option("--path.ref", type = "character", default = FALSE, action = "store", help = "Path of the reference panel plus the prefixes"
 17 |     ),
 18 |     make_option("--path.ss",  type = "character", default = FALSE, action = "store", help = "Path of the summary statistics"
 19 |     ),
 20 |     make_option("--trait"   , type = "character", default = FALSE, action = "store", help = "Name of the summary statistics"
 21 |     ),
 22 |     make_option("--path.out", type = "character", default = FALSE, action = "store", help = "Path of the output file plus the prefixes"
 23 |     ),
 24 |     make_option("--parallel", type = "numeric",   default = 100,   action = "store", help = "The number of parallel instances"
 25 |     )    
 26 | )
 27 | 
 28 | opt <- parse_args(OptionParser(option_list = option_list))
 29 | 
 30 | path.model <- opt$models
 31 | path.ref   <- opt$path.ref
 32 | path.ss    <- opt$path.ss
 33 | trait      <- opt$trait
 34 | path.out   <- opt$path.out
 35 | pieces     <- opt$parallel
 36 | 
 37 | # User-defined functions
 38 | # ACAT
 39 | ACAT <- function(Pvals, Weights = NULL) {
 40 |     if (sum(is.na(Pvals)) > 0) {
 41 |         stop("Cannot have NAs in the p-values!")
 42 |     }
 43 |     if ((sum(Pvals < 0) + sum(Pvals > 1)) > 0){
 44 |         stop("P-values must be between 0 and 1!")
 45 |     }
 46 |     is.zero <- (sum(Pvals == 0) >= 1)
 47 |     is.one  <- (sum(Pvals == 1) >= 1)
 48 |     if (is.zero && is.one) {
 49 |         return(-1)
 50 |     }
 51 |     if (is.zero) {
 52 |         return(0)
 53 |     }
 54 |     if (is.one) {
 55 |         return(1)
 56 |     }
 57 | 
 58 |     if (is.null(Weights)) {
 59 |         Weights <- rep(1 / length(Pvals), length(Pvals))
 60 |     } else if (length(Weights) != length(Pvals)) {
 61 |         stop("The length of weights should be the same as that of the p-values")
 62 |     } else if (sum(Weights < 0) > 0){
 63 |         stop("All the weights must be positive!")
 64 |     } else {
 65 |         Weights <- Weights / sum(Weights)
 66 |     }
 67 | 
 68 |     is.small <- (Pvals < 1e-16)
 69 |     if (sum(is.small) == 0){
 70 |         cct.stat <- sum(Weights * tan((0.5 - Pvals) * pi))
 71 |     } else {
 72 |         cct.stat <- sum((Weights[is.small] / Pvals[is.small]) / pi)
 73 |         cct.stat <- cct.stat + sum(Weights[!is.small] * tan((0.5 - Pvals[!is.small]) * pi))
 74 |     }
 75 | 
 76 |     if (cct.stat > 1e15){
 77 |         pval <- (1 / cct.stat) / pi
 78 |     } else {
 79 |         pval <- pcauchy(cct.stat, lower.tail = F)
 80 |     }
 81 | 
 82 |     return(pval)
 83 | }
 84 | 
 85 | # PatchUp
 86 | PatchUp <- function(M) {
 87 |     M <- apply(M, 2, function(x) {
 88 |         x[is.na(x)] <- mean(x, na.rm = TRUE)
 89 |         return(x)
 90 |     })
 91 | 
 92 |     return(M)
 93 | }
 94 | 
 95 | # Object to store the output
 96 | out <- data.frame(
 97 |     gene_symbol = character(),
 98 |     gene_id     = character(),
 99 |     chromosome  = numeric(),
100 |     model_best  = character(),
101 |     r2_test     = numeric(),
102 |     p_SCAD      = numeric(),
103 |     p_ElNet     = numeric(),
104 |     p_LASSO     = numeric(),
105 |     p_MCP       = numeric(),
106 |     p_MNet      = numeric(),
107 |     z_SCAD      = numeric(),
108 |     z_ElNet     = numeric(),
109 |     z_LASSO     = numeric(),
110 |     z_MCP       = numeric(),
111 |     z_MNet      = numeric(),
112 |     p_ACAT      = numeric(),
113 |     gene_pos    = numeric(),
114 |     runtime     = numeric(),
115 |     stringsAsFactors = FALSE
116 | )
117 | 
118 | # Total number of jobs should be 16884
119 | files <- dir(path.model)
120 | 
121 | size.pieces <- ceiling(length(files) / pieces)
122 | 
123 | # Main iteration
124 | for (gene.index in (1 + (id.job - 1) * size.pieces):(min(length(files), id.job * size.pieces))) {
125 |     print(gene.index)
126 | 
127 |     # Start tracking runtime
128 |     time.start <- proc.time()[3]
129 | 
130 |     # The vector to store all the updates in this iteration
131 |     update <- rep(NA, 18)
132 | 
133 |     # Load weight
134 |     load(paste0(path.model, files[gene.index]))
135 |     update[5] <- max(out.r2[1, ], na.rm = TRUE)
136 |     if (update[5] <= 0.005) {
137 |         next
138 |     }
139 | 
140 |     # Get the chromosome
141 |     chr <- snps$SNPChr[1]
142 | 
143 |     # Read the summary statistics file
144 |     if (file.exists(paste0(path.ss, trait, "-", chr, ".sumstats"))) {
145 |         ss <- paste0(path.ss, trait, "-", chr, ".sumstats") %>% fread() %>% as.data.frame()
146 |     } else {
147 |         next
148 |     }
149 |     names(ss) <- colnames(ss) %>% tolower()
150 |     ss["ID"] <- paste0(ss$chr, "_", ss$pos)
151 | 
152 |     # Load the reference panel
153 |     quiet(
154 |         bim.ref <- as.data.frame(fread(paste0(path.ref, chr, ".bim")))
155 |     )
156 |     quiet(
157 |         genotype.ref <- BEDMatrix(paste0(path.ref, chr), simple_names = TRUE)
158 |     )
159 | 
160 |     names(bim.ref)[c(2, 5, 6)] <- c("SNP" ,"a1", "a2")
161 | 
162 |     # Find the best model
163 |     model.best <- which.max(out.r2[1, ])
164 | 
165 |     # Create new identifier for reference panel
166 |     bim.ref["ID"] <- paste0(bim.ref$V1, "_", bim.ref$V4)
167 | 
168 |     # Find the common snps in all three data sets
169 |     list.common <- intersect(bim.ref$ID, snps$ID) %>% intersect(., ss$ID)
170 | 
171 |     # Skip if no common snps found
172 |     if (length(list.common) == 0) {
173 |         next
174 |     }
175 | 
176 |     # Trim genotype.ref
177 |     genotype.temp <- genotype.ref[, bim.ref$ID %in% list.common]
178 |     bim.temp      <- bim.ref[bim.ref$ID %in% list.common, ]
179 | 
180 |     # Fix the NAs in reference panel
181 |     if (sum(is.na(genotype.temp)) != 0) {
182 |         genotype.temp <- PatchUp(genotype.temp)
183 |     }
184 | 
185 |     # Trim ss
186 |     ss.temp <- ss[ss$ID %in% list.common, ]
187 | 
188 |     # Trim snps and wgt.matrix
189 |     index.temp <- snps$ID %in% list.common
190 | 
191 |     snps       <- snps[index.temp, ]
192 |     out.weight <- out.weight[index.temp, ]
193 |     if ("numeric" %in% class(out.weight)) {
194 |         out.weight <- t(as.matrix(out.weight))
195 |     }
196 | 
197 |     rm(index.temp)
198 | 
199 |     # New
200 |     # Re-arrange every data sets
201 |     m.1 <- match(ss.temp$ID, bim.temp$ID)
202 |     m.2 <- match(ss.temp$ID, snps$ID)
203 | 
204 |     bim.temp      <- bim.temp[m.1, ]
205 |     genotype.temp <- genotype.temp[, m.1]
206 | 
207 |     snps       <- snps[m.2, ]
208 |     out.weight <- out.weight[m.2, ]
209 | 
210 |     # Align the mismatched alleles
211 |     problem.1 <- ss.temp$a1 != bim.temp$a1
212 |     genotype.temp[, problem.1] <- 2 - genotype.temp[, problem.1]
213 | 
214 |     problem.2 <- ss.temp$a1 != snps$a1
215 |     # flipped <- snps$SNP[problem.2]
216 |     out.weight[problem.2, ] <- -1 * out.weight[problem.2, ]
217 | 
218 |     # # Old
219 |     # # Re-arrange every data sets
220 |     # m.1 <- match(bim.temp$ID, ss.temp$ID)
221 |     # m.2 <- match(bim.temp$ID, snps$ID)
222 | 
223 |     # ss.temp    <- ss.temp[m.1, ]
224 |     # snps       <- snps[m.2, ]
225 |     # out.weight <- out.weight[m.2, ]
226 | 
227 |     # # Align the mismatched alleles
228 |     # problem <- ss.temp$a1 != snps$a1
229 |     # ss.temp$a1 <- snps$a1
230 |     # ss.temp$a2 <- snps$a2
231 |     # ss.temp$beta[problem] <- -1 * ss.temp$beta[problem]
232 |     # ss.temp$z[problem] <- -1 * ss.temp$z[problem]
233 | 
234 |     # Compute LD matrix
235 |     genotype.temp <- scale(genotype.temp)
236 |     matrix.LD  <- t(genotype.temp) %*% genotype.temp / (nrow(genotype.temp) - 1)
237 | 
238 |     # Catch: When there is only one row in wgt.matrix
239 |     if ("numeric" %in% class(out.weight)) {
240 |         out.weight <- out.weight %>% as.matrix() %>% t() %>% as.data.frame()
241 |     }
242 | 
243 |     # Catch: Over-fitting
244 |     for (qwe in 1:5) {
245 |         if (max(abs(out.weight[, qwe])) >= 10) {
246 |             out.weight[, qwe] <- 0
247 |         }
248 |     }
249 | 
250 |     # Iteration by method
251 |     for (w in 1:ncol(out.weight)) {
252 |         # Settings
253 |         weights <- out.weight[, w]
254 | 
255 |         # Skip if weight is a zero vector
256 |         if (sum(weights) == 0) {
257 |             update[5 + w] <- NA
258 |             next
259 |         }
260 | 
261 |         # Keep the non-zero components of weights vector
262 |         keep <- (weights != 0)
263 |         print(sum(keep))
264 |         weights <- weights[keep]
265 |         #print(sum(names(weights) %in% flipped))
266 | 
267 |         # Compute TWAS z-score, r2, and p-value
268 |         z.twas  <- as.numeric(weights %*% ss.temp$z[keep])
269 |         r2.twas <- as.numeric(weights %*% matrix.LD[keep, keep] %*% weights)
270 | 
271 |         update[5 + w]  <- 2 * (pnorm(abs(z.twas / sqrt(r2.twas)), lower.tail = F))
272 |         update[10 + w] <- z.twas
273 |     }
274 | 
275 |     # ACAT
276 |     check.na   <- !is.na(update[6:10])
277 |     check.sign <- out.r2[1, ] > 0
278 | 
279 |     check.final <- check.na & check.sign
280 | 
281 |     if (sum(check.final) == 0) {
282 |         update[16] <- NA
283 |     } else {
284 |         update[16] <- ACAT(update[6:10][check.final], out.r2[1, check.final] / sum(out.r2[1, check.final]))
285 |     }
286 | 
287 |     # Stop tracking runtime
288 |     time.end <- proc.time()[3]
289 | 
290 |     #################
291 |     # Output format #
292 |     #################
293 |     # 1.     Gene symbol
294 |     # 2.     ENSG id
295 |     # 3.     Chromosome
296 |     # 4.     Best model
297 |     # 5.     Best model's R^2 on testing data (Skipped)
298 |     # 6-10.  P-value of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
299 |     # 11-15. Z-score of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
300 |     # 16.    P-value from ACAT
301 |     # 17.    Gene position
302 |     # 18.    Runtime
303 | 
304 |     # Update
305 |     update[1]  <- snps$GeneSymbol[1]
306 |     update[2]  <- snps$Gene[1]
307 |     update[3]  <- chr
308 |     update[4]  <- names(model.best)
309 |     update[17] <- snps$GenePos[1]
310 |     update[18] <- time.end - time.start
311 | 
312 |     out[nrow(out) + 1, ] <- update
313 | }
314 | 
315 | # Write the result
316 | dir.create(paste0(path.out, trait))
317 | write.table(out, file = paste0(path.out, trait, "/", trait, "-", id.job), row.names = FALSE, quote = FALSE)
318 | 


--------------------------------------------------------------------------------
/step2_rsid.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | array  <- Sys.getenv("SLURM_ARRAY_TASK_ID")
  3 | job.id <- as.numeric(array)
  4 | 
  5 | # Packages
  6 | library(BEDMatrix)
  7 | suppressMessages(library(data.table))
  8 | suppressMessages(library(ddpcr))
  9 | suppressMessages(library(dplyr))
 10 | library(optparse)
 11 | 
 12 | # Options
 13 | option_list <- list(
 14 |     make_option("--models",   type = "character", default = FALSE, action = "store", help = "Path of the model folder"
 15 |     ),
 16 |     make_option("--path.ref", type = "character", default = FALSE, action = "store", help = "Path of the reference panel plus the prefixes"
 17 |     ),
 18 |     make_option("--path.ss",  type = "character", default = FALSE, action = "store", help = "Path of the summary statistics"
 19 |     ),
 20 |     make_option("--trait"   , type = "character", default = FALSE, action = "store", help = "Name of the summary statistics"
 21 |     ),
 22 |     make_option("--path.out", type = "character", default = FALSE, action = "store", help = "Path of the output file plus the prefixes"
 23 |     ),
 24 |     make_option("--parallel", type = "numeric",   default = 100,   action = "store", help = "The number of parallel instances"
 25 |     )    
 26 | )
 27 | 
 28 | opt <- parse_args(OptionParser(option_list = option_list))
 29 | 
 30 | path.model <- opt$models
 31 | path.ref   <- opt$path.ref
 32 | path.ss    <- opt$path.ss
 33 | trait      <- opt$trait
 34 | path.out   <- opt$path.out
 35 | pieces     <- opt$parallel
 36 | 
 37 | # User-defined functions
 38 | # ACAT
 39 | ACAT <- function(Pvals, Weights = NULL) {
 40 |     if (sum(is.na(Pvals)) > 0) {
 41 |         stop("Cannot have NAs in the p-values!")
 42 |     }
 43 | 
 44 |     if ((sum(Pvals < 0) + sum(Pvals > 1)) > 0){
 45 |         stop("P-values must be between 0 and 1!")
 46 |     }
 47 | 
 48 |     is.zero <- (sum(Pvals == 0) >= 1)
 49 |     is.one  <- (sum(Pvals == 1) >= 1)
 50 |     if (is.zero && is.one) {
 51 |         return(-1)
 52 |     }
 53 | 
 54 |     if (is.zero) {
 55 |         return(0)
 56 |     }
 57 | 
 58 |     if (is.one) {
 59 |         return(1)
 60 |     }
 61 | 
 62 |     if (is.null(Weights)) {
 63 |         Weights <- rep(1 / length(Pvals), length(Pvals))
 64 |     } else if (length(Weights) != length(Pvals)) {
 65 |         stop("The length of weights should be the same as that of the p-values")
 66 |     } else if (sum(Weights < 0) > 0){
 67 |         stop("All the weights must be positive!")
 68 |     } else {
 69 |         Weights <- Weights / sum(Weights)
 70 |     }
 71 | 
 72 |     is.small <- (Pvals < 1e-16)
 73 |     if (sum(is.small) == 0){
 74 |         cct.stat <- sum(Weights * tan((0.5 - Pvals) * pi))
 75 |     } else {
 76 |         cct.stat <- sum((Weights[is.small] / Pvals[is.small]) / pi)
 77 |         cct.stat <- cct.stat + sum(Weights[!is.small] * tan((0.5 - Pvals[!is.small]) * pi))
 78 |     }
 79 | 
 80 |     if (cct.stat > 1e15){
 81 |         pval <- (1 / cct.stat) / pi
 82 |     } else {
 83 |         pval <- pcauchy(cct.stat, lower.tail = F)
 84 |     }
 85 | 
 86 |     return(pval)
 87 | }
 88 | 
 89 | # PatchUp
 90 | PatchUp <- function(M) {
 91 |     M <- apply(M, 2, function(x) {
 92 |         x[is.na(x)] <- mean(x, na.rm = TRUE)
 93 |         return(x)
 94 |     })
 95 | 
 96 |     return(M)
 97 | }
 98 | 
 99 | # Object to store the output
100 | out <- data.frame(
101 |     gene_symbol = character(),
102 |     gene_id     = character(),
103 |     chromosome  = numeric(),
104 |     model_best  = character(),
105 |     r2_test     = numeric(),
106 |     p_SCAD      = numeric(),
107 |     p_ElNet     = numeric(),
108 |     p_LASSO     = numeric(),
109 |     p_MCP       = numeric(),
110 |     p_MNet      = numeric(),
111 |     z_SCAD      = numeric(),
112 |     z_ElNet     = numeric(),
113 |     z_LASSO     = numeric(),
114 |     z_MCP       = numeric(),
115 |     z_MNet      = numeric(),
116 |     p_ACAT      = numeric(),
117 |     gene_pos    = numeric(),
118 |     runtime     = numeric(),
119 |     stringsAsFactors = FALSE
120 | )
121 | 
122 | # Total number of jobs = 16864
123 | files <- dir(path.model)
124 | 
125 | size.pieces <- ceiling(length(files) / pieces)
126 | 
127 | # Main iteration
128 | for (gene.index in (1 + (id.job - 1) * size.pieces):(min(length(files), id.job * size.pieces))) {
129 |     print(gene.index)
130 | 
131 |     # Start tracking runtime
132 |     time.start <- proc.time()[3]
133 | 
134 |     # The vector to store all the updates in this iteration
135 |     update <- rep(NA, 18)
136 | 
137 |     # Load weight
138 |     load(paste0(path.model, files[gene.index]))
139 |     update[5] <- max(out.r2[1, ], na.rm = TRUE)
140 |     if (update[5] <= 0.005) {
141 |         next
142 |     }
143 | 
144 |     # Get the chromosome
145 |     chr <- snps$SNPChr[1]
146 | 
147 |     # Read the summary statistics file
148 |     if (file.exists(paste0(path.ss, trait, "-", chr, ".sumstats"))) {
149 |         ss <- paste0(path.ss, trait, "-", chr, ".sumstats" ) %>% fread() %>% as.data.frame()
150 |     } else {
151 |         next
152 |     }
153 |     names(ss) <- colnames(ss) %>% tolower()
154 | 
155 |     # Load the reference panel
156 |     quiet(
157 |         bim.ref <- as.data.frame(fread(paste0(path.ref, chr, ".bim")))
158 |     )
159 |     quiet(
160 |         genotype.ref <- BEDMatrix(paste0(path.ref, chr), simple_names = TRUE)
161 |     )
162 | 
163 |     names(bim.ref)[c(2, 5, 6)] <- c("SNP" ,"a1", "a2")
164 | 
165 |     # Find the best model
166 |     model.best <- which.max(out.r2[1, ])
167 | 
168 |     # Find the common snps in all three data sets
169 |     list.common <- intersect(intersect(bim.ref$SNP, snps$SNP), ss$snp)
170 | 
171 |     # Skip if no common snps found
172 |     if (length(list.common) == 0) {
173 |         next
174 |     }
175 | 
176 |     # Trim genotype.ref
177 |     genotype.temp <- genotype.ref[, bim.ref$SNP %in% list.common]
178 |     bim.temp      <- bim.ref[bim.ref$SNP %in% list.common, ]
179 | 
180 |     # Fix the NAs in reference panel and scale
181 |     if (sum(is.na(genotype.temp)) != 0) {
182 |         genotype.temp <- PatchUp(genotype.temp)
183 |     }
184 | 
185 |     genotype.temp <- scale(genotype.temp)
186 | 
187 |     # Trim ss
188 |     ss.temp <- ss[ss$snp %in% list.common, ]
189 | 
190 |     # Trim snps and wgt.matrix
191 |     index.temp <- snps$SNP %in% list.common
192 | 
193 |     snps       <- snps[index.temp, ]
194 |     out.weight <- out.weight[index.temp, ]
195 |     if ("numeric" %in% class(out.weight)) {
196 |         out.weight <- t(as.matrix(out.weight))
197 |     }
198 | 
199 |     rm(index.temp)
200 | 
201 |     # Re-arrange every data sets
202 |     m.1 <- match(bim.temp$SNP, ss.temp$snp)
203 |     m.2 <- match(bim.temp$SNP, snps$SNP)
204 | 
205 |     ss.temp    <- ss.temp[m.1, ]
206 |     snps       <- snps[m.2, ]
207 |     out.weight <- out.weight[m.2, ]
208 | 
209 |     # Align the mismatched alleles
210 |     problem <- ss.temp$a1 != snps$a1
211 |     ss.temp$a1 <- snps$a1
212 |     ss.temp$a2 <- snps$a2
213 |     ss.temp$beta[problem] <- -1 * ss.temp$beta[problem]
214 |     ss.temp$z[problem] <- -1 * ss.temp$z[problem]
215 | 
216 |     # Compute LD matrix
217 |     matrix.LD <- t(genotype.temp) %*% genotype.temp / (nrow(genotype.temp) - 1)
218 | 
219 |     # Catch: When there is only one row in wgt.matrix
220 |     if ("numeric" %in% class(out.weight)) {
221 |         out.weight <- out.weight %>% as.matrix() %>% t() %>% as.data.frame()
222 |     }
223 | 
224 |     # Catch: Over-fitting
225 |     for (qwe in 1:5) {
226 |         if (max(abs(out.weight[, qwe])) >= 10) {
227 |             out.weight[, qwe] <- 0
228 |         }
229 |     }
230 | 
231 |     # Iteration by method
232 |     for (w in 1:ncol(out.weight)) {
233 |         # Settings
234 |         weights <- out.weight[, w]
235 | 
236 |         # Skip if weight is a zero vector
237 |         if (sum(weights) == 0) {
238 |             update[5 + w] <- NA
239 |             next
240 |         }
241 | 
242 |         # Keep the non-zero components of weights vector
243 |         keep <- (weights != 0)
244 |         weights <- weights[keep]
245 | 
246 |         # Compute TWAS z-score, r2, and p-value
247 |         z.twas  <- as.numeric(weights %*% ss.temp$z[keep])
248 |         r2.twas <- as.numeric(weights %*% matrix.LD[keep, keep] %*% weights)
249 | 
250 |         update[5 + w]  <- 2 * (pnorm(abs(z.twas / sqrt(r2.twas)), lower.tail = F))
251 |         update[10 + w] <- z.twas
252 |     }
253 | 
254 |     # ACAT
255 |     check.na    <- !is.na(update[6:10])
256 |     check.sign  <- out.r2[1, ] > 0
257 | 
258 |     check.final <- check.na & check.sign
259 | 
260 |     if (sum(check.final) == 0) {
261 |         update[16] <- NA
262 |     } else {
263 |         update[16] <- ACAT(update[6:10][check.final], out.r2[1, check.final] / sum(out.r2[1, check.final]))
264 |     }
265 | 
266 |     # Stop tracking runtime
267 |     time.end <- proc.time()[3]
268 | 
269 |     #################
270 |     # Output format #
271 |     #################
272 |     # 1.     Gene symbol
273 |     # 2.     ENSG id
274 |     # 3.     Chromosome
275 |     # 4.     Best model
276 |     # 5.     Best model's R^2 on testing data (Skipped)
277 |     # 6-10.  P-value of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
278 |     # 11-15. Z-score of TWAS from weights constructed by (SCAD, ElNet, LASSO, MCP, and MNet)
279 |     # 16.    P-value from ACAT
280 |     # 17.    Gene position
281 |     # 18.    Runtime
282 | 
283 |     # Update
284 |     update[1]  <- snps$GeneSymbol[1]
285 |     update[2]  <- snps$Gene[1]
286 |     update[3]  <- chr
287 |     update[4]  <- names(model.best)
288 |     update[17] <- snps$GenePos[1]
289 |     update[18] <- time.end - time.start
290 | 
291 |     out[nrow(out) + 1, ] <- update
292 | }
293 | 
294 | # Write the result
295 | dir.create(paste0(path.out, trait))
296 | write.table(out, file = paste0(path.out, trait, "/", trait, "-", id.job), row.names = FALSE, quote = FALSE)
297 | 


--------------------------------------------------------------------------------