# Repository layout:
# ├── 1c_imporFG.R
# ├── 3_extract_all_outcomes.R
# ├── 1aa_SDtimmers.R
# ├── 1ee_importpressure.R
# ├── 1b_importeGFR.R
# ├── 1a_importlotta.R
# ├── 1bb_importeGFRSD.R
# ├── 7a_writetext.R
# ├── 6_create_supplementary_tables.R
# ├── 2_import_instrument.R
# ├── 4_harmonise_and_mrresult.R
# └── 5_create_plots_figures.R
#
# /1c_imporFG.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Import the fasting-glucose summary statistics and register them as a new
# entry in the server VCF collection via GagnonMR::formattovcf_createindex().
library(data.table)
library(GagnonMR)
library(tidyverse)

setwd("/mnt/sda/gagelo01/Projects/small_MR_exploration/fg_BMI")

# Raw summary statistics; a1/a2 are renamed to the allele roles used below.
fg_sumstats <- fread("Data/Raw/FG_combined_1000G_density_formatted_21-03-29.txt.gz")
setnames(
  fg_sumstats,
  old = c("a1", "a2"),
  new = c("other_allele", "effect_allele")
)

# 1000G reference table handed to the formatter as `traduction`
# (presumably used to fill in the columns passed as NULL below; verify
# against GagnonMR). EUR values of exactly 0 or 1 are moved to 0.001 / 0.999.
rsid_map <- fread("/mnt/sda/couchr02/1000G_Phase3/1000G_Phase3_b37_rsid_maf.txt")
rsid_map[, EUR := ifelse(EUR == 0, 0.001, ifelse(EUR == 1, 0.999, EUR))]
rsid_map[, maf := NULL]

# NOTE(review): `units` mentions "FI" while the trait is Fasting_Glucose;
# confirm the intended unit string before relying on it downstream.
GagnonMR::formattovcf_createindex(
  all_out = fg_sumstats,
  snp_col = "rsid",
  outcome_name = "Fasting_Glucose",
  beta_col = "beta",
  se_col = "se",
  pval_col = "p-value",
  eaf_col = NULL,
  effect_allele_col = "effect_allele",
  other_allele_col = "other_allele",
  ncase_col = NULL,
  ncontrol_col = NULL,
  samplesize_col = "n",
  chr_col = NULL,
  pos_col = NULL,
  units = "natural logarithm transformed FI measured in pmol/L",
  traduction = rsid_map,
  out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
  df_index = fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt"),
  group_name = "public",
  year = 2021,
  author = "Lagou V",
  consortium = "MAGIC",
  sex = "Males and Females",
  population = "European",
  initial_build = "HG19/GRCh37",
  category = "Trait",
  pmid = 33402679,
  note = NA,
  should_create_id = TRUE,
  ID = NA
)

message("This script finished without error")
# --------------------------------------------------------------------------------
# /3_extract_all_outcomes.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Setup for the outcome extraction: load the GWAS indexes, the clumped
# instruments and build the list of outcome VCF paths to query.
library(data.table)
library(tidyverse)
library(GagnonMR)
library(furrr)

setwd("/mnt/sda/gagelo01/Projects/Dysbiose_project")
gwasvcf::set_bcftools()
gwasvcf::set_plink()
ldref <- "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs"

# Index of locally built VCFs and the MR-Base catalogue, the latter restricted
# to the ids that have a directory under MRBase_vcf/.
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
ao <- fread("/mnt/sda/gagelo01/Vcffile/available_outcomes_2021-10-13.txt")
mrbase_dirs <- list.files("/mnt/sda/gagelo01/Vcffile/MRBase_vcf/")
ao_small <- ao[id %in% mrbase_dirs]
ao_small[pmid == 32203549, unit := "SD"]

inst_clump <- fread("Data/Modified/inst_clump.txt")

# Outcome datasets, split by where their VCF lives.
ID_mrbase_out <- c(
  "ieu-b-109", "ieu-b-110", "ieu-b-111", "ukb-b-19953", "ukb-b-12141"
)
outcomes_mrbase <- paste0(
  "/mnt/sda/gagelo01/Vcffile/MRBase_vcf/", ID_mrbase_out, "/", ID_mrbase_out, ".vcf.gz"
)
ID_server_out <- c(
  "trait-2-2", "trait-6-1", "trait-7-2", "trait-2-4", "trait-12-2",
  "dis-2-1", "dis-3-1", "dis-4-1", "dis-5-1", "dis-6-1", "dis-7-1", "dis-8-1",
  "trait-13-1", "trait-13-2"
)
outcomes_server <- paste0(
  "/mnt/sda/gagelo01/Vcffile/Server_vcf/", ID_server_out, "/", ID_server_out, ".vcf.gz"
)

# Parallel backend for the extraction that follows.
options(future.globals.maxSize = 5e9)
plan(multisession, workers = 9, gc = TRUE) # I should try using multicore

# Single-outcome smoke test, kept for reference:
# test <- gwasvcf::query_gwas(vcf = c(outcomes_mrbase, outcomes_server)[1], rsid = unique(inst_clump$SNP),
#                             proxies = "yes", bfile = ldref)
# test %>%
#   gwasglue::gwasvcf_to_TwoSampleMR(., "outcome") %>%
#   data.table::as.data.table(.)
# Query every outcome VCF for the clumped instruments (LD proxies allowed,
# r2 >= 0.8) and stack the results in TwoSampleMR "outcome" format.
outcome_all <- future_map(
  as.list(c(outcomes_mrbase, outcomes_server)),
  function(x, rsiid = unique(inst_clump$SNP)) {
    # Tool paths are set again inside each worker session.
    gwasvcf::set_bcftools()
    gwasvcf::set_plink()
    gwasvcf::query_gwas(
      vcf = x,
      rsid = rsiid,
      proxies = "yes",
      bfile = "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs",
      tag_r2 = 0.8
    ) %>%
      gwasglue::gwasvcf_to_TwoSampleMR(., "outcome") %>%
      data.table::as.data.table(.)
  },
  .options = furrr_options(seed = TRUE)
) %>%
  rbindlist(., fill = TRUE)

# Overwrite the sample size of these MR-Base outcomes with the catalogue value.
add_n <- c("ieu-b-109", "ieu-b-110", "ieu-b-111")
for (outcome_id in add_n) {
  n_reported <- ao_small[id %in% outcome_id, sample_size]
  if (length(n_reported) != 1L) {
    stop(
      "Expected exactly one catalogue row for ", outcome_id,
      ", found ", length(n_reported),
      call. = FALSE
    )
  }
  outcome_all[outcome %in% outcome_id, samplesize.outcome := n_reported]
}

# The two UKB outcomes come back upper-cased; lower-case them so they match
# the catalogue ids used for the unit lookup below.
outcome_all[, outcome := outcome %>% ifelse(grepl("UKB-b-19953|UKB-b-12141", .), tolower(.), .)]

# Unit lookup: server outcomes are keyed by trait name, MR-Base ones by id.
k <- ao_small[id %in% ID_mrbase_out, .(id, unit)]
setnames(k, "id", "trait")
theunits <- rbind(df_index[id %in% ID_server_out, .(trait, unit)], k)
setnames(theunits, "unit", "units.outcome")

# merge() is an inner join: report outcomes that would silently disappear.
unmatched_outcomes <- setdiff(unique(outcome_all$outcome), theunits$trait)
if (length(unmatched_outcomes) > 0) {
  warning(
    "Outcomes without a unit are dropped by the merge: ",
    paste(unmatched_outcomes, collapse = ", "),
    call. = FALSE
  )
}
mirge <- merge(outcome_all, theunits, by.x = "outcome", by.y = "trait")
mirge[outcome %in% "Stanzick_eGFR", units.outcome := "SD"]
fwrite(mirge, "Data/Modified/outcome_all")

print("this script finished without errors")
# --------------------------------------------------------------------------------
# /1aa_SDtimmers.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Create an SD-standardised copy (`new_id`) of an existing server VCF
# (`ancient_id`): estimate the trait SD with coloc:::sdY.est, rescale beta and
# SE, write the new VCF and register it in the server index.
library(data.table)
library(GagnonMR)
library(tidyverse)


gwasvcf::set_bcftools()
gwasvcf::set_plink()
ldref <- "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs"
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")

############ only change this section
ancient_id <- "trait-7-1"
new_id <- "trait-7-2"
###################

# Show the source row (follows `ancient_id` rather than a hard-coded id).
df_index[id == ancient_id, ]

ancient_vcf <- paste0("/mnt/sda/gagelo01/Vcffile/Server_vcf/", ancient_id, "/", ancient_id, ".vcf.gz")

# Estimate the phenotype SD from the instruments' SE, minor allele frequency
# and sample size.
inst <- get_inst(vcffile = ancient_vcf)
sd_y <- coloc:::sdY.est(
  vbeta = (inst$se.exposure)^2,
  maf = inst[, ifelse(eaf.exposure > 0.5, 1 - eaf.exposure, eaf.exposure)],
  n = inst$samplesize.exposure
)

sd_y

# Index row for the standardised dataset, cloned from the source row.
newrow <- df_index[id == ancient_id, ]
stopifnot(nrow(newrow) == 1L)
newrow[, id := new_id]
newrow[, note := "standardised with coloc"]
newrow[, unit := "SD"]
# Replace any row left by a previous run instead of appending a duplicate.
df_index <- rbind(df_index[!(id %in% new_id)], newrow)

# Rescale the full summary statistics to SD units.
vcf <- VariantAnnotation::readVcf(ancient_vcf)
tsmr <- vcf %>%
  gwasglue::gwasvcf_to_TwoSampleMR(.) %>%
  as.data.table(.)
tsmr[, beta.exposure := beta.exposure / sd_y]
tsmr[, se.exposure := se.exposure / sd_y]

GagnonMR::formattovcf_createindex(
  all_out = tsmr,
  snp_col = "SNP",
  outcome_name = newrow[, trait],
  beta_col = "beta.exposure",
  se_col = "se.exposure",
  pval_col = "pval.exposure",
  eaf_col = "eaf.exposure",
  effect_allele_col = "effect_allele.exposure",
  other_allele_col = "other_allele.exposure",
  ncase_col = NULL,
  ncontrol_col = NULL,
  samplesize_col = "samplesize.exposure",
  chr_col = "chr.exposure",
  pos_col = "pos.exposure",
  units = newrow$unit,
  traduction = NULL,
  out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
  df_index = df_index,
  group_name = newrow$group_name,
  year = newrow$year,
  author = newrow$author,
  consortium = newrow$consortium,
  sex = newrow$sex,
  population = newrow$population,
  initial_build = newrow$initial_build,
  category = newrow$category,
  pmid = newrow$pmid,
  note = newrow$note,
  should_create_id = FALSE,
  ID = new_id
)
fwrite(df_index, "/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
message("this script finished without errors")

# --------------------------------------------------------------------------------
# /1ee_importpressure.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# For every MR-Base dataset from pmid 30224653, build an SD-standardised copy
# in the server VCF collection (ids trait-13-<i>) and register it in the index.
library(data.table)
library(GagnonMR)
library(tidyverse)

gwasvcf::set_bcftools()
gwasvcf::set_plink()
ldref <- "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs"

ao <- fread("/mnt/sda/gagelo01/Vcffile/available_outcomes_2021-10-13.txt")
ao_small <- ao[id %in% list.files("/mnt/sda/gagelo01/Vcffile/MRBase_vcf/")]
vec_id <- ao_small[pmid == 30224653, id]

## 1000G reference table handed to the formatter; EUR values of exactly 0 or 1
## are moved to 0.001 / 0.999.
traduction <- fread("/mnt/sda/couchr02/1000G_Phase3/1000G_Phase3_b37_rsid_maf.txt")
traduction[, EUR := EUR %>% ifelse(. == 0, 0.001, .) %>% ifelse(. == 1, 0.999, .)]
traduction[, maf := NULL]

# seq_along() keeps the loop from running when no dataset matches.
for (i in seq_along(vec_id)) {
  # Re-read the index every iteration: it is rewritten at the end of the loop body.
  df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
  vcffile <- paste0("/mnt/sda/gagelo01/Vcffile/MRBase_vcf/", vec_id[i], "/", vec_id[i], ".vcf.gz")

  # Estimate the trait SD from the variants in one chromosome-1 window.
  inst <- gwasvcf::query_gwas(vcffile, chrompos = "1:20000-1200000") %>%
    gwasglue::gwasvcf_to_TwoSampleMR(.) %>%
    as.data.table(.)
  sd_y <- coloc:::sdY.est(
    vbeta = (inst$se.exposure)^2,
    maf = inst[, ifelse(eaf.exposure > 0.5, 1 - eaf.exposure, eaf.exposure)],
    n = inst$samplesize.exposure
  )

  ## Full summary statistics, rescaled to SD units.
  tsmr <- VariantAnnotation::readVcf(vcffile) %>%
    gwasglue::gwasvcf_to_TwoSampleMR(.) %>%
    as.data.table(.)

  tsmr[, beta.exposure := beta.exposure / sd_y]
  tsmr[, se.exposure := se.exposure / sd_y]
  tsmr[, chr.exposure := as.integer(chr.exposure)]

  # Index row derived from the MR-Base catalogue entry, reshaped to the
  # columns of the server index.
  newrow <- ao_small[id == vec_id[i]]
  newrow[, id := paste0("trait-13-", i)]
  newrow[, trait := gsub(" ", "_", trait)]
  newrow[, unit := "SD"]
  setnames(newrow, "build", "initial_build")
  newrow[, category := "Trait"]
  newrow[, note := paste0("standardised with coloc initially in mmHg with SD ", sd_y)]
  colinclude <- colnames(df_index)
  newrow <- newrow[, ..colinclude]
  # Replace any row left by a previous run instead of appending a duplicate.
  df_index <- rbind(df_index[!(id %in% newrow$id)], newrow)

  GagnonMR::formattovcf_createindex(
    all_out = tsmr,
    snp_col = "SNP",
    outcome_name = newrow[, trait],
    beta_col = "beta.exposure",
    se_col = "se.exposure",
    pval_col = "pval.exposure",
    eaf_col = "eaf.exposure",
    effect_allele_col = "effect_allele.exposure",
    other_allele_col = "other_allele.exposure",
    ncase_col = NULL,
    ncontrol_col = NULL,
    samplesize_col = "samplesize.exposure",
    chr_col = "chr.exposure",
    pos_col = "pos.exposure",
    units = "SD",
    traduction = traduction,
    out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
    df_index = df_index,
    group_name = "public",
    year = newrow$year,
    author = newrow$author,
    consortium = newrow$consortium,
    sex = newrow$sex,
    population = newrow$population,
    initial_build = newrow$initial_build,
    category = newrow$category,
    pmid = newrow$pmid,
    note = newrow$note,
    should_create_id = FALSE,
    ID = newrow$id
  )

  fwrite(df_index, "/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
}

message("This script finished without error")

# --------------------------------------------------------------------------------
# /1b_importeGFR.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Import the Wuttke and Stanzick eGFR summary statistics into the server VCF
# collection (new ids are created by the formatter).
library(data.table)
library(GagnonMR)
library(tidyverse)
library(tictoc)
library(furrr)

gwasvcf::set_bcftools()
gwasvcf::set_plink()
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
ao <- fread("/mnt/sda/gagelo01/Vcffile/available_outcomes_2021-10-13.txt")
# Printed for inspection: European MR-Base traits mentioning "glomerular".
ao[grepl("glomerular", tolower(trait))][population == "European"]
setwd("/mnt/sda/gagelo01/Projects/Dysbiose_project")


wuttke <- fread("Data/Raw/20171017_MW_eGFR_overall_EA_nstud42.dbgap.txt.gz")
stanzick <- fread("Data/Raw/metal_eGFR_meta_ea1.TBL.map.annot.gc.gz")

# 1000G reference table handed to the formatter; EUR values of exactly 0 or 1
# are moved to 0.001 / 0.999.
traduction <- fread("/mnt/sda/couchr02/1000G_Phase3/1000G_Phase3_b37_rsid_maf.txt")
traduction[, EUR := EUR %>% ifelse(. == 0, 0.001, .) %>% ifelse(. == 1, 0.999, .)]
traduction[, maf := NULL]


GagnonMR::formattovcf_createindex(
  all_out = wuttke,
  snp_col = "RSID",
  outcome_name = "wuttke_eGFR",
  beta_col = "Effect",
  se_col = "StdErr",
  pval_col = "P-value",
  eaf_col = "Freq1",
  effect_allele_col = "Allele1",
  other_allele_col = "Allele2",
  ncase_col = NULL,
  ncontrol_col = NULL,
  samplesize_col = "n_total_sum",
  chr_col = "Chr",
  pos_col = "Pos_b37",
  units = "log(eGFR)",
  traduction = traduction,
  out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
  df_index = df_index,
  group_name = "public",
  year = 2019,
  author = "Wuttke Matthias",
  consortium = "CKDGEN",
  sex = "Males and Females",
  population = "European",
  initial_build = "HG19/GRCh37",
  category = "Trait",
  pmid = 31152163,
  note = "",
  should_create_id = TRUE,
  ID = NA
)

# Reload the index before the second import (presumably the call above
# rewrites it when should_create_id = TRUE; verify against GagnonMR).
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")


GagnonMR::formattovcf_createindex(
  all_out = stanzick,
  snp_col = "RSID",
  outcome_name = "Stanzick_eGFR",
  beta_col = "Effect",
  se_col = "StdErr",
  pval_col = "P.value",
  eaf_col = "Freq1",
  effect_allele_col = "Allele1",
  other_allele_col = "Allele2",
  ncase_col = NULL,
  ncontrol_col = NULL,
  samplesize_col = "n",
  chr_col = "chr",
  pos_col = "pos",
  units = "log(eGFR)",
  traduction = traduction,
  out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
  df_index = df_index,
  group_name = "public",
  year = 2021,
  author = "Stanzick Kira",
  consortium = "CKDGEN UKBbiobank",
  sex = "Males and Females",
  population = "European",
  initial_build = "HG19/GRCh37",
  category = "Trait",
  pmid = 34272381,
  note = "",
  should_create_id = TRUE,
  ID = NA
)

message("This script finished without errors")
# --------------------------------------------------------------------------------
# /1a_importlotta.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Import the Lotta et al. metabolite GWAS (one file per metabolite) into the
# server VCF collection under ids met-2-<k>, in parallel.
library(data.table)
library(GagnonMR)
library(tidyverse)
library(tictoc)
library(furrr)

gwasvcf::set_bcftools()
gwasvcf::set_plink()
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
setwd("/mnt/sda/gagelo01/Projects/Dysbiose_project")

met_lab <- fread("/home/couchr02/Mendel_Commun/Christian/GWAS/proteome/Lotta/metabolites_label.txt")
colnames(met_lab) <- c("namefile", "abbreviation", "fullname", "class")

# convert label
traduction <- fread("/mnt/sda/couchr02/1000G_Phase3/1000G_Phase3_b37_rsid_maf.txt")
traduction[, EUR := EUR %>% ifelse(. == 0, 0.001, .) %>% ifelse(. == 1, 0.999, .)]
traduction[, maf := NULL]

lotta_consortium <- "Fenland, EPIC-Norfolk (Metabolon), INTERVAL (Metabolon), INTERVAL (Nightingale), Kettunen et al. 2016 (Nightingale), Draisma et al. 2015 (Biocates p150), Shin et al.2014 (Twins-UK, Metabolon), Shin et al. 2014 (KORA, Metbolon)"

# One index row per metabolite label.
# NOTE(review): group_name ("Public") and population ("European") here differ
# from the values passed to the formatter below ("public", "Mix"); confirm
# which is intended.
newrow <- data.table(
  id = paste0("met-2-", seq_len(nrow(met_lab))),
  trait = met_lab$abbreviation,
  group_name = "Public", year = 2021,
  author = "Lotta A, Luca",
  consortium = lotta_consortium,
  sex = "Males and Females", population = "European", unit = "SD",
  nsnp = "~10000000", sample_size = ">10000", initial_build = "HG19/GRCh37", category = "Metabolites",
  pmid = 33414548, sd = 1,
  note = met_lab$fullname,
  ncase = NA, ncontrol = NA
)

# Replace rows left by a previous run instead of appending duplicates.
df_index <- rbind(df_index[!(id %in% newrow$id)], newrow)

lotta_ID <- df_index[pmid == "33414548", ]$id

# Format one metabolite GWAS into a VCF.
#   lotta_id   index id (met-2-<k>) of the metabolite to convert
#   traduction 1000G reference table passed through to the formatter
#   df_index   server index holding the met-2-* rows
#   met_lab    label table mapping abbreviation -> result file name
format_wrapper <- function(lotta_id, traduction, df_index, met_lab) {
  namefile <- met_lab[abbreviation == df_index[id == lotta_id & pmid == "33414548", trait], namefile]
  data <- fread(paste0("/home/couchr02/Mendel_Commun/Christian/GWAS/proteome/Lotta/Results/", namefile, ".txt.gz"))

  GagnonMR::formattovcf_createindex(
    all_out = data,
    snp_col = "rsid",
    outcome_name = df_index[id == lotta_id, trait],
    beta_col = "Beta",
    se_col = "SE",
    pval_col = "Pvalue_MA",
    eaf_col = "Freq1_MA",
    effect_allele_col = "Allele1",
    other_allele_col = "Allele2",
    ncase_col = NULL,
    ncontrol_col = NULL,
    samplesize_col = "Weight_MA",
    chr_col = "chr",
    pos_col = "pos",
    units = "SD",
    traduction = traduction,
    out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
    df_index = df_index,
    group_name = "public",
    year = 2021,
    author = "Lotta A, Luca",
    consortium = "Fenland, EPIC-Norfolk (Metabolon), INTERVAL (Metabolon), INTERVAL (Nightingale), Kettunen et al. 2016 (Nightingale), Draisma et al. 2015 (Biocates p150), Shin et al.2014 (Twins-UK, Metabolon), Shin et al. 2014 (KORA, Metbolon)",
    sex = "Males and Females",
    population = "Mix",
    initial_build = "HG19/GRCh37",
    category = "Metabolites",
    pmid = 33414548,
    note = df_index[id == lotta_id, note],
    should_create_id = FALSE,
    ID = lotta_id
  )
}


options(future.globals.maxSize = 1e10)
plan(multisession, workers = 8)

tic()

# Only convert metabolites that do not have an output directory yet. Matching
# by id (not by position in 1:173) stays correct if the number or order of
# metabolites changes.
done_dirs <- list.files("/mnt/sda/gagelo01/Vcffile/Server_vcf")
pending_ids <- setdiff(lotta_ID, done_dirs)

future_map(
  pending_ids,
  function(x) {
    format_wrapper(lotta_id = x, traduction = traduction, df_index = df_index, met_lab = met_lab)
  },
  .options = furrr_options(seed = TRUE)
)

fwrite(df_index, "/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")

message("this script finished without errors")

toc()

# --------------------------------------------------------------------------------
# /1bb_importeGFRSD.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Create SD-standardised copies of the two eGFR datasets:
#   trait-12-1 (Stanzick) -> trait-12-2, trait-11-1 (Wuttke) -> trait-11-2.
# The raw files are read with relative paths, so this assumes the working
# directory is the Dysbiose_project root (no setwd() here).
library(data.table)
library(GagnonMR)
library(tidyverse)


gwasvcf::set_bcftools()
gwasvcf::set_plink()
ldref <- "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs"
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
df_index[trait == "Stanzick_eGFR", ]

traduction <- fread("/mnt/sda/couchr02/1000G_Phase3/1000G_Phase3_b37_rsid_maf.txt")
traduction[, EUR := EUR %>% ifelse(. == 0, 0.001, .) %>% ifelse(. == 1, 0.999, .)]
traduction[, maf := NULL]

###### Stanzick: trait-12-1 -> trait-12-2

inst <- get_inst(vcffile = "/mnt/sda/gagelo01/Vcffile/Server_vcf/trait-12-1/trait-12-1.vcf.gz")
sd_y <- coloc:::sdY.est(
  vbeta = (inst$se.exposure)^2,
  maf = inst[, ifelse(eaf.exposure > 0.5, 1 - eaf.exposure, eaf.exposure)],
  n = inst$samplesize.exposure
)

sd_y

# Clone the source row only: after a first run two rows share this trait name.
newrow <- df_index[trait == "Stanzick_eGFR" & id == "trait-12-1", ]
stopifnot(nrow(newrow) == 1L)
newrow[, id := "trait-12-2"]
newrow[, note := "standardised with coloc"]
newrow[, unit := "SD"]
df_index <- rbind(df_index[!(id %in% "trait-12-2")], newrow)

stanzick <- fread("Data/Raw/metal_eGFR_meta_ea1.TBL.map.annot.gc.gz")

stanzick[, Effect := Effect / sd_y]
stanzick[, StdErr := StdErr / sd_y]

# Study metadata mirrors the Stanzick import in 1b_importeGFR.R; units are SD
# because beta and SE were just rescaled.
GagnonMR::formattovcf_createindex(
  all_out = stanzick,
  snp_col = "RSID",
  outcome_name = "Stanzick_eGFR",
  beta_col = "Effect",
  se_col = "StdErr",
  pval_col = "P.value",
  eaf_col = "Freq1",
  effect_allele_col = "Allele1",
  other_allele_col = "Allele2",
  ncase_col = NULL,
  ncontrol_col = NULL,
  samplesize_col = "n",
  chr_col = "chr",
  pos_col = "pos",
  units = "SD",
  traduction = traduction,
  out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
  df_index = df_index,
  group_name = "public",
  year = 2021,
  author = "Stanzick Kira",
  consortium = "CKDGEN UKBbiobank",
  sex = "Males and Females",
  population = "European",
  initial_build = "HG19/GRCh37",
  category = "Trait",
  pmid = 34272381,
  note = "Standardised with coloc",
  should_create_id = FALSE,
  ID = "trait-12-2"
)
fwrite(df_index, "/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")

###### Wuttke: trait-11-1 -> trait-11-2

inst <- get_inst(vcffile = "/mnt/sda/gagelo01/Vcffile/Server_vcf/trait-11-1/trait-11-1.vcf.gz")
sd_y <- coloc:::sdY.est(
  vbeta = (inst$se.exposure)^2,
  maf = inst[, ifelse(eaf.exposure > 0.5, 1 - eaf.exposure, eaf.exposure)],
  n = inst$samplesize.exposure
)

sd_y

newrow <- df_index[trait == "wuttke_eGFR" & id == "trait-11-1", ]
stopifnot(nrow(newrow) == 1L)
newrow[, id := "trait-11-2"]
newrow[, note := "standardised with coloc"]
newrow[, unit := "SD"]
df_index <- rbind(df_index[!(id %in% "trait-11-2")], newrow)

wuttke <- fread("Data/Raw/20171017_MW_eGFR_overall_EA_nstud42.dbgap.txt.gz")
wuttke[, Effect := Effect / sd_y]
wuttke[, StdErr := StdErr / sd_y]

# Study metadata mirrors the Wuttke import in 1b_importeGFR.R.
GagnonMR::formattovcf_createindex(
  all_out = wuttke,
  snp_col = "RSID",
  outcome_name = "wuttke_eGFR",
  beta_col = "Effect",
  se_col = "StdErr",
  pval_col = "P-value",
  eaf_col = "Freq1",
  effect_allele_col = "Allele1",
  other_allele_col = "Allele2",
  ncase_col = NULL,
  ncontrol_col = NULL,
  samplesize_col = "n_total_sum",
  chr_col = "Chr",
  pos_col = "Pos_b37",
  units = "SD",
  traduction = traduction,
  out_wd = "/mnt/sda/gagelo01/Vcffile/Server_vcf",
  df_index = df_index,
  group_name = "public",
  year = 2019,
  author = "Wuttke Matthias",
  consortium = "CKDGEN",
  sex = "Males and Females",
  population = "European",
  initial_build = "HG19/GRCh37",
  category = "Trait",
  pmid = 31152163,
  note = "Standardise with coloc",
  should_create_id = FALSE,
  ID = "trait-11-2"
)

fwrite(df_index, "/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")

message("This script finished without errors")
# --------------------------------------------------------------------------------
# /7a_writetext.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Load the analysis outputs and define the formatting helpers used to produce
# the numbers quoted in the manuscript text. Several inputs use relative
# paths, so this assumes the working directory is the project root.
library(data.table)
library(tidyverse)
library(TwoSampleMR)
library(MendelianRandomization)
library(GagnonMR)



########################## descriptive statistics ###################################
inst_clump <- fread("Data/Modified/inst_clump.txt")
outcome_all <- fread("Data/Modified/outcome_all")
harm_all <- fread("Data/Modified/harm_all.txt")
harm_all <- harm_all[exposure != "willer_LDL-cholesterol:ieu-a-300", ]

# Primary MR results with the study label attached and a 95% CI added;
# `primary_df` is the same table without the LDL-cholesterol exposure.
primary_df_full <- fread("/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Primary/primary_df")
primary_df_full <- merge(
  primary_df_full,
  distinct(harm_all[, .(exposure, study)]),
  by = "exposure",
  all.x = TRUE
)
primary_df_full[, lci := b - se * 1.96]
primary_df_full[, uci := b + se * 1.96]
primary_df <- primary_df_full[exposure != "willer_LDL-cholesterol:ieu-a-300"]
primary_df[, exposure_outcome := paste0(exposure, "_", outcome)]


# Number of harmonised rows per exposure-outcome pair, smallest first.
harm_all[, .N, by = "exposure_outcome"][order(N)]
list_sensitivity <- readRDS("Data/Modified/Sensitivity/list_sensitivity")
veclog <- readRDS("Data/Modified/Sensitivity/veclog")
MVMR_dat <- fread("/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Sensitivity/MVMR")

#####
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
ao <- fread("/mnt/sda/gagelo01/Vcffile/available_outcomes_2021-10-13.txt")
ID_mrbase_out <- c("ieu-b-109", "ieu-b-110", "ieu-b-111", "ukb-b-19953", "ukb-b-12141")
# Same outcome list as 3_extract_all_outcomes.R ("trait-12-2" was listed twice).
ID_server_out <- c(
  "trait-2-2", "trait-6-1", "trait-7-2", "trait-2-4", "trait-12-2",
  "dis-2-1", "dis-3-1", "dis-4-1", "dis-5-1", "dis-6-1", "dis-7-1", "dis-8-1",
  "trait-13-1", "trait-13-2"
)


k <- df_index[id %in% ID_server_out, ]
m <- ao[id %in% ID_mrbase_out, ]

dataset <- rbind(k, m, fill = TRUE)
dataset <- dataset[, .(trait, group_name, year, author, consortium, sex, population, unit, nsnp, sample_size, ncase, ncontrol, pmid, note)]
######
# Format rows of `data` as "<exp(b)> 95% CI=<exp(lci)>-<exp(uci)>, p=<pval>".
# Expects columns b, lci, uci and pval; returns one string per row.
return_format_data <- function(data) {
  data[, paste0(round(exp(b), digits = 2), " 95% CI=", round(exp(lci), digits = 2), "-", round(exp(uci), digits = 2), ", p=", pval %>% formatC(., format = "e", digits = 1))]
}
# Same as return_format_data() but without exponentiating b, lci and uci.
return_format_data_noexp <- function(data) {
  data[, paste0(round(b, digits = 2), " 95% CI=", round(lci, digits = 2), "-", round(uci, digits = 2), ", p=", pval %>% formatC(., format = "e", digits = 1))]
}
#included
# exposure: number of distinct exposures, outcomes and exposure-outcome pairs
length(unique(harm_all$exposure))
length(unique(harm_all$outcome))
length(unique(harm_all$exposure_outcome))

# Study labels of the two exposure families quoted in the text.
metabolite_study_labels <- c("sanna", "kettunen", "framingham", "lotta")
microbiota_study_labels <- c("kurilshikov", "ruhlemann")

# Abstract
harm_all[study %in% metabolite_study_labels, length(unique(exposure))]
harm_all[study %in% microbiota_study_labels, length(unique(exposure))]
sum(veclog)
harm_all[study != "willer", length(unique(exposure_outcome))]
dt_sen <- rbindlist(list_sensitivity[veclog])
dt_sen[method == "Inverse variance weighted" & abs(b) < 0.1, .N]

## Intro

# Results
# para 1
harm_all[, length(unique(exposure))]
dataset[population != "European", ]
harm_all[study %in% metabolite_study_labels, length(unique(exposure))]
harm_all[study %in% microbiota_study_labels, length(unique(exposure))]
harm_all[, length(unique(exposure))]

# para 2
harm_all[study %in% metabolite_study_labels, length(unique(exposure_outcome))]
primary_df[study %in% metabolite_study_labels, mean(abs(b))]
primary_df[study %in% metabolite_study_labels & pval < 0.05, .N]
primary_df[study %in% metabolite_study_labels & fdr < 0.05, .N]
# LDL-cholesterol effect on CAD and longevity, formatted for the text.
ldl_rows <- primary_df_full[
  exposure == "willer_LDL-cholesterol:ieu-a-300" &
    outcome %in% c("van_der_Harst_CAD", "Deelen_longevity")
]
return_format_data(ldl_rows)

# para 3
harm_all[study %in% c("kurilshikov", "ruhlemann"), length(unique(exposure))]
primary_df[study %in% c("kurilshikov", "ruhlemann"), length(unique(exposure_outcome))]
primary_df[study %in% c("kurilshikov", "ruhlemann"), mean(abs(b))]
primary_df[study %in% c("kurilshikov", "ruhlemann") & pval < 0.05, .N]
primary_df[study %in% c("kurilshikov", "ruhlemann") & fdr < 0.05, .N]

# Para 4
primary_df[pval < 0.05, .N]
length(list_sensitivity)
sum(veclog)
harm_all[exposure_outcome %in% names(list_sensitivity)[veclog], all(steiger_dir)]
harm_all[exposure_outcome %in% names(list_sensitivity)[veclog], any(is_in_pleiotropic_region)]

dtsen <- rbindlist(list_sensitivity[veclog])
dtsen[method == "Inverse variance weighted", ][abs(b) < 0.1, .N]

dtsen[method == "Inverse variance weighted" & abs(b) > 0.2, ] %>%
  return_format_data(.)


# para7
# For every exposure-outcome pair, express each multivariable estimate as a
# ratio to the estimate of the "no counfounder" model of the same pair.
MVMR_dat[, b := as.numeric(b)]
MVMR_dat[, exposure_outcome := paste0(exposure, "_", outcome)]
datfull <- rbindlist(lapply(
  split(MVMR_dat, MVMR_dat$exposure_outcome),
  function(dat) {
    reference <- dat[MVMR == "no counfounder", b]
    # A missing or duplicated reference row would otherwise fail obscurely
    # or be silently recycled.
    if (length(reference) != 1L) {
      stop(
        "Expected exactly one 'no counfounder' row for ",
        dat$exposure_outcome[1], ", found ", length(reference),
        call. = FALSE
      )
    }
    # Direct numeric division; no row-wise apply over a character matrix.
    dat[, comparison := b / reference]
    dat
  }
))

# Estimates attenuated below 60% of the reference, ignoring the BMI-adjusted
# model when the outcome is ukb-b-19953.
attenuated <- datfull[comparison < 0.6 & !(outcome == "ukb-b-19953" & MVMR == "with BMI"), ]
attenuated[, .(exposure, outcome, b, comparison, MVMR)]
attenuated[exposure == "lotta_Serotonin" & outcome == "van_der_Harst_CAD"] %>%
  return_format_data(.)
attenuated[exposure == "kurilshikov_order.Lactobacillales.id.1800"] %>%
  return_format_data(.)


###### Discussion
primary_df[fdr < 0.05, ]
dtsen[method == "Inverse variance weighted" & abs(b) > 0.2, ]
dtsen[method == "Inverse variance weighted" & abs(b) > 0.1, ]

# comparison with other studies
primary_df[exposure %in% "framingham_trimethylamine_N_oxide" & outcome %in% "van_der_Harst_CAD", ] %>%
  return_format_data(.)

# Method
# Exposure-outcome pairs analysed versus the full exposures x outcomes grid.
primary_df[, length(unique(exposure_outcome))]
suppose <- length(unique(harm_all$outcome)) * harm_all[study != "willer", length(unique(exposure))]

suppose - primary_df[, length(unique(exposure_outcome))]

# --------------------------------------------------------------------------------
# /6_create_supplementary_tables.R:
# --------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Assemble Supplementary Tables 1-7 and write them to one Excel workbook.
library(data.table)
library(readxl)
library(TwoSampleMR)
library(tidyverse)
library(writexl)
library("xlsx")
library(GagnonMR)

setwd("/home/gagelo01/workspace/Projects/Dysbiose_project")
harm_all <- fread("Data/Modified/harm_all.txt")
dataset_info <- read_excel("Data/Modified/informationondataused.xlsx")
expinf <- readxl::read_excel(path = "Data/Modified/exposures_cohort_info.xlsx")
setDT(expinf)
######## Supplementary table titles and description
dt_title <- data.table(
  title = paste0("Supplementary Table ", 1:7),
  caption = c(
    "Description of the datasets used.",
    "Description of the metabolites selection rational.",
    "Summary of the instrument selection criteria for each data source. LD R2, pvalue treshold and Fstatistics.",
    "Harmonised dataset for each exposure outcome.",
    "Primary MR results and associated statistics.",
    "Robust MR results and other sensitivity analyses results.",
    "Multivariable MR results and associated statistics."
  )
)
######## Supplementary table with information (Supplementary Table 1)
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")
ao <- fread("/mnt/sda/gagelo01/Vcffile/available_outcomes_2021-10-13.txt")
ao_small <- ao[id %in% list.files("/mnt/sda/gagelo01/Vcffile/MRBase_vcf/")]
ID_server_out <- c(
  "trait-2-2", "trait-6-1", "trait-7-2", "trait-2-4", "trait-12-2",
  "dis-2-1", "dis-3-1", "dis-4-1", "dis-5-1", "dis-6-1", "dis-7-1", "dis-8-1",
  "trait-13-1", "trait-13-2"
)
ID_mrbase_out <- c("ieu-b-109", "ieu-b-110", "ieu-b-111", "ukb-b-19953", "ukb-b-12141")

outcome_table <- rbind(ao[id %in% ID_mrbase_out, ], df_index[id %in% ID_server_out], fill = TRUE)
# The filter belongs in `i`; with a leading comma it was parsed as `j` and the
# assignment as `by`.
outcome_table[author == "Richardson, Tom", unit := "SD"]

# Outcome category derived from the dataset id.
dis <- c("ukb-b-12141", "dis-2-1", "dis-3-1", "dis-4-1", "dis-5-1", "dis-6-1", "dis-7-1", "dis-8-1")
longlife <- c("trait-6-1", "trait-7-2")
outcome_table[, category := ifelse(
  id %in% dis, "Disease",
  ifelse(id %in% longlife, "Mortality", "Metabolic risk factor")
)]
outcome_table[, variable := "outcome"]
outcome_table <- outcome_table[, .(
  variable, category, trait, group_name, year, author, consortium,
  sex, population, unit, nsnp,
  sample_size, pmid, note, ncase, ncontrol
)]
outcome_table <- outcome_table[order(category, trait)]

# Exposure rows: cohort information from the spreadsheet plus one summary row
# for the Lotta metabolites taken from the server index.
expinf[trait == "Microbial relative abundance", trait := paste0(trait, " (", tolower(author), ")")]
expinf[, c("sex", "unit") := .("Males and Females", "SD")]
expinf[, category := ifelse(author %in% c("Kurilshikov", "Ruhlemann"), "Taxa abundance", "metabolites")]
lotta_tobind <- df_index[pmid == "33414548", ][1, ]
lotta_tobind[, trait := c("serotonin, leucine, isoleucine, valine, kynurenine")]
lotta_tobind[, category := "metabolites"]
lotta_tobind[, note := NULL]
lotta_tobind <- lotta_tobind[, .(
  trait, group_name, year, author, consortium,
  sex, population, unit, nsnp, category,
  sample_size, pmid, ncase, ncontrol
)]
# Named exposure_table so that base::exp() is not shadowed.
exposure_table <- rbind(lotta_tobind, expinf, fill = TRUE)
exposure_table[, variable := "exposure"]

suptab1 <- rbind(outcome_table, exposure_table, fill = TRUE)
# Drop bookkeeping columns that are present (absent ones are skipped quietly).
drop_cols <- intersect(c("group_name", "id", "initial_build", "sd", "nsnp"), names(suptab1))
if (length(drop_cols) > 0) {
  suptab1[, (drop_cols) := NULL]
}

# Download URLs, matched to the rows of suptab1 BY POSITION.
dataset_urls <- c(
  "https://datashare.is.ed.ac.uk/handle/10283/3203",
  "https://ctg.cncr.nl/software/summary_statistics",
  "http://diagram-consortium.org/downloads.html",
  "https://www.megastroke.org/download.html",
  "https://www.ebi.ac.uk/gwas/publications/34841290",
  "https://gwas.mrcieu.ac.uk/files/ukb-b-12141/ukb-b-12141.vcf.gz",
  "http://ckdgen.imbi.uni-freiburg.de/",
  "https://data.mendeley.com/datasets/gbbsrpx6bs/1",
  "https://gwas.mrcieu.ac.uk/files/ukb-b-19953/ukb-b-19953.vcf.gz",
  "https://magicinvestigators.org/downloads/",
  "https://magicinvestigators.org/downloads/",
  "https://www.ebi.ac.uk/gwas/publications/32203549",
  "https://www.ebi.ac.uk/gwas/publications/32203549",
  "https://ckdgen.imbi.uni-freiburg.de/",
  "https://www.ebi.ac.uk/gwas/publications/30224653",
  "https://www.ebi.ac.uk/gwas/publications/30224653",
  "https://www.ebi.ac.uk/gwas/publications/32203549",
  "https://www.longevitygenomics.org/downloads",
  "https://datashare.ed.ac.uk/handle/10283/3209",
  "https://omicscience.org/apps/crossplatform/",
  "https://static-content.springer.com/esm/art%3A10.1038%2Fs41588-019-0350-x/MediaObjects/41588_2019_350_MOESM1_ESM.pdf",
  "https://www.cell.com/action/showFullTableHTML?isHtml=true&tableId=tbl2&pii=S1550-4131%2813%2900257-X",
  "http://www.computationalmedicine.fi/data#NMR_GWAS",
  "https://www.nature.com/articles/s41588-020-00747-1",
  "https://mibiogen.gcc.rug.nl/"
)
# Fail loudly if the table and the URL list drift apart, rather than letting
# the URLs be recycled or misaligned.
stopifnot(length(dataset_urls) == nrow(suptab1))
dattrait <- data.frame(trait = suptab1[, trait], url = dataset_urls)

suptab1 <- cbind(suptab1, url = dattrait$url)
suptab1[, note := gsub("standardised with coloc", "standardised with coloc:::sdY.est function", note)]

suptab1 <- merge(suptab1, dataset_info, by = "trait")
# Supplementary Table 2
suptab2 <- readxl::read_excel(path = "Data/Modified/tables_metabolites.xlsx") %>%
  as.data.table()

# Supplementary Table 3
suptab3 <- readxl::read_excel(path = "Data/Modified/instrumentselection.xlsx") %>%
  as.data.table()
setnames(suptab3, "min F-statistics", "F.statistics")
suptab3[, F.statistics := 10]
# Supplementary Table 4
harm_all <- fread("Data/Modified/harm_all_clean.txt")
harm_all[, c("id.exposure", "id.outcome", "outcome", "exposure") := NULL]
setnames(harm_all, c("exposure_clean", "outcome_clean", "Category"), c("exposure", "outcome", "exposure_category"))
suptab4 <- harm_all
# Supplementary Table 5
primary_df <- fread("Data/Modified/primary_df_clean.txt")
harm_all <- fread("Data/Modified/harm_all_clean.txt")
# Mean F-statistic per exposure-outcome pair.
# NOTE(review): as before, the mean is taken over the DISTINCT F-statistic
# values of each pair, not over every SNP row; confirm this is intended.
fstat_by_pair <- unique(harm_all[, .(exposure, outcome, fstat.exposure)])[
  , .(fstat.exposure = mean(fstat.exposure)),
  by = .(exposure, outcome)
]
fstat_by_pair
primary_df <- merge(
  primary_df,
  fstat_by_pair,
  by = c("exposure", "outcome")
)
primary_df[, c("id.exposure", "id.outcome", "outcome", "exposure") := NULL]
setnames(primary_df, c("exposure_clean", "outcome_clean", "Category"), c("exposure", "outcome", "exposure_category"))
# `power` is not reported in the supplementary table, so it is not selected.
primary_df <- primary_df[, .(
  exposure_category, outcome_category, exposure, outcome, method, nsnp,
  b, se, pval, fdr, fstat.exposure
)]
primary_df <- primary_df[order(exposure_category, outcome_category, exposure, outcome), ]
suptab5 <- primary_df
# Supplementary table 6
list_sensitivity <- readRDS("Data/Modified/Sensitivity/list_sensitivity")
veclog <- readRDS("Data/Modified/Sensitivity/veclog")
suptab6 <- rbindlist(list_sensitivity)

# Supplementary Table 7
MVMR_dat <- fread("Data/Modified/MVMR_clean.txt")
MVMR_dat[, c("outcome", "exposure") := NULL]
setnames(
  MVMR_dat,
  c("exposure_clean", "outcome_clean", "Category", "F_stastistics"),
  c("exposure", "outcome", "exposure_category", "conditional_F_stastistics")
)
# `nsnp` is selected once (it was listed twice, which duplicated the column).
MVMR_dat <- MVMR_dat[, .(
  exposure_category, outcome_category, exposure, outcome, method, nsnp,
  b, se, lci, uci, pval, MVMR, cochranQ, cochranQpval, conditional_F_stastistics
)]
MVMR_dat <- MVMR_dat[order(exposure_category, outcome_category, exposure, outcome), ]
suptab7 <- MVMR_dat


# One sheet per table, preceded by the sheet of titles and captions.
writexl::write_xlsx(
  x = list(
    "Tables captions and titles" = dt_title,
    "Supplementary Table 1" = suptab1,
    "Supplementary Table 2" = suptab2,
    "Supplementary Table 3" = suptab3,
    "Supplementary Table 4" = suptab4,
    "Supplementary Table 5" = suptab5,
    "Supplementary Table 6" = suptab6,
    "Supplementary Table 7" = suptab7
  ),
  path = "Results/supplementary_tables.xlsx"
)
--------------------------------------------------------------------------------
/2_import_instrument.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# First part of the instrument import: session setup, then the raw instrument
# tables for Sanna, Framingham, Kettunen and Lotta, and the two Ruhlemann
# spreadsheets that are processed further down the script.
library(data.table)
library(tidyverse)
library(TwoSampleMR)
library(MendelianRandomization)
library(GagnonMR)
library(readxl)

setwd("/mnt/sda/gagelo01/Projects/Dysbiose_project")
gwasvcf::set_bcftools()
gwasvcf::set_plink()
ldref <- "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs"
df_index <- fread("/mnt/sda/gagelo01/Vcffile/server_gwas_id.txt")

# ---- Sanna ----
inst_sanna <- fread("Data/Raw/inst_sanna")
inst_sanna[, units.exposure := "SD"]

# Attach chromosome and position to the Sanna SNPs through the rsid lookup.
rsid_traduction <- fread("/mnt/sda/couchr02/rsids/rsid_pos_86millions.txt")
rsid_traduction[, chr_pos := paste0(chr, "_", pos)]
all_out <- rsid_traduction[rsid %in% inst_sanna$SNP]
# The lookup columns are selected by position (1st, 3rd, 2nd).
mirge <- merge(
  inst_sanna,
  all_out[, c(1, 3, 2)],
  by.x = "SNP",
  by.y = "rsid",
  all.x = TRUE
)
setnames(mirge, c("pos", "chr"), c("pos.exposure", "chr.exposure"))
inst_sanna <- mirge

# ---- Framingham ----
# 1000 Genomes lookup table; EUR values of exactly 0 or 1 are replaced by
# 0.001 and 0.999, and the maf column is dropped.
traduction <- fread("/mnt/sda/couchr02/1000G_Phase3/1000G_Phase3_b37_rsid_maf.txt")
traduction[, EUR := EUR %>% ifelse(. == 0, 0.001, .) %>% ifelse(. == 1, 0.999, .)]
traduction[, maf := NULL]
framingham_pheno <- c("indole_3_propionate", "trimethylamine_N_oxide") # carnitine is also

# Read one sheet of Data/Raw/mmc2.xlsx and format it as TwoSampleMR exposure
# data.
#   framingham_pheno: sheet name; also stored as the phenotype label.
# Returns the formatted exposure table as a data.table.
# Relies on the global lookup table `traduction`.
format_framingham <- function(framingham_pheno) {
  hits <- read_excel("Data/Raw/mmc2.xlsx", sheet = framingham_pheno)
  setDT(hits)
  hits <- hits[Chr != "X", ]
  hits[, Chr := as.integer(Chr)]

  aligned <- merge(hits, traduction, by.x = c("rsID"), by.y = c("rsid"), all = FALSE)
  setnames(aligned, c("min_all", "maj_all"), c("effect_allele", "other_allele"))
  # Author note: the sheet coordinates are thought to be GRCh36, so they are
  # removed in favour of the lookup table's coordinates.
  aligned[, c("Chr", "PhysPos") := NULL]

  # Keep variants whose allele pair is exactly {a0, a1}. Author note: only a
  # few variants are removed; data are coded on the forward strand.
  aligned <- aligned[
    (effect_allele == a0 | effect_allele == a1) &
      (other_allele == a0 | other_allele == a1) &
      a0 != a1 &
      effect_allele != other_allele,
  ]
  # Express every effect relative to a1.
  aligned[effect_allele == a0, beta := beta * -1]
  aligned[effect_allele == a0, effect_allele := a1]
  aligned[other_allele == a1, other_allele := a0]
  # Author note: eaf is not estimated from the GWAS itself because it is old.
  aligned[, eaf := ifelse(EUR < 0.5, MAF, 1 - MAF)]

  aligned[, Phenotype := framingham_pheno]
  aligned[, units := "SD"]
  aligned[, samplesize := 2076]

  inst <- TwoSampleMR::format_data(
    aligned,
    type = "exposure",
    phenotype_col = "Phenotype",
    snp_col = "rsID",
    beta_col = "beta",
    se_col = "se",
    effect_allele_col = "effect_allele", # author verified: effect allele
    other_allele_col = "other_allele",   # author verified: reference allele
    eaf_col = "eaf",
    pval_col = "pval",
    units_col = "units",
    samplesize_col = "samplesize",
    chr_col = "chr",
    pos_col = "position"
  )
  setDT(inst)
  inst
}

list_framingham <- pmap(data.frame(framingham_pheno = framingham_pheno), format_framingham)
inst_framingham <- rbindlist(list_framingham, fill = TRUE)
inst_framingham <- inst_framingham[pval.exposure < 1 * 10^-5, ]

# ---- Kettunen ----
kettunen_pheno <- c("Ace")

# Read one Kettunen summary-statistics file, keep rows with p-value below
# 1e-5 and format them as TwoSampleMR exposure data.
#   kettunen_pheno: metabolite code used in the file name and as phenotype.
# Returns the formatted exposure table as a data.table.
format_kettunen <- function(kettunen_pheno) {
  sumstats <- fread(paste0("Data/Raw/Summary_statistics_MAGNETIC_kettunen_", kettunen_pheno, ".txt.gz"))
  setnames(sumstats, "p-value", "p_value")
  sumstats <- sumstats[p_value < 1 * 10^-5, ]
  sumstats[, Phenotype := kettunen_pheno]
  sumstats[, units := "SD"]

  formatted <- format_data(
    sumstats,
    type = "exposure",
    phenotype_col = "Phenotype",
    snp_col = "ID",
    beta_col = "beta",
    se_col = "se",
    eaf_col = "eaf",
    effect_allele_col = "EA",
    other_allele_col = "NEA",
    pval_col = "p_value",
    units_col = "units",
    ncase_col = "ncase",
    ncontrol_col = "ncontrol",
    samplesize_col = "n_samples",
    chr_col = "chromosome",
    pos_col = "position"
  )
  setDT(formatted)
  formatted
}

list_kettunen <- pmap(data.frame(kettunen_pheno = kettunen_pheno), format_kettunen)
inst_kettunen <- rbindlist(list_kettunen, fill = TRUE)

# ---- Lotta ----
# Index entries of pmid 33414548 whose trait is Ile/Leu/Val or whose note is
# Serotonin/Kynurenine.
vec_id <- df_index[pmid == "33414548", ][
  trait %in% c("Ile", "Leu", "Val") | note %in% c("Serotonin", "Kynurenine"),
]$id

vec_vcffile <- paste0("/mnt/sda/gagelo01/Vcffile/Server_vcf/", vec_id, "/", vec_id, ".vcf.gz")

inst_lotta <- rbindlist(
  lapply(vec_vcffile, function(vcf_path) {
    get_inst(vcffile = vcf_path, pval = 1e-6, clump = FALSE, r2 = 0.01, kb = 10000)
  }),
  fill = TRUE
)
inst_lotta[, units.exposure := "SD"]

# ---- Ruhlemann ----
ruhlemann <- read_excel("Data/Raw/41588_2020_747_MOESM3_ESM.xlsx", sheet = 4, range = "A3:CN10003")
setDT(ruhlemann)
ruhlemann <- ruhlemann[META.P < 1 * 10^-6, ]

# Key used below to relabel the Ruhlemann exposures.
key_exposure <- read_excel("Data/Raw/41588_2020_747_MOESM3_ESM.xlsx", sheet = 1, range = "A4:C283")
setDT(key_exposure)
# Give every Ruhlemann hit a "feature(taxonomy)" label, drop the taxa that are
# already covered by Kurilshikov, then attach the lookup-table columns by
# chromosome and position.
setnames(key_exposure, "Feature name", "feature")

# Spell out the one-letter rank prefixes of the Taxonomy strings.
rank_prefix <- c("K_", "P_", "C_", "O_", "F_", "G_")
rank_label <- c("kingdom-", "phylum-", "class-", "order-", "family-", "genus-")
key_exposure[, Taxonomy := mgsub::mgsub(Taxonomy, pattern = rank_prefix, rank_label)]

# sub() only rewrites the first "_" of each feature name; the hit table has
# its "NB_" prefix removed beforehand.
key_exposure[, feature := sub("_", "-", feature)]
ruhlemann[, feature := sub("_", "-", sub("NB_", "", feature))]

ruhlemann_tax <- merge(
  ruhlemann,
  key_exposure[, c("feature", "Taxonomy")],
  by = "feature",
  all.x = TRUE
)

# Printed checks: distinct taxonomies per feature, distinct features per
# taxonomy, and the total number of distinct taxonomies.
ruhlemann_tax[, Taxonomy %>% unique %>% length, by = "feature"]$V1 %>% table
ruhlemann_tax[, feature %>% unique %>% length, by = "Taxonomy"]$V1 %>% table
ruhlemann_tax$Taxonomy %>% unique(.) %>% length(.)

# Author decision: the exposure stays "feature", with the lower-cased
# Taxonomy appended in parentheses.
ruhlemann_tax[, Taxonomy := tolower(Taxonomy)]
ruhlemann_tax[, feature_taxonomy := paste0(feature, "(", Taxonomy, ")")]
ruhlemann <- ruhlemann_tax

# Remove Ruhlemann exposures already evaluated in Kurilshikov: the Kurilshikov
# taxon names are normalised (".id..." suffix removed, lower case, "." -> "-",
# "--" -> "-") before being compared with Taxonomy.
ku_bac <- read_excel("Data/Raw/41588_2020_763_MOESM3_ESM.xlsx", sheet = 3, skip = 2)
setDT(ku_bac)
ku <- ku_bac[Quant == TRUE, Taxon.name] %>%
  unique() %>%
  gsub("\\.id..*", "", .) %>%
  tolower() %>%
  gsub("\\.", "-", .) %>%
  gsub("--", "-", .)
ruhlemann <- ruhlemann[!(Taxonomy %in% ku), ]

# Add rsid and allele frequency; author note: genome assembly hg19 (GRCh37).
mirge <- merge(
  ruhlemann,
  traduction,
  by.x = c("chrom", "pos"),
  by.y = c("chr", "position"),
  all = FALSE
)
# ---- Ruhlemann: align alleles with the lookup table and format ----
all_out <- mirge
all_out[, units := "SD"]

setnames(all_out, c("A1", "A2"), c("other_allele", "effect_allele"))
# Keep variants whose allele pair is exactly {a0, a1}. Author note: only a few
# variants are removed; data are coded on the forward strand.
all_out <- all_out[
  (effect_allele == a0 | effect_allele == a1) &
    (other_allele == a0 | other_allele == a1) &
    a0 != a1 &
    effect_allele != other_allele,
]
# Express every effect relative to a1.
all_out[effect_allele == a0, META.BETA := META.BETA * -1]
all_out[effect_allele == a0, effect_allele := a1]
all_out[other_allele == a1, other_allele := a0]
all_out[, eaf := ifelse(EUR < 0.5, META.MAF, 1 - META.MAF)]

# Author note: allele columns follow Supplementary Table 2 of the source paper.
ruhlemann_inst <- TwoSampleMR::format_data(
  all_out,
  type = "exposure",
  phenotype_col = "feature_taxonomy",
  snp_col = "rsid",
  beta_col = "META.BETA",
  se_col = "META.se",
  eaf_col = "eaf",
  effect_allele_col = "effect_allele",
  other_allele_col = "other_allele",
  pval_col = "META.P",
  units_col = "units",
  samplesize_col = "META.N",
  id_col = "id",
  chr_col = "chrom",
  pos_col = "pos"
)

setDT(ruhlemann_inst)

inst_ruhlemann <- ruhlemann_inst

# ---- Kurilshikov ----
kurilshikov <- fread("Data/Raw/MBG.allHits.p1e4.txt")
# P-value pre-filter. The threshold must be 1e-6 (not 1e6, which every
# p-value satisfies): it is the same cut-off that is applied to pval.exposure
# once the alleles have been aligned, so applying it here only avoids
# formatting and merging rows that are discarded later.
kurilshikov <- kurilshikov[P.weightedSumZ < 1e-6, ]

# eaf_col is deliberately not supplied, so format_data() falls back to its
# default; eaf.exposure is filled from the lookup table after the merge.
inst_kurilshikov <- TwoSampleMR::format_data(
  kurilshikov,
  type = "exposure",
  phenotype_col = "bac",
  snp_col = "rsID",
  beta_col = "beta",
  se_col = "SE",
  effect_allele_col = "eff.allele",
  other_allele_col = "ref.allele",
  pval_col = "P.weightedSumZ",
  samplesize_col = "N",
  chr_col = "chr",
  pos_col = "bp"
)

setDT(inst_kurilshikov)
inst_kurilshikov[, units.exposure := "SD"]
all_out <- merge(
  inst_kurilshikov,
  traduction,
  by.x = c("chr.exposure", "pos.exposure"),
  by.y = c("chr", "position"),
  all = FALSE
)
# ---- Kurilshikov: align alleles with the lookup table ----
setDT(all_out)
all_out <- all_out[
  (effect_allele.exposure == a0 | effect_allele.exposure == a1) &
    (other_allele.exposure == a0 | other_allele.exposure == a1) &
    a0 != a1 &
    effect_allele.exposure != other_allele.exposure,
]
all_out <- all_out[chr.exposure %in% 1:22, ]
all_out[, chr.exposure := as.integer(chr.exposure)]
# Express every effect relative to a1.
all_out[effect_allele.exposure == a0, beta.exposure := beta.exposure * -1]
all_out[effect_allele.exposure == a0, effect_allele.exposure := a1]
all_out[other_allele.exposure == a1, other_allele.exposure := a0]
all_out[, eaf.exposure := EUR]

all_out[, c("rsid", "a0", "a1", "EUR") := NULL]
all_out <- all_out[pval.exposure < 1e-6, ]
inst_kurilshikov <- all_out

# ---- Prefix exposure names with their study and use them as ids ----
list_inst <- list(inst_sanna, inst_framingham, inst_kettunen, inst_lotta, inst_ruhlemann, inst_kurilshikov)
study_name <- c("sanna", "framingham", "kettunen", "lotta", "ruhlemann", "kurilshikov")
names(list_inst) <- study_name
for (i in seq_along(list_inst)) {
  inst <- list_inst[[i]]
  inst$exposure <- paste0(study_name[i], "_", inst$exposure)
  inst$id.exposure <- inst$exposure
  list_inst[[i]] <- inst
}

dt_inst_noselect <- rbindlist(list_inst, fill = TRUE)
fwrite(dt_inst_noselect, "Data/Modified/dt_inst_noselect.txt")

# ---- LDL cholesterol (GLGC consortium) ----
ldl <- GagnonMR::get_inst(
  "/mnt/sda/gagelo01/Vcffile/MRBase_vcf/ieu-a-300/ieu-a-300.vcf.gz",
  pval = 5e-8,
  r2 = 0.01
)
ldl[, exposure := "willer_LDL-cholesterol:ieu-a-300"]
ldl[, units.exposure := "SD"]
ldl[, data_source.exposure := NULL]
ldl[, id.exposure := exposure]

# ---- Bind all studies by column name ----
dt_inst <- rbindlist(c(list_inst, list(ldl)), use.names = TRUE, fill = TRUE)
# Printed checks for missing units / sample sizes.
dt_inst[is.na(units.exposure), ]
dt_inst[is.na(samplesize.exposure), ]
dt_inst[, c("chrompos", "CHROM", "POS", "N_ALLELES", "N_CHR", "allele1", "freq1", "allele2", "freq2") := NULL]

# ---- Variance explained ----
dt_inst <- add_rsq(dt_inst)
setDT(dt_inst)
dt_inst[is.na(rsq.exposure), ] # printed check; author's note: "perfect"

# Split "study_name" from the rest of the exposure label.
dt_inst <- separate(dt_inst, col = "exposure", into = c("study", "microbiote"), sep = "_", remove = FALSE)

fwrite(dt_inst, "Data/Modified/all_inst.txt")

all_inst <- dt_inst

# ---- LD clumping, one exposure at a time ----
all_inst_split <- split(all_inst, all_inst$exposure)

inst_clump <- map(all_inst_split, function(x) {
  kept_rsid <- x %>%
    dplyr::select(rsid = SNP, pval = pval.exposure, id = id.exposure) %>%
    ieugwasr::ld_clump(
      .,
      clump_r2 = 0.01,
      plink_bin = genetics.binaRies::get_plink_binary(),
      bfile = ldref
    ) %>%
    {.$rsid}
  x[SNP %in% kept_rsid]
}) %>%
  rbindlist(., fill = TRUE)

# ---- F-statistic per exposure ----
inst_clump_split <- split(inst_clump, inst_clump$exposure)

# F-statistic of one exposure's instrument set.
#   dat: table with columns samplesize.exposure and rsq.exposure.
# Returns ((n - k - 1) / k) * (R2 / (1 - R2)) with k = number of rows,
# n = mean sample size and R2 = sum of rsq.exposure.
fstat_fromdat <- function(dat) {
  k <- nrow(dat)
  n <- mean(dat$samplesize.exposure)
  r2sum <- sum(dat$rsq.exposure)
  ((n - k - 1) / k) * (r2sum / (1 - r2sum))
}

# vapply() is required here: it yields a numeric vector. A list (as returned
# by lapply()) would turn fstat.exposure into a list column, mean() on it
# would return NA, and the F > 10 filter below would silently keep everything.
vec_fstat <- vapply(inst_clump_split, fstat_fromdat, numeric(1))
inst_clump$fstat.exposure <- unname(vec_fstat[match(inst_clump$exposure, names(vec_fstat))])
inst_clump[is.na(fstat.exposure) | fstat.exposure < 0, ] # printed check; author's note: "perfect"

# Keep exposures with mean F-statistic > 10, then those with >= 3 instruments.
exposure_to_remove <- inst_clump[, mean(fstat.exposure) > 10, by = "exposure"][V1 == FALSE, ]$exposure
inst_clump <- inst_clump[!(exposure %in% exposure_to_remove), ]
exposure_to_remove <- inst_clump[, .N >= 3, by = "exposure"][V1 == FALSE, ]$exposure
inst_clump <- inst_clump[!(exposure %in% exposure_to_remove), ]

fwrite(inst_clump, "Data/Modified/inst_clump.txt")

print("script finished without errors")

--------------------------------------------------------------------------------
/4_harmonise_and_mrresult.R:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
# Harmonises the clumped instruments with every outcome, runs the primary IVW
# analysis and its Steiger / pleiotropic-region variants, the robust
# sensitivity methods, the power calculation and the MVMR adjustment helpers.
library(data.table)
library(tidyverse)
library(TwoSampleMR)
library(MendelianRandomization)
library(GagnonMR)

setwd("/mnt/sda/gagelo01/Projects/Dysbiose_project")
gwasvcf::set_bcftools()
gwasvcf::set_plink()
ldref <- "/home/couchr02/Mendel_Commun/Christian/LDlocal/EUR_rs"
inst_clump <- fread("Data/Modified/inst_clump.txt")
outcome_all <- fread("Data/Modified/outcome_all")

# ---- Harmonise every exposure x outcome pair ----
arguments <- tidyr::crossing(
  exposure_name = unique(inst_clump$exposure),
  outcome_name = unique(outcome_all$outcome)
)
list_arguments <- split(arguments, seq_len(nrow(arguments)))
harm_all <- map(list_arguments, function(x) {
  # action = 1: alleles are taken to be on the forward strand. Author's note:
  # assumed to hold for the "rhee" data as well, because build 36 is coded on
  # the forward strand.
  harmonise_data(
    inst_clump[exposure == x$exposure_name, ],
    outcome_dat = outcome_all[outcome == x$outcome_name, ],
    action = 1
  )
}) %>%
  rbindlist(., fill = TRUE)

# harm_all1 <- harmonise_data(inst_clump[!(study %in% c("framingham", "IBD")), ], outcome_all, action = 1)
# harm_all2 <- harmonise_data(inst_clump[study %in% c("framingham", "IBD")], outcome_all, action = 2)
# harm_all <- rbindlist(list(harm_all1, harm_all2), fill = TRUE)
setDT(harm_all)
harm_all[, exposure_outcome := paste0(exposure, "_", outcome)]

# Remove every association with fewer than 3 usable genetic instruments.
harm_all <- harm_all[mr_keep == TRUE, ]
to_remove <- harm_all[, .N, by = "exposure_outcome"][N < 3, ]$exposure_outcome
harm_all <- harm_all[!(exposure_outcome %in% to_remove), ]
# ####remove ruhlemann exposure that are in kurilshikov
# harm_all[, taxa := tolower(exposure)]
# harm_all[study == c("kurilshikov"),taxa := taxa %>%gsub("kurilshikov_","",.)%>% gsub("\\.id..*", "", .) %>% gsub("\\.", "-", .) %>% gsub("--", "-", .)]
# harm_all[study == c("ruhlemann"), taxa := gsub("[\\(\\)]", "", regmatches(exposure, gregexpr("\\(.*?\\)", exposure)))]
# ku<-harm_all[study == c("kurilshikov"),taxa]
# ru<-harm_all[study == c("ruhlemann"), taxa ]
# to_remove <- harm_all[study == c("ruhlemann") & taxa %in% ru[(ru %in% ku)], unique(exposure)]
# harm_all <- harm_all[!(exposure %in% to_remove),]

# ---- Primary analysis ----
harm_all_split <- split(harm_all, harm_all$exposure_outcome)

# Run the primary MR estimate on one harmonised exposure-outcome table.
#   dat: harmonised data for a single exposure-outcome pair.
# Returns the result of TwoSampleMR::mr() with the "mr_ivw" method.
# Author's note: this is the multiplicative random-effects IVW estimate with
# the standard error corrected for under-dispersion; a fixed-effect variant
# ("mr_ivw_fe") for small instrument sets was considered but is not used, so
# the method no longer depends on the number of rows.
perform_primary_analysis <- function(dat) {
  TwoSampleMR::mr(dat, method_list = c("mr_ivw"))
}
primary_df <- map(harm_all_split, perform_primary_analysis) %>% rbindlist

setDT(primary_df)
willer_primary <- primary_df[exposure == "willer_LDL-cholesterol:ieu-a-300"]
primary_df <- primary_df[exposure != "willer_LDL-cholesterol:ieu-a-300"]

# Adjust for multiple testing (author's note: nothing comes out).
primary_df$fdr <- p.adjust(primary_df$pval, method = "fdr")
primary_df[order(fdr)]

primary_df[fdr < 0.05, ]
primary_df[, min(fdr)]

# ---- Steiger filtering ----
harm_all <- steiger_filtering(harm_all)
setDT(harm_all)
harm_all_steiger <- harm_all[harm_all$steiger_dir, ]
setDT(harm_all_steiger)
harm_all_steiger_split <- split(harm_all_steiger, harm_all_steiger$exposure_outcome)

primary_df_steiger <- map(harm_all_steiger_split, perform_primary_analysis) %>% rbindlist

# ---- Remove SNPs in the ABO, APOE and HLA-A gene regions ----
to_exclude <- c("APOE", "ABO", "HLA-A")
window <- 2e+06
gencode <- fread("/home/couchr02/Mendel_Commun/Nicolas/GTEx/gencode.v19.genes.v7.patched_contigs.txt")
# One row per gene: its span widened by window / 2 on each side.
region_list <- vector(mode = "list", length = length(to_exclude))
for (i in seq_along(to_exclude)) {
  bon <- gencode[gene_name == to_exclude[i], ]
  region_list[[i]] <- data.frame(
    chr = bon[1, ]$chr,
    start = min(bon$start) - window / 2,
    end = max(bon$end) + window / 2,
    gene_name = bon[1, ]$gene_name
  )
}
region_df <- rbindlist(region_list)
harm_all[, is_in_pleiotropic_region := FALSE]
for (i in seq_len(nrow(region_df))) {
  harm_all[
    (chr.exposure == region_df[i, ]$chr) &
      (pos.exposure >= region_df[i, ]$start) &
      (pos.exposure <= region_df[i, ]$end),
    is_in_pleiotropic_region := TRUE
  ]
}
harm_all_notpleio <- harm_all[is_in_pleiotropic_region == FALSE, ]
setDT(harm_all_notpleio)
harm_all_notpleio_split <- split(harm_all_notpleio, harm_all_notpleio$exposure_outcome)

primary_df_notpleio <- map(harm_all_notpleio_split, perform_primary_analysis) %>% rbindlist

fwrite(primary_df_steiger, "Data/Modified/primary_df_steiger.txt")
fwrite(primary_df_notpleio, "Data/Modified/primary_df_notpleio.txt")
fwrite(harm_all, "Data/Modified/harm_all.txt")

# ---- Robust MR methods on nominally significant pairs ----
setDT(harm_all)
primary_df[, exposure_outcome := paste0(exposure, "_", outcome)]
exposure_sign <- primary_df[pval < 0.05, ]$exposure_outcome
exposure_sign <- exposure_sign[!grepl("willer_LDL", exposure_sign)]
harm_all[, exposure_outcome := paste0(exposure, "_", outcome)]
harm_all[, id.exposure := exposure]
harm_all[, id.outcome := outcome]
list_harm_sign <- split(
  harm_all[exposure_outcome %in% exposure_sign],
  harm_all[exposure_outcome %in% exposure_sign]$exposure_outcome
)
list_sensitivity <- lapply(list_harm_sign, function(x) {
  GagnonMR::all_mr_methods(dat = x, Primary_mr = "random_underdispersion")
})

list_sensitivity <- map(list_sensitivity, function(x) {
  x[!(method %in% c("Weighted mode", "MR Egger", "Robust adjusted profile score (RAPS)")), ]
})

# TRUE when the Egger intercept p-value is below 0.05; FALSE means the
# intercept (average pleiotropy) is not significantly different from zero.
egger_intercept_test <- map_dbl(list_harm_sign, function(x) {
  mr_pleiotropy_test(x) %>%
    as.data.table(.) %>%
    .$pval
}) < 0.05
df_egger <- data.frame(
  exposure_outcome = names(egger_intercept_test),
  intercept_test = unlist(egger_intercept_test)
)

# Adds the key column by reference; the returned list is printed as before.
map(list_sensitivity, function(x) x[, exposure_outcome := paste0(exposure, "_", outcome)])
list_sensitivity <- map(list_sensitivity, function(x) {
  merge(x, df_egger, by = "exposure_outcome")
})

# veclog[i] is TRUE when pair i passes the outlier-robust, weighted-median,
# Egger-intercept and contamination-mixture checks.
veclog <- vector(mode = "logical", length = length(list_sensitivity))
toinclude <- FALSE # value used for checks that could not be evaluated
# seq_along() keeps the loop from running when no pair reached this stage.
for (i in seq_along(list_sensitivity)) {
  dat <- list_sensitivity[[i]]
  outi <- dat[method %in% c("MR-PRESSO (Outlier-corrected)", "IVW radial"), ]
  outlier_robust_val <- if (nrow(outi) == 0) toinclude else outi[, pval < 0.05]
  conta <- dat[method == "Contamination mixture", ]
  conta_val <- if (nrow(conta) == 0) toinclude else conta[, pval < 0.05]

  stat <- outlier_robust_val &
    dat[method == "Weighted median", pval < 0.05] &
    dat[1, intercept_test == FALSE] &
    conta_val

  # NOTE(review): if several rows match a method filter above, `stat` has
  # more than one element and only its first is stored - confirm intent.
  veclog[i] <- stat
  names(veclog)[i] <- dat$exposure_outcome %>% unique
}

saveRDS(veclog, "Data/Modified/Sensitivity/veclog")
saveRDS(list_sensitivity, "Data/Modified/Sensitivity/list_sensitivity")

# ---- Power analysis ----
list_harm <- split(harm_all, harm_all$exposure_outcome)

vec_power <- map_dbl(list_harm, function(x) power.calculator_fromdat_vishner(x, effect = 0.1))

primary_df[, exposure_outcome := paste0(exposure, "_", outcome)]
primary_df$power <- vec_power[match(primary_df$exposure_outcome, names(vec_power))]
primary_df <- rbindlist(list(primary_df, willer_primary), fill = TRUE)
fwrite(primary_df, "/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Primary/primary_df")

# ---- MVMR correcting for BMI and alcohol intake frequency ----
# Author's note: "half (3 out of 6) sensitivity analyses with p-value < 0.05".
# NOTE(review): veclog as computed above requires every check to pass.
harm_all_sen <- harm_all[exposure_outcome %in% names(veclog[veclog == TRUE]), ]
id_vec <- c("ukb-b-19953", "ukb-b-5779") # BMI, alcohol in UKB
list_harm_sen <- split(harm_all_sen, harm_all_sen$exposure_outcome)

# Multivariable MR of one exposure-outcome pair adjusted for one confounder.
#   dat: harmonised data of a single exposure-outcome pair.
#   id:  MRBase id of the confounder GWAS (its VCF is read from MRBase_vcf).
# Returns the MVMR result rows belonging to the exposure of `dat`.
# Relies on the globals `inst_clump` and `ldref`.
sensitivity_mvmr <- function(dat, id) {
  the_clump <- inst_clump[exposure == unique(dat$exposure), ]
  confound <- gwasvcf::query_gwas(
    vcf = paste0("/mnt/sda/gagelo01/Vcffile/MRBase_vcf/", id, "/", id, ".vcf.gz"),
    rsid = dat$SNP,
    proxies = "yes",
    bfile = ldref
  ) %>%
    gwasglue::gwasvcf_to_TwoSampleMR(., "exposure")

  inst_mvmr <- rbind(the_clump, confound, fill = TRUE)
  df_instrument <- GagnonMR::prepare_for_mvmr(
    exposure_dat = inst_mvmr,
    d1 = inst_mvmr,
    clump_r2 = 0.01,
    clump_kb = 10000,
    harmonise_strictness = 1,
    pval_threshold = 1,
    clump_exp = NULL,
    should_clump = TRUE
  )

  # SNP plus every column whose name contains "outcome".
  outcome_names <- c("SNP", grep("outcome", colnames(dat), value = TRUE))
  outcome <- dat[, .SD, .SDcols = outcome_names]
  exposure_outcome_harmonized <- mv_harmonise_data(
    exposure_dat = df_instrument,
    outcome_dat = outcome,
    harmonise_strictness = 1
  )
  mvmr_results <- GagnonMR::mv_multiple_MendelianRandomization(
    exposure_outcome_harmonized = exposure_outcome_harmonized,
    only_IVW = TRUE
  )

  mvmr_results[exposure == unique(dat$exposure), ]
}

list_counfound_BMI <- map(list_harm_sen, function(x) sensitivity_mvmr(dat = x, id = id_vec[1]))
list_counfound_alcohol <- map(list_harm_sen, function(x) sensitivity_mvmr(dat = x, id = id_vec[2]))

# First row of each sensitivity table that passed every check.
no_counfound <- lapply(list_sensitivity[veclog], function(x) x[1, ]) %>% rbindlist(.)
# Stack the confounder-adjusted MVMR rows under the unadjusted row kept for
# each pair, label where each row comes from, and write the combined table.
withbmi <- rbindlist(list_counfound_BMI)
withalcohol <- rbindlist(list_counfound_alcohol)

no_counfound$MVMR <- "no counfounder"
no_counfound$method <- "IVW"
# setnames(no_counfound, c("exposure", "outcome", "b", "se", "lci","uci", "pval"),
#          c("Exposure", "Outcome", "Estimate", "StdError", "CILower", "CIUpper", "Pvalue"))
withbmi$MVMR <- "with BMI"
withalcohol$MVMR <- "with alcohol intake frequency"

MVMR <- rbindlist(
  list(
    no_counfound[, c("method", "exposure", "outcome", "b", "se", "lci", "uci", "pval", "MVMR", "nsnp")],
    withbmi,
    withalcohol
  ),
  fill = TRUE
)

MVMR <- MVMR[order(exposure, outcome), ]

fwrite(MVMR, "Data/Modified/Sensitivity/MVMR")

message("This script finished without errors")

#
# ########################## descriptive statistics ###################################
# harm_all <- fread( "Data/Modified/harm_all.txt")
# harm_all <- harm_all[exposure != "willer_LDL-cholesterol:ieu-a-300",]
# primary_df <- fread( "/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Primary/primary_df")
# primary_df <- primary_df[exposure != "willer_LDL-cholesterol:ieu-a-300" ]
# primary_df[, exposure_outcome := paste0(exposure, "_", outcome)]
# harm_all[, .N, by = "exposure_outcome"][order(N)]
# list_sensitivity <- readRDS( "Data/Modified/Sensitivity/list_sensitivity")
# vec5<- sapply(list_sensitivity, function(x) sum(x[, pval] < 0.05 )) # is there any
# #included exposure
# harm_all$exposure %>% unique %>% length
# harm_all$outcome %>% unique %>% length
# harm_all$exposure_outcome %>% unique %>% length
# #included microbial metabolites
# harm_all[study %in% c("sanna", "kettunen", "framingham"), length(unique(exposure))]
# harm_all[study %in% c("sanna", "kettunen", "framingham"), length(unique(exposure_outcome))]
# 
primary_df[exposure_outcome %in% harm_all[study %in% c("sanna", "kettunen", "framingham"), ]$exposure_outcome, ][,sum(pval < 0.05)] 230 | # 231 | # primary_df_full[exposure == "willer_LDL-cholesterol:ieu-a-300" & outcome %in% c("van_der_Harst_CAD", "Timmers_parental_lifespan") , ][,.(outcome,b, b-1.96*se, b+1.96*se, pval)] 232 | # 233 | # #included microbial taxa abundance 234 | # harm_all[study %in% c("kurilshikov", "ruhlemann"), length(unique(exposure))] 235 | # harm_all[study %in% c("kurilshikov", "ruhlemann"), length(unique(exposure_outcome))] 236 | # primary_df[exposure_outcome %in% harm_all[study %in% c("kurilshikov", "ruhlemann"), ]$exposure_outcome, ][,sum(pval < 0.05)] 237 | # 238 | # #general 239 | # primary_df[,length(unique(exposure_outcome))] 240 | # primary_df[,sum(pval < 0.05)] 241 | # primary_df[which.min(fdr),] 242 | # 243 | # #power 244 | # mean(primary_df$power) 245 | # sd(primary_df$power) 246 | # 247 | # #sensitivity 248 | # list_sensitivity <- readRDS( "Data/Modified/Sensitivity/list_sensitivity") 249 | # length(list_sensitivity) 250 | # sum(vec5>=4) 251 | # list_sensitivity[vec5 >= 4] 252 | # list_sensitivity$kettunen_Ile_NAFLD[method == "Inverse variance weighted", c(exp(b), exp(lci), exp(uci))] 253 | # list_sensitivity$kettunen_Ile_NAFLD 254 | # 255 | # ##mvmr 256 | # MVMR<-fread( "Data/Modified/Sensitivity/MVMR") 257 | # MVMR[, max(Pvalue), by = c("Exposure", "Outcome")] 258 | # 259 | # #positive findings 260 | # ok <- MVMR[, max(Pvalue), by = c("Exposure", "Outcome")] 261 | # dt_sensitivity <- rbindlist(list_sensitivity, fill = TRUE) 262 | # dt_sensitivity[exposure == ok[V1< 0.05,]$Exposure[1] & outcome == ok[V1< 0.05,]$Outcome[1],][method == "Inverse variance weighted",c( exposure, exp(b), exp(lci), exp(uci))] 263 | # dt_sensitivity[exposure == ok[V1< 0.05,]$Exposure[2] & outcome == ok[V1< 0.05,]$Outcome[2],][method == "Inverse variance weighted",c( exposure, exp(b), exp(lci), exp(uci))] 264 | # dt_sensitivity[exposure == ok[V1< 
# 0.05,]$Exposure[3] & outcome == ok[V1< 0.05,]$Outcome[3],][method == "Inverse variance weighted",c( exposure, exp(b), exp(lci), exp(uci))]
#
#
# dt_sensitivity[exposure == ok[V1> 0.05 & V1 < 0.1,]$Exposure[1] & outcome == ok[V1> 0.05 & V1 < 0.1,]$Outcome[1],][method == "Inverse variance weighted",c( exposure, exp(b), exp(lci), exp(uci))]
# dt_sensitivity[exposure == ok[V1> 0.05 & V1 < 0.1,]$Exposure[2] & outcome == ok[V1> 0.05 & V1 < 0.1,]$Outcome[2],][method == "Inverse variance weighted",c( exposure, b, lci, uci)]
#
# --------------------------------------------------------------------------------
# /5_create_plots_figures.R:
# --------------------------------------------------------------------------------
library(ckbplotr) # it has to be loaded at the beginning of the session
library(data.table)
library(readxl)
library(TwoSampleMR)
library(tidyverse)
library(GagnonMR)
library(ggpubr)
library(ggforestplot)
library(ggforce)

# creating report and plots
# clean primary_df
primary_df_full <- fread("/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Primary/primary_df")
inst_clump <- fread("Data/Modified/inst_clump.txt")


# Add display labels and grouping columns to a table of MR results.
#
# Columns added (or overwritten when they already exist), in this order:
#   exposure_clean   - readable exposure name (character)
#   outcome_clean    - readable outcome name (factor with a fixed display order;
#                      outcomes without a label become NA)
#   outcome_category - "Disease" or "Cardiometabolic trait"
#   Category         - exposure family (factor; exposures matching no rule
#                      become NA and trigger a warning)
# Rows are returned ordered by Category, outcome, exposure.
#
# Args:
#   primary_df_full: data.frame or data.table with columns `exposure` and
#     `outcome`. It is copied first, so the caller's object is never modified
#     by reference.
# Returns:
#   A data.table with the columns described above.
cleanify <- function(primary_df_full) {
  exposure_labels <- c(
    "framingham_betaine" = "Betaine",
    "framingham_carnitine" = "Carnitine",
    "framingham_choline" = "Choline",
    "framingham_indole_3_propionate" = "Indole-3-propionate",
    "kettunen_Ace" = "Acetate",
    "lotta_Ile" = "Isoleucine",
    "lotta_Leu" = "Leucine",
    "lotta_Serotonin" = "Serotonin",
    "lotta_Val" = "Valine",
    "lotta_PEA" = "Phenylethylamine",
    "lotta_Glu" = "Glutamate",
    "lotta_Asp" = "Aspartate",
    "lotta_C0" = "Carnitine",
    "lotta_Kynurenine" = "Kynurenine",
    "framingham_trimethylamine_N_oxide" = "Trimethylamine N-oxide (TMAO)",
    "sanna_fecal_propionate_levels" = "Fecal propionate",
    "sanna_PWY-5022" = "Pathway PWY-5022",
    "willer_LDL-cholesterol:ieu-a-300" = "LDL cholesterol"
  )

  outcome_labels <- c(
    "osteoporosis" = "Osteoporosis",
    "NAFLD" = "Non-alcoholic fatty liver disease",
    "van_der_Harst_CAD" = "Coronary artery disease",
    "Malik_Stroke" = "Ischemic stroke",
    "Mahajan_Type2diabetes" = "Type 2 diabetes",
    "Jansen_Alzheimer" = "Alzheimer's disease",
    "Timmers_parental_lifespan" = "Parental lifespan",
    "Wuttke_Chronic_kidney" = "Chronic kidney disease",
    "Deelen_longevity" = "Human longevity",
    "Howard_Depression" = "Depression",
    "diastolic_blood_pressure" = "Diastolic blood pressure",
    "Fasting_Glucose_standardised" = "Fasting glucose",
    "Fasting_Insulin" = "Fasting insulin",
    "ieu-b-109" = "HDL cholesterol",
    "ieu-b-110" = "LDL cholesterol",
    "ieu-b-111" = "Triglycerides",
    "Stanzick_eGFR" = "Estimated glomerular filtration rate",
    "systolic_blood_pressure" = "Systolic blood pressure",
    "ukb-b-12141" = "Osteoporosis",
    "ukb-b-19953" = "Body mass index"
  )

  disease_outcomes <- c(
    "Coronary artery disease", "Ischemic stroke", "Chronic kidney disease",
    "Type 2 diabetes", "Non-alcoholic fatty liver disease", "Osteoporosis",
    "Depression", "Alzheimer's disease", "Human longevity", "Parental lifespan"
  )
  trait_outcomes <- c(
    "HDL cholesterol", "LDL cholesterol", "Triglycerides",
    "Systolic blood pressure", "Diastolic blood pressure", "Fasting glucose",
    "Fasting insulin", "Estimated glomerular filtration rate", "Body mass index"
  )
  category_levels <- c(
    "Dysbiotic disease", "Plasma Metabolites", "Fecal Metabolites",
    "Microbial Pathway", "Microbe Abundance", "Positive control"
  )

  # Exact-match recoding; values without an entry are kept unchanged.
  # Going through as.character() first prevents factor input from being
  # replaced by its integer codes.
  recode_exact <- function(x, labels) {
    x <- as.character(x)
    hit <- match(x, names(labels))
    x[!is.na(hit)] <- unname(labels[hit[!is.na(hit)]])
    x
  }

  # Drop the leading "<study>_" prefix. Names without an underscore are
  # returned unchanged (indexing 2:length(x) would turn them into "NA_<name>").
  strip_study_prefix <- function(x) {
    pieces <- strsplit(x, "_", fixed = TRUE)
    vapply(
      seq_along(x),
      function(i) {
        if (length(pieces[[i]]) < 2L) {
          x[[i]]
        } else {
          paste0(pieces[[i]][-1L], collapse = "_")
        }
      },
      character(1)
    )
  }

  # Work on a private data.table copy so that `:=` below is always valid and
  # never leaks into the caller's object.
  dt <- data.table::copy(primary_df_full)
  data.table::setDT(dt)
  stopifnot(all(c("exposure", "outcome") %in% names(dt)))

  exposure_chr <- as.character(dt[["exposure"]])
  outcome_label <- recode_exact(dt[["outcome"]], outcome_labels)

  data.table::set(dt, j = "exposure_clean",
                  value = recode_exact(exposure_chr, exposure_labels))
  data.table::set(dt, j = "outcome_clean",
                  value = factor(outcome_label,
                                 levels = c(disease_outcomes, trait_outcomes)))
  data.table::set(dt, j = "outcome_category",
                  value = ifelse(outcome_label %in% disease_outcomes,
                                 "Disease", "Cardiometabolic trait"))

  # Category: the first matching rule wins, which reproduces the order of the
  # former sequential replacements.
  from_dutch <- grepl("dutch", exposure_chr)
  has_pwy <- grepl("PWY", exposure_chr)
  category <- dplyr::case_when(
    exposure_chr == "IBD_IBD" ~ "Dysbiotic disease",
    grepl("fecal_propionate_levels", exposure_chr) ~ "Fecal Metabolites",
    grepl("PWY-", exposure_chr) & grepl("sanna", exposure_chr) ~ "Microbial Pathway",
    grepl("framingham|kettunen|lotta", exposure_chr) ~ "Plasma Metabolites",
    from_dutch & has_pwy ~ "Microbial Pathway",
    from_dutch ~ "Microbe Abundance",
    grepl("fin_gut|ruhlemann|kurilshikov", exposure_chr) ~ "Microbe Abundance",
    grepl("willer_LDL", exposure_chr) ~ "Positive control",
    TRUE ~ exposure_chr
  )

  # Report exposures that no rule recognised instead of silently turning them
  # into NA when the factor is built.
  unmatched <- unique(category[!is.na(category) & !(category %in% category_levels)])
  if (length(unmatched) > 0) {
    warning("cleanify(): no Category rule for exposure(s): ",
            paste(unmatched, collapse = ", "),
            "; their Category is set to NA.", call. = FALSE)
  }

  data.table::set(dt, j = "Category",
                  value = factor(category, levels = category_levels))
  dt <- dt[order(Category, outcome, exposure)]

  # Microbial taxa: remove the study prefix and put a space before "(".
  dt[Category == "Microbe Abundance",
     exposure_clean := gsub("(", " (", strip_study_prefix(exposure_clean), fixed = TRUE)]

  dt
}
primary_df_full <- cleanify(primary_df_full = primary_df_full)
fwrite(primary_df_full, "Data/Modified/primary_df_clean.txt")
#This document builds mostly on https://mrcieu.github.io/TwoSampleMR/articles/perform_mr.html

#figure 3 -> Create a forest plot of all IVW with IBD I want it to look like in Iyas and all.
#https://neilstats.github.io/ckbplotr/articles/make_forest_plot.html#forest-plot-with-row-labels

###a better way I think. All exposures. Outcome as title. One forest plot per outcome.
105 | # primary_df <- primary_df_full[exposure == "IBD_IBD",] 106 | # resultsA <- data.frame(variable = paste0(primary_df$outcome_clean), 107 | # estimate = primary_df$b, 108 | # stderr = primary_df$se, 109 | # n = primary_df$nsnp, 110 | # P_value = formatC(primary_df$pval, format = "e", digits = 1)) 111 | # 112 | # 113 | # setDT(resultsA) 114 | # resultsA <- resultsA[order(-estimate)] 115 | # m<-make_forest_plot(panels = list(resultsA), 116 | # col.key = "variable", 117 | # panel.headings = "Primary MR analysis for inflammatory bowel disease \n and 10 health outcomes", 118 | # exponentiate = TRUE, 119 | # nullval = 1, 120 | # pointsize = 2, 121 | # col.left = c("n"), 122 | # col.left.heading = c("n SNP"), 123 | # col.right = "P_value", 124 | # col.right.heading = c("OR (95% CI)", "P-value"), 125 | # printplot = TRUE, 126 | # xlab = "Effect of IBD on chronic diseases and longevity", 127 | # colour = "blue", 128 | # cicolour = "darkred") 129 | # 130 | # # ggsave("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/IBD_IVW_forest_plot.png", 131 | # # width=436/72,height=244/72,units="in",scale=1, 132 | # # device = "png") 133 | # 134 | # b<- m[[1]] 135 | # a <- readRDS( file = "Analysis/LD_score/Results/Forest_plot_IBD_Rg_benoitversion.rdata") 136 | # 137 | # ggarrange(a, b, 138 | # labels = c("A", "B"), 139 | # ncol = 1, nrow = 2) 140 | # 141 | # ggsave("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/2_IBD_twopanel_plot.tiff", 142 | # width=700/72,height=730/72,units="in",scale=1, 143 | # device = "tiff") 144 | ###huge code for other stuff 145 | # list_primary_forest <- vector(mode = "list", length = length(unique(primary_df_full$outcome))) 146 | # for(i in 1:length(unique(primary_df_full$outcome))) { 147 | # primary_df <- primary_df_full[outcome == unique(outcome)[i]] 148 | # 149 | # resultsA <- data.frame(variable = paste0(primary_df$exposure), 150 | # estimate = primary_df$b, 151 | # stderr = primary_df$se, 152 | # n = primary_df$nsnp) 
#
#   if(primary_df[1,outcome] == "Timmers_parental_lifespan") {
#     exponentiate <- FALSE
#     col.right.heading <- "Beta (95% CI)"
#     xlab <- "Difference in lifespan years caused by gut dysbiosis"
#     nullval <- 0
#   } else {
#     exponentiate <- TRUE
#     col.right.heading <- "HR (95% CI)"
#     xlab <- "Hazard ratio of having a disease caused by microbiome"
#     nullval <- NULL
#   }
#
#   list_primary_forest[[i]] <-
#     make_forest_plot(panels = list(resultsA),
#                      col.key = "variable",
#                      panel.headings = paste0("Primarary MR analysis for ", primary_df[1,outcome]),
#                      exponentiate = exponentiate,
#                      nullval = nullval,
#                      pointsize = 2,
#                      col.left = c("n"),
#                      col.left.space = c(0.02),
#                      col.left.heading = c("n SNP"),
#                      col.right.heading = col.right.heading,
#                      printplot = FALSE,
#                      xlab = xlab)
#
#   print(list_primary_forest[[i]])
#   ggsave(paste0("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Primary/Forest_plot/",
#                 i, "_", unique(primary_df_full$outcome)[i], "_forest_plot.png" ),
#          width=12,height=12,units="in",scale=1,
#          device = "png")
#
# }
#
# saveRDS(list_primary_forest, "/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Primary/Forest_plot/list_primary_forest")
#

# Balloon_plot figure 4 - 5
primary_df_full[, zscore := b / se]
primary_df_full[, Category := factor(
  Category,
  levels = c("Positive control", "Dysbiotic disease", "Fecal Metabolites",
             "Microbial Pathway", "Plasma Metabolites", "Microbe Abundance")
)]
primary_df_full <- primary_df_full[order(Category), ]
# An exposure tested against itself (e.g. LDL cholesterol) is blanked out.
primary_df_full[exposure_clean == outcome_clean, c("b", "se", "pval", "zscore") := NA]
# The names are deliberately swapped (confusing, but intended): plot_balloon()
# puts `exposure` on the x axis, and the x axis must show the outcomes.
dat_plot <- data.frame(
  exposure = factor(primary_df_full$outcome_clean, levels = unique(primary_df_full$outcome_clean)),
  outcome = factor(primary_df_full$exposure_clean, levels = unique(primary_df_full$exposure_clean)),
  pval = as.numeric(primary_df_full$pval),
  z_score = as.numeric(primary_df_full$zscore),
  beta_score = as.numeric(primary_df_full$b),
  Category = primary_df_full$Category,
  outcome_category = primary_df_full$outcome_category
)
#Category = factor(primary_df_full$Category, levels = sort(unique(primary_df_full$Category))))

dat_plot <- dat_plot[dat_plot$Category != "Dysbiotic disease", ]
setDT(dat_plot)
dat_plot <- dat_plot[order(Category), ]
###########
# Draw a balloon plot of MR estimates and print it.
#
# Significant associations (pval < bonferroni_threshold) are drawn as circles
# sized by -log10(p) and coloured by beta; all other associations with a
# non-missing p-value are drawn as small crosses. Panels are split by
# `outcome_category`.
#
# Args:
#   dat_plot: table with columns exposure (x axis), outcome (y axis), pval,
#     beta_score and outcome_category.
#   bonferroni_threshold: p-value cut-off separating circles from crosses.
# Returns:
#   The ggplot object, invisibly (it is also printed as a side effect).
#
# NOTE(review): `element_line(size = )` is kept as in the original; recent
# ggplot2 releases prefer `linewidth` for lines - confirm against the
# installed version before switching.
plot_balloon <- function(dat_plot, bonferroni_threshold = 0.05) {
  dat_plot$pval <- as.numeric(dat_plot$pval)
  dat_plot$log10_pval <- -log10(dat_plot$pval)

  # Layer 1 data: keep only the significant points (circles).
  dat_plot_rond <- dat_plot
  not_signif <- which(dat_plot_rond$pval >= bonferroni_threshold)
  dat_plot_rond$beta_score[not_signif] <- NA
  dat_plot_rond$log10_pval[not_signif] <- NA
  # Vectorised ifelse: same result as the element-wise version, but the
  # length and type stay well defined when the table has zero rows.
  dat_plot_rond$shape_point <- ifelse(dat_plot_rond$pval < bonferroni_threshold, "rond", NA)

  # Layer 2 data: keep only the non-significant points (crosses).
  dat_plot_croix <- dat_plot
  signif <- which(dat_plot_croix$pval < bonferroni_threshold)
  dat_plot_croix$beta_score[signif] <- NA
  dat_plot_croix$log10_pval[signif] <- NA
  dat_plot_croix$shape_point <- ifelse(dat_plot_croix$pval >= bonferroni_threshold, "Non-significant", NA)

  grid_line <- ggplot2::element_line(size = 0.25, colour = "gray60")
  axis_line <- ggplot2::element_line(size = 0.5, colour = "gray20")

  balloon_plot <- ggplot2::ggplot() +
    ggplot2::geom_point(
      data = dat_plot_croix,
      ggplot2::aes(x = exposure, y = outcome, shape = factor(shape_point)),
      size = 2, color = "gray20"
    ) +
    ggplot2::geom_point(
      data = dat_plot_rond,
      ggplot2::aes(x = exposure, y = outcome, size = log10_pval, color = beta_score)
    ) +
    ggplot2::facet_grid(. ~ outcome_category, scales = "free_x") +
    ggplot2::scale_color_gradient2(
      name = "Beta",
      low = scales::muted("#5884E5"),
      mid = "white",
      high = scales::muted("#9E131E"),
      midpoint = 0
    ) +
    ggplot2::scale_shape_manual(name = "", values = c(4, 1)) +
    ggplot2::scale_size(name = expression(-Log[10](P)), range = c(4, 7)) +
    # ggplot2::coord_fixed(clip = "off", ratio = 1) +
    ggplot2::guides(
      size = ggplot2::guide_legend(order = 1),
      shape = ggplot2::guide_legend(order = 2)
    ) +
    ggplot2::theme(
      panel.grid.major.y = grid_line,
      panel.grid.major.x = grid_line,
      panel.grid.minor.y = ggplot2::element_blank(),
      panel.grid.minor.x = ggplot2::element_blank(),
      panel.background = ggplot2::element_blank(),
      plot.margin = ggplot2::margin(t = 2, r = 0.5, b = 0.5, l = 0.5, unit = "cm"),
      legend.position = "right",
      legend.text = ggplot2::element_text(
        color = "gray20",
        size = 10,
        margin = ggplot2::margin(l = 0.2, r = 0.2)
      ),
      legend.spacing.y = ggplot2::unit(0.1, "cm"),
      legend.key = ggplot2::element_rect(fill = "transparent", colour = "transparent"),
      legend.key.size = ggplot2::unit(0.8, "cm"),
      axis.title = ggplot2::element_blank(),
      axis.line = axis_line,
      axis.ticks = axis_line,
      axis.text.y = ggplot2::element_text(
        size = 10,
        colour = "gray20"
      ),
      axis.text.x = ggplot2::element_text(
        angle = 60,
        size = 8,
        hjust = 1,
        face = "plain",
        colour = "gray20"
      )
    )

  print(balloon_plot)
  invisible(balloon_plot)
}

#############
# Pass the plot to ggsave() explicitly instead of relying on last_plot().
balloon_no_microbe <- plot_balloon(dat_plot[dat_plot$Category != "Microbe Abundance", ])

ggsave("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/2_balloon_plot_nomicrobre.png",
       plot = balloon_no_microbe,
       width = 638 / 72, height = 458 / 72, units = "in", scale = 1,
       device = "png")


dat_plot2 <- dat_plot[dat_plot$Category %in% c("Microbe Abundance", "Positive control"), ]
balloon_only_microbe <- plot_balloon(dat_plot2)
ggsave("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/3_balloon_plot_onlymicrobre.png",
       plot = balloon_only_microbe,
       width = 750 / 72, height = 1400 / 72, units = "in", scale = 1, dpi = 700,
       device = "png")

# dat_plot2 <- dat_plot[dat_plot$Category == "Microbe Abundance", ]
#
# kk <-unique(dat_plot2$outcome)
# split_kk <- split(kk, ceiling(seq_along(kk)/(length(kk)/2+1)))
#
# dat_plot2 <- dat_plot[(dat_plot$Category == "Microbe Abundance" &
#                          dat_plot$outcome %in% split_kk[[1]]) |
#                         (dat_plot$Category == "Positive control"),]
#
#
# plot_balloon(dat_plot2)
#
# ggsave(paste0("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/3b_balloon_plot_onlymicrobre.png"),
#        width=750/72,height=900/72,units="in",scale=1, dpi=700,
#        device = "png")
#
# dat_plot3 <- dat_plot[(dat_plot$Category == "Microbe Abundance" &
#                          dat_plot$outcome %in% split_kk[[2]]) |
#                         (dat_plot$Category == "Positive control"),]
#
#
# plot_balloon(dat_plot3)
#
# ggsave(paste0("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/3b_balloon_plot_onlymicrobre.png"),
#        width=750/72,height=900/72,units="in",scale=1, dpi=700,
#        device = "png")

#######Sensitivity forest
#sensitivity
list_sensitivity <- readRDS("Data/Modified/Sensitivity/list_sensitivity")
veclog <- readRDS("Data/Modified/Sensitivity/veclog")

# x <- list_sensitivity[[1]]
# x[method %in% c("Weighted mode", "Weighted median"), any(pval<0.05)] &
#   x[method %in% c("IVW radial", "Weighted median"), any(pval<0.05)]
# vec_which<-sapply(list_sensitivity, function(x) x[method %in% c("Weighted mode", "Weighted median"), pval]
# df_compare_traits_groups


length(list_sensitivity)
sum(veclog)
dt_sen <- rbindlist(list_sensitivity[veclog], fill = TRUE)
dt_sen <- cleanify(dt_sen)

dt_sen <- dt_sen[order(exposure_clean, outcome_clean), ]
max_chr_exp <- dt_sen[, max(nchar(exposure_clean))]
dt_sen[, outcome_clean := as.character(outcome_clean)]
max_chr_out <- dt_sen[, max(nchar(outcome_clean))]

#exposure_really_clean
# dt_sen$exposure_really_clean <-
#   dt_sen[, paste0(exposure_clean, paste(rep("-", max_chr_exp - nchar(exposure_clean)),collapse = "")), by = seq_len(nrow(dt_sen))]$V1
#
#
# dt_sen$outcome_really_clean <-
#   dt_sen[, paste0(paste(rep("-", max_chr_out - nchar(outcome_clean)),collapse = ""), outcome_clean), by = seq_len(nrow(dt_sen))]$V1
#

# Row label shown in the forest plot: "<exposure> --> <outcome>".
dt_sen[, exposure_really_clean := paste0(exposure_clean, " --> ")]
dt_sen[, outcome_really_clean := outcome_clean]

dt_sen[, method := factor(method, levels = unique(method))]
dt_sen[, name := paste0(exposure_really_clean, outcome_really_clean)]
dt_sen[, unique(nchar(name))]
dt_sen[, Category_other := ifelse(Category == "Microbe Abundance", "Microbe Abundance", "Microbiota Associated Metabolites")]
# The contamination mixture method reports no standard error here; derive one
# from the half-width of its 95% confidence interval.
dt_sen[method == "Contamination mixture", se := ((b - lci) + (uci - b))/(2*1.96) ]


# Forest plot of MR estimates, one row per `name`, coloured by `method` and
# split into one facet per `Category_other`.
#
# Args:
#   data: table with columns name, b, se, pval, method and Category_other.
#   xlim_data: table (columns method, lci, uci) whose confidence limits,
#     excluding "MR Egger" rows, define the x-axis limits. The default is the
#     global `dt_sen`, which is what the function always used implicitly;
#     pass `data` to scale the axis to the plotted table instead.
# Returns:
#   A ggplot object.
my_forest_plot <- function(data, xlim_data = dt_sen) {
  x_limits <- xlim_data[method != "MR Egger", round(c(min(lci), max(uci)), digits = 1)]

  ggforestplot::forestplot(
    df = data,
    name = name,
    se = se,
    estimate = b,
    pvalue = pval,
    psignif = 0.05,
    xlab = "Effect size (SD or log(OR)) per 1-SD\nincrease in gut microbiota features",
    ci = 0.95,
    colour = method,
    xlim = x_limits
  ) +
    theme(legend.position = "right") +
    theme(text = element_text(size = 10)) +
    scale_x_continuous(breaks = c(-0.3, -0.2, -0.1, 0.1, 0.2, 0.3, 0.4)) +
    ggforce::facet_col(
      # facets = ~outcome_category,
      facets = ~Category_other,
      scales = "free_y",
      space = "free"
    )
}

# Save each figure by passing the plot explicitly rather than via last_plot().
forest_sensitivity <- my_forest_plot(data = dt_sen)
print(forest_sensitivity)
ggsave("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/4_forest_sensitivity.png",
       plot = forest_sensitivity,
       width = 732 / 72, height = 437 / 72, units = "in", scale = 1,
       device = "png")

# Histogram of power figure 6
# Subsetting creates a new table, so the `:=` below does not touch primary_df_full.
primary_df <- primary_df_full[exposure != "willer_LDL-cholesterol:ieu-a-300"]
under.8 <- primary_df[, mean(power), by = outcome][V1 < 0.8, ]$outcome
primary_df[, category := ifelse(outcome %in% under.8, outcome, "other")]

# Share of associations with power below 0.8.
mean(primary_df$power < 0.8)

power_histogram <- ggplot(primary_df, aes(x = power)) +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(0.2, 1, by = 0.2)) +
  theme_classic() +
  annotate(geom = "text", x = 0.8, y = 300,
           label = paste0(round(sum(primary_df$power>0.8)/length(primary_df$power)*100,1), "% \n over 0.8"),
           hjust = 0, vjust = 1, size = 5) +
  geom_vline(xintercept = 0.8, linetype = "dotted", colour = "red") +
  ggtitle(paste0("Power to find a 0.1 effect for all ", nrow(primary_df)," associations" ))
print(power_histogram)

ggsave("Results/Figures/5_histogram_of_power.png",
       plot = power_histogram,
       width = 460 / 72, height = 326 / 72, units = "in", scale = 1, device = "png")

#MVMR forest plot figure 7

MVMR_dat <- fread("/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Sensitivity/MVMR")
# Fix a misspelled label coming from the upstream file.
MVMR_dat[MVMR == "no counfounder", MVMR := "no confounder"]
MVMR_dat <- cleanify(MVMR_dat)
MVMR_dat <- MVMR_dat[order(exposure_clean, outcome_clean), ]
max_chr_exp <- MVMR_dat[, max(nchar(exposure_clean))]
MVMR_dat[, outcome_clean := as.character(outcome_clean)]
max_chr_out <- MVMR_dat[, max(nchar(outcome_clean))]
MVMR_dat[, exposure_really_clean := paste0(exposure_clean, " --> ")]
MVMR_dat[, outcome_really_clean := outcome_clean]

MVMR_dat[, method := factor(MVMR, levels = c("no confounder", "with BMI", "with alcohol intake frequency"))]
MVMR_dat[, name := paste0(exposure_really_clean, outcome_really_clean)]
MVMR_dat[, Category_other := ifelse(Category == "Microbe Abundance", "Microbe Abundance", "Microbiota Associated Metabolites")]

# Uses the default x-axis limits (taken from dt_sen), as before.
forest_mvmr <- my_forest_plot(MVMR_dat)
print(forest_mvmr)

ggsave("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Figures/MVMR_forest_plot.png",
       plot = forest_mvmr,
       width = 730 / 72, height = 430 / 72, units = "in", scale = 1,
       device = "png")

fwrite(MVMR_dat, "Data/Modified/MVMR_clean.txt")
# list_MVMR <- list("1a_MVMR_forest_plot" = MVMR_dat[outcome_category == "Disease",],
#                   "1b_MVMR_forest_plot" = MVMR_dat[outcome_category == "Risk_factor",])
#
# for(i in 1:length(list_MVMR)) {
#
#   MVMR <- list_MVMR[[i]]
#   resultsA <- data.frame(variable = as.character(1:nrow(MVMR)),
#                          estimate = round(MVMR$b, digits =3),
#                          lci = round(MVMR$lci, digits = 3),
#                          uci = round(MVMR$uci, digits = 3),
#                          P_value = formatC(MVMR$pval, format = "e", digits = 1),
#                          n = MVMR$nsnp)
#
#   mylabels <- data.frame(heading1 = "",
#                          heading2 = paste0(MVMR$exposure_clean, " with ", MVMR$outcome_clean),
#                          heading3 = MVMR$MVMR,
#                          variable = as.character(1:nrow(MVMR)))
#
#   make_forest_plot(panels = list(resultsA),
#                    col.key = "variable",
#
#                    panel.headings = "IVW estimates with and without correcting for confounder",
#                    row.labels = mylabels,
#                    exponentiate = TRUE,
#                    pointsize = 2,
#                    rows = unique(mylabels$heading1),
#                    col.stderr = NULL,
#                    col.lci = "lci",
#                    col.uci = "uci",
#                    col.left = c("n"),
#                    col.left.heading = c("n SNP"),
#                    col.right = "P_value",
#                    col.right.heading = c("OR (95% CI)", "P-value"),
#                    xlab = "IVW estimates",
#                    colour = "blue",
#                    cicolour = "darkred",
#                    blankrows = c(0,1,0,0),
#                    xlim = c(0.65, 1.8)
#   )
#
#   ggsave(paste0("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Supplementary_material/", names(list_MVMR)[i],".png"),
#          width=710/72,height=576/72, units="in", scale=1,
#          device = "png")
#
# }

#10-11 -> For all IVW estimate with p < 0.05, perform sensitivity analysis and present the scatter plot
list_sensitivity <- readRDS("/home/gagelo01/workspace/Projects/Dysbiose_project/Data/Modified/Sensitivity/list_sensitivity")
# Keep the exposure-outcome pairs with more than three methods at p < 0.05.
# Missing p-values are ignored so that they cannot produce NA list indices.
n_significant <- vapply(
  list_sensitivity,
  function(x) sum(x$pval < 0.05, na.rm = TRUE),
  integer(1)
)
list_sensitivity <- list_sensitivity[n_significant > 3]
list_sensitivity <- lapply(list_sensitivity, cleanify)
# exposure/outcome take the display labels; the raw identifiers are kept in
# *_copy and are used for the output file names.
list_sensitivity <- lapply(list_sensitivity, function(x) {
  x[, c("exposure", "outcome", "exposure_copy", "outcome_copy") := .(exposure_clean, outcome_clean, exposure, outcome)]
})

harm_all <- fread("Data/Modified/harm_all.txt")
harm_all <- cleanify(harm_all)
fwrite(harm_all, "Data/Modified/harm_all_clean.txt")
harm_all[, id.exposure := exposure]
harm_all[, id.outcome := outcome]
list_harm <- split(harm_all, harm_all$exposure_outcome)

# Drop the entries whose first SNP is recorded as 0.
has_zero_snp <- vapply(list_harm, function(x) isTRUE(x[["SNP"]][1] == 0), logical(1))
list_harm <- list_harm[!has_zero_snp]
invisible(lapply(list_harm, setDT)) # invisible(): avoid printing every table
list_harm <- lapply(list_harm, cleanify)
list_harm <- lapply(list_harm, function(x) {
  x[, c("exposure", "outcome") := .(exposure_clean, outcome_clean)]
})

# First value of a column as a string (labels, not factor codes).
first_value <- function(dt, col) as.character(dt[[col]][1])

# Computed once instead of once per sensitivity result.
harm_exposure <- vapply(list_harm, first_value, character(1), col = "exposure")
harm_outcome <- vapply(list_harm, first_value, character(1), col = "outcome")

for (j in seq_along(list_sensitivity)) {
  li_sen <- list_sensitivity[[j]]

  # Harmonised data set belonging to this exposure-outcome pair.
  index <- which(
    harm_exposure == first_value(li_sen, "exposure") &
      harm_outcome == first_value(li_sen, "outcome")
  )
  if (length(index) != 1L) {
    stop("Expected exactly one harmonised data set for '",
         first_value(li_sen, "exposure"), "' / '", first_value(li_sen, "outcome"),
         "' but found ", length(index), ".", call. = FALSE)
  }

  p <- mr_scatter_plot(li_sen, list_harm[[index]])
  ggplot_object <- p[[1]] + theme_classic() +
    theme(legend.position = "top")
  # Nothing is auto-printed inside a loop, so hand the plot to ggsave().
  ggsave(paste0("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Supplementary_material/Scatter_plot/",
                j, "_", first_value(li_sen, "exposure_copy"), "__", first_value(li_sen, "outcome_copy"), "_", "scatter_plot.png"),
         plot = ggplot_object,
         width = 538 / 72, height = 335 / 72, units = "in", scale = 1,
         device = "png")

  saveRDS(ggplot_object, file = paste0("Results/Supplementary_material/ggplot_object/", j, "_scatter.rdata"))
}


#8-9 -> For all IVW estimate with p < 0.05 and nSNPs > 1, perform a forest plot that look like Iyas and all.-----

list_sensitivity_forest <- vector(mode = "list", length = length(list_sensitivity))

for (i in seq_along(list_sensitivity)) {
  # NOTE: li_sen is the same data.table object as list_sensitivity[[i]], so
  # the `:=` below also changes the list element.
  li_sen <- list_sensitivity[[i]]

  # Cap an extremely wide RAPS confidence interval at b +/- 4 so that it does
  # not dominate the x axis. any() keeps the test valid when the method is
  # absent or its limits are missing.
  raps_width <- li_sen[method == "Robust adjusted profile score (RAPS)", uci - lci]
  if (any(raps_width > 8, na.rm = TRUE)) {
    li_sen[method == "Robust adjusted profile score (RAPS)", uci := b + 4]
    li_sen[method == "Robust adjusted profile score (RAPS)", lci := b - 4]
  }

  # Parental lifespan is shown on the beta scale, everything else as OR.
  if (isTRUE(first_value(li_sen, "outcome") == "Parental lifespan")) {
    exponentiate <- FALSE
    col.right.heading <- c("Beta (95% CI)", "P-value")
    xlab <- paste0("1 year change in ", li_sen[1,]$outcome_clean, " per 1-SD increase of ", li_sen[1,]$exposure_clean)
    nullval <- 0
  } else {
    exponentiate <- TRUE
    col.right.heading <- c("OR (95% CI)", "P-value")
    xlab <- paste0("Odds ratio of ", li_sen[1,]$outcome_clean, " per 1-SD increase of ", li_sen[1,]$exposure_clean)
    nullval <- 1
  }

  # Object and argument names are left as they were for make_forest_plot().
  resultsA <- data.frame(variable = li_sen$method,
                         estimate = round(li_sen$b, digits = 3),
                         lci = round(li_sen$lci, digits = 3),
                         uci = round(li_sen$uci, digits = 3),
                         n = li_sen$nsnp,
                         pval = formatC(li_sen$pval, format = "e", digits = 1))

  mylabels <- data.frame(heading1 = li_sen$type_of_test,
                         heading2 = li_sen$method,
                         heading3 = as.character(NA),
                         variable = li_sen$method)

  list_sensitivity_forest[[i]] <-
    make_forest_plot(panels = list(resultsA),
                     col.key = "variable",
                     row.labels = mylabels,
                     exponentiate = exponentiate,
                     pointsize = 2,
                     rows = unique(mylabels$heading1),
                     col.stderr = NULL,
                     col.lci = "lci",
                     col.uci = "uci",
                     col.left = c("n"),
                     col.left.heading = c("n SNP"),
                     col.right.heading = col.right.heading,
                     col.right = "pval",
                     xlab = xlab,
                     nullval = nullval,
                     blankrows = c(0,0,0,0),
                     panel.names = "")

  # The first element of the returned list is the plot (it is used the same
  # way in the two-panel figure below); save it explicitly.
  ggplot_object <- list_sensitivity_forest[[i]]
  ggsave(paste0("/home/gagelo01/workspace/Projects/Dysbiose_project/Results/Supplementary_material/Forest_plot/",
                i, "_", first_value(li_sen, "exposure_copy"), "__", first_value(li_sen, "outcome_copy"), "_forest_plot.png"),
         plot = ggplot_object[[1]],
         width = 659 / 72, height = 357 / 72, units = "in", scale = 1, device = "png")

  saveRDS(ggplot_object, file = paste0("Results/Supplementary_material/ggplot_object/", i, "_forest.rdata"))

}

###two panel supplementary figure

for (i in seq_along(list_sensitivity)) {
  a <- readRDS(file = paste0("Results/Supplementary_material/ggplot_object/", i, "_scatter.rdata"))
  b <- readRDS(file = paste0("Results/Supplementary_material/ggplot_object/", i, "_forest.rdata"))

  two_panel <- ggarrange(a, b[[1]],
                         labels = c("A", "B"),
                         ncol = 1, nrow = 2)

  ggsave(paste0("Results/Supplementary_material/TwoPanel/", i, "_scatter_forest.png"),
         plot = two_panel,
         width = 490 / 72, height = 475 / 72, units = "in", scale = 1,
         device = "png")
}

# ##forest plot between IBD and diseases
# harm_all_clean <- cleanify(harm_all)
# harm_all_clean[exposure_clean == "IBD_IBD", exposure_clean := "Inflammatory bowel disease"]
# harm_all_clean[,exposure := exposure_clean]
# harm_all_clean[,outcome := outcome_clean]
# singlesnp <- mr_singlesnp(dat = harm_all_clean[exposure == "Inflammatory bowel disease"], single_method = "mr_wald_ratio", all_method = "mr_ivw")
# setDT(singlesnp)
# index <- unique(singlesnp[,outcome])
# for(i in 1:length(index)) {
#   s<-mr_forest_plot(singlesnp_results = singlesnp[outcome == index[i]], exponentiate = TRUE)
#   s[[1]] +
#     theme_classic() +
#     theme(legend.position="none") +
#     theme(axis.text.y = element_text(size=6),
#           axis.title=element_text(size=10))
#
#   titre<-gsub(" ", "_", index[i])
#   ggsave(paste0("Results/Supplementary_material/Singlesnp/singlesnp_IBD_", titre, ".png"),
#          width=459/72,height=594/72,units="in",scale=1, device = "png")
#
# }


# --------------------------------------------------------------------------------