├── .gitattributes ├── .gitignore ├── README.md ├── Smith_Deutsch_NatureMedicine_2024 ├── AFR │ ├── hg38_score_info │ │ ├── Beta Cell 1.csv │ │ ├── Beta Cell 2.csv │ │ ├── Blood Markers.csv │ │ ├── Hyper Insulin.csv │ │ ├── Lipodystrophy.csv │ │ ├── Obesity.csv │ │ ├── Proinsulin.csv │ │ └── Total_GRS.csv │ └── liftover_map.txt ├── AMR │ ├── hg38_score_info │ │ ├── ASAT Pos.csv │ │ ├── Beta Cell 2.csv │ │ ├── Lipodystrophy.csv │ │ ├── Obesity.csv │ │ ├── Total_GRS.csv │ │ └── Unknown.csv │ └── liftover_map.txt ├── EAS │ ├── hg38_score_info │ │ ├── ALP Neg.csv │ │ ├── Beta Cell 1.csv │ │ ├── Beta Cell 2.csv │ │ ├── Lipodystrophy 1.csv │ │ ├── Lipodystrophy 2.csv │ │ ├── Liver Lipid.csv │ │ ├── Obesity.csv │ │ ├── Proinsulin.csv │ │ ├── Total_GRS.csv │ │ └── Unknown.csv │ └── liftover_map.txt ├── EUR │ ├── hg38_score_info │ │ ├── ALP Neg.csv │ │ ├── ASAT Pos.csv │ │ ├── Beta Cell 1.csv │ │ ├── Beta Cell 2.csv │ │ ├── Bilirubin.csv │ │ ├── Cholesterol.csv │ │ ├── Hyper Insulin.csv │ │ ├── Lipodystrophy 1.csv │ │ ├── Lipodystrophy 2.csv │ │ ├── Liver-Lipid.csv │ │ ├── Obesity.csv │ │ ├── Proinsulin.csv │ │ ├── SHBG-LpA.csv │ │ ├── Total_GRS.csv │ │ └── VAT Neg.csv │ └── liftover_map.txt └── MultiAncestry │ ├── hg38_score_info │ ├── ALP Neg.csv │ ├── Beta Cell 1.csv │ ├── Beta Cell 2.csv │ ├── Bilirubin.csv │ ├── Cholesterol.csv │ ├── Hyper Insulin.csv │ ├── Lipodystrophy 1.csv │ ├── Lipodystrophy 2.csv │ ├── Liver-Lipid.csv │ ├── Obesity.csv │ ├── Proinsulin.csv │ ├── SHBG-LpA.csv │ └── Total_GRS.csv │ └── liftover_map.txt ├── doc └── Variant clustering preprocessing pipeline_plan_KW.docx ├── example_data ├── clustering_data_sources_example.xlsx ├── my_GWAS │ ├── ALP_sample.txt │ ├── ALT_sample.txt │ ├── AST_sample.txt │ ├── Adiponectin_sample.txt │ ├── Albumin_sample.txt │ ├── BFP_sample.txt │ ├── BMI_sample.txt │ ├── C_reactive_protein_sample.txt │ ├── Cholesterol_sample.txt │ ├── FGadjBMI_sample.txt │ ├── FIadjBMI_sample.txt │ ├── GGT_sample.txt │ ├── 
Glucose_sample.txt │ ├── HDL_sample.txt │ ├── HOMAB_sample.txt │ ├── HOMAIR_sample.txt │ ├── Haemoglobin_concentration_sample.txt │ ├── HbA1c.adjBMI_sample.txt │ ├── HbA1c_sample.txt │ ├── LipoproteinA_sample.txt │ ├── Lymphocyte_count_sample.txt │ ├── Monocyte_count_sample.txt │ ├── Neutrophil_count_sample.txt │ ├── Proins_sample.txt │ ├── RBC_sample.txt │ ├── SHBG_sample.txt │ ├── T2D_GWAS_1_sample.txt │ ├── T2D_GWAS_2_sample.txt │ ├── T2D_GWAS_3_sample.txt │ ├── TG_sample.txt │ ├── Urate_sample.txt │ ├── Urea_sample.txt │ ├── VitaminD_sample.txt │ ├── WBC_sample.txt │ ├── WHR_female_sample.txt │ └── WHR_male_sample.txt └── rsID_map_example.txt └── scripts ├── .Rhistory ├── archive ├── choose_variants_2021.R ├── compiled_code.txt ├── get_data_for_ALL_variants_from_ALL_datasets_findproxy_readall_new_BMI_edit.pl ├── gwas_variant_selection.R ├── main.BayesNMF.script_to_Jaeyoon_edit_claire_T2D.R ├── prep_bNMF_2021.R ├── process_traits.R ├── proximal_preprocessing.R ├── run_bNMF_2021.R └── test_pipeline_2021.R ├── bNMF_example_pipeline.R ├── choose_variants.R ├── choose_variants_2025.R ├── format_bNMF_results.Rmd ├── generate_varid_to_rsid_map_file.R ├── post_bNMF_2025.R ├── prep_bNMF.R ├── prep_bNMF_2025.R ├── run_bNMF.R └── run_bNMF_2025.R /.gitattributes: -------------------------------------------------------------------------------- 1 | list_VARID_rsID_updated.txt.gz filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # High-level directories 2 | data/* 3 | opt/* 4 | cache/* 5 | 6 | *.DS_store 7 | pipeline_walkthrough/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## T2D Multi-ancestry Partitioned Polygenic Scores 2 | Cluster weights are available from the [Smith, 
Deutsch et al Nature Medicine 2024](https://www.nature.com/articles/s41591-024-02865-3) paper in the "Smith_Deutsch_NatureMedicine_2024" folder. Partitioned polygenic scores (pPS) can be generated using the enclosed variant cluster weights. We have included weights for ancestry-specific and multi-ancestry clusters: 3 | 4 | * In the weights files, the "Risk_Allele" column denotes the T2D risk-increasing allele. 5 | * When generating the pPS, all genotypes should be aligned to this allele! 6 | * Weights are provided in hg38; a liftover map (hg19 to hg38) is included in each subfolder. 7 | 8 | ## Pipeline for GWAS clustering using Bayesian non-negative matrix factorization (bNMF) 9 | 10 | The bNMF procedure, as applied here, is used to detect clusters of GWAS variants for some outcome of interest based on the associations of those variants with a set of additional traits. This pipeline includes pre-processing steps (such as quality control of variants and traits and the choice of proxy variants), preparation of the z-score matrix, clustering, and summarization of results. 11 | 12 | **Important:** The current pipeline makes certain assumptions and uses some hard-coded filenames, including: 13 | 14 | * "VAR_ID"s for GWAS and trait-specific summary statistics are in a specific format (CHR_POS_REF_ALT), with alleles aligned in a consistent way across traits (i.e. variant matching is performed using a simple string match). 15 | * The variant reference file linking VAR_IDs to rsIDs is based on the VAR_ID format above, and points to a file available on the Broad Institute compute cluster. The VAR_ID to rsID pairs can be generated using the script **"generate_varid_to_rsid_map_file.R"** 16 | 17 | ### Requirements 18 | The pipeline relies on APIs for performing several linkage disequilibrium (LD)-based operations, including LD-pruning and proxy variant searches. 
The user must either acquire a personal token for LDlinkR or download the topLD API, depending on how they wish to perform these steps: 19 | 20 | * **LDlinkR**: request token from https://ldlink.nih.gov/?tab=apiaccess 21 | * LDlinkR is used within the ld_prune and choose_proxies functions. Alternatives include performing position-based clumping using the snp_clump function, and using the topLD API option for choose_proxies. 22 | * **topLD**: download API to your project folder (instructions here: https://github.com/linnabrown/topld_api) 23 | * Note that topLD is currently not compatible with macOS. 24 | 25 | ### 1. Choose the set of variants to be clustered (choose_variants.R) 26 | **ld_prune**: LD-based pruning of the input variant set 27 | **snp_clip**: Alternative to ld_prune; uses chromosomal position to prune variant set 28 | **count_traits_per_variant**: Assess the fraction of traits missing each variant of interest 29 | **find_variants_needing_proxies**: Determine which variants need proxies (allele considerations, missingness, etc.) 30 | **choose_proxies**: Select proxies for the necessary variants and output the final variant set for clustering 31 | 32 | ### 2. Prepare the final z-score matrix (prep_bNMF.R) 33 | **fetch_summary_stats**: Retrieve z-scores and sample sizes across all traits for the final variant set 34 | **fill_missing_zcores**: Fill missing values in the variant-trait association matrix 35 | **prep_z_matrix**: Final trait filters, sample size adjustment, and creation of non-negative input matrix (N (variants) x 2M (traits), with separate columns for positive trait associations (zero otherwise) and negative trait associations) 36 | 37 | ### 3. Run bNMF and summarize the results (run_bNMF.R) 38 | **run_bNMF**: Run the bNMF procedure (over multiple iterations) 39 | **summarize_bNMF**: Summarize the results and create heatmaps for visualization 40 | 41 | ### 4. 
Further summarize and visualize results 42 | **format_bNMF_results.Rmd**: Generates an HTML file that includes several plots and tables summarizing the cluster results 43 | 44 | * Also included is a calculation of an optimal weight threshold for identifying cluster-defining variants and traits 45 | 46 | ### Outputs 47 | Most steps of the pipeline will print messages with details of the procedure. In addition, the following outputs will be written to the working directory. 48 | 49 | * no_proxies_found.txt: A list of variants that were excluded and for which no acceptable proxies were found. 50 | * run_summary.txt: A table listing the chosen K (# of clusters) and negative log-likelihood for each bNMF iteration. 51 | * z_score_mat.rds: A binary R object containing the N x M z-score matrix after all preprocessing steps. 52 | * L2EU.W.mat.K[]: The matrix of feature contributions to clusters for the K in question (one per K chosen in at least one iteration). 53 | * L2EU.H.mat.K[]: The matrix of variant contributions to clusters for the K in question. 54 | * W_plot_K[].pdf: A heatmap displaying feature contributions to clusters for the K in question. 55 | * H_plot_K[].pdf: A heatmap displaying variant contributions to clusters for the K in question. 
56 | 57 | ### Contributors 58 | * Claire Kim (design and code) 59 | * Kenny Westerman (design and code) 60 | * Kirk Smith (code) 61 | * Jaegil Kim (code) 62 | * Marcin von Grotthuss (code) 63 | * Miriam Udler (design and supervision) 64 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Beta Cell 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,197541732,C,T,rs9325395,T,1.14049703135893 3 | 5,31436933,C,T,rs493760,C,1.28649231683287 4 | 6,20686342,C,A,rs7766070,A,3.41321775882255 5 | 8,41651740,G,A,rs12549902,A,4.15965995628624 6 | 10,112992744,T,C,rs17747324,C,1.10483442041379 7 | 10,112998590,C,T,rs7903146,T,1.79073378748542 8 | 11,2175119,A,G,rs4930044,A,1.54513936560311 9 | 12,41574480,G,A,rs10880103,G,1.34707283985582 10 | 12,75039749,A,G,rs10879874,A,1.23803020813872 11 | 12,80592093,G,T,rs1528287,T,1.30853148600033 12 | 13,31135669,A,C,rs9538367,C,1.22973026426221 13 | 17,77902588,A,G,rs9899517,A,1.70309057170068 14 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Beta Cell 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,123346931,A,G,rs11708067,A,2.85678418025812 3 | 3,185816694,G,A,rs9859406,A,1.09532761532444 4 | 6,20686342,C,A,rs7766070,A,1.65025246141402 5 | 7,15019701,T,C,rs12540947,T,1.08718444457625 6 | 7,15025271,T,G,rs4721401,G,1.51636423495046 7 | 7,157233358,A,G,rs1182443,G,0.982566657994938 8 | 7,28140937,T,C,rs864745,T,1.03509730159163 9 | 7,44145489,G,A,rs2908274,A,1.08604250980022 10 | 7,44184122,G,A,rs730497,A,3.28882703813248 11 | 10,112992744,T,C,rs17747324,C,3.34452390445164 12 | 10,112998590,C,T,rs7903146,T,3.88464226247631 13 | 10,113014674,A,C,rs36090025,C,3.00494927966238 14 | 
11,2670270,G,A,rs231361,A,1.28057459942797 15 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Blood Markers.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,158670244,A,G,rs857682,A,5.12028565509816 3 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Hyper Insulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,159022630,C,T,rs60436412,T,0.937208121482674 3 | 3,185816694,G,A,rs9859406,A,1.4547676152557 4 | 3,190741327,A,G,rs6792341,G,1.70532156926971 5 | 6,39048860,C,T,rs10305420,C,1.18031610117673 6 | 7,28140937,T,C,rs864745,T,1.38053889749524 7 | 7,50819477,T,C,rs7781440,T,1.5370579485721 8 | 7,53139155,A,C,rs10259582,C,0.975441660585956 9 | 7,6694510,T,G,rs3801034,G,0.984780208023824 10 | 8,13987532,T,G,rs6991416,G,1.51623374674583 11 | 11,2190447,G,A,rs7396447,G,1.59235929925333 12 | 11,4671034,G,T,rs10836447,G,1.90220698103679 13 | 12,55216782,G,T,rs12314766,G,1.00404060576134 14 | 12,55662031,C,T,rs11614818,C,1.04278700039453 15 | 12,55786980,G,A,rs1681087,A,1.06354540778224 16 | 12,65788675,G,A,rs971779,A,1.15455466224296 17 | 12,80922459,A,G,rs7306751,G,0.951919927045835 18 | 13,68824414,T,C,rs60108948,T,1.39201366039072 19 | 17,30816009,C,T,rs112944916,T,1.41391360410219 20 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Lipodystrophy.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 5,56511543,C,T,rs464605,T,5.61538480583823 3 | 7,6694510,T,G,rs3801034,G,1.35073259004649 4 | 12,65788675,G,A,rs971779,A,1.01288557808651 5 | 
12,85437387,G,A,rs1533223,A,0.988679452869238 6 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Obesity.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 6,42910968,C,T,rs112504542,T,1.30936561514991 3 | 7,28140937,T,C,rs864745,T,0.975796362368512 4 | 7,6694510,T,G,rs3801034,G,1.16998873599611 5 | 12,57574955,G,A,rs11172254,G,1.09360276744307 6 | 16,53777876,A,G,rs62033400,G,5.98919318612742 7 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Proinsulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 6,43850987,T,C,rs9472141,C,4.12812379000962 3 | 7,15019701,T,C,rs12540947,T,1.24419174028044 4 | 7,15025271,T,G,rs4721401,G,1.26922197503576 5 | 7,56156535,G,T,rs34224159,G,1.00495716026767 6 | 11,120779344,T,C,rs2846098,C,0.946849140526209 7 | 12,85437387,G,A,rs1533223,A,0.923071432638415 8 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Total_GRS.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,158670244,A,G,rs857682,A,0.0626 3 | 2,159022630,C,T,rs60436412,T,0.089 4 | 3,123346931,A,G,rs11708067,A,0.1178 5 | 3,185816694,G,A,rs9859406,A,0.1152 6 | 3,190741327,A,G,rs6792341,G,0.0877 7 | 3,197541732,C,T,rs9325395,T,0.0678 8 | 5,31436933,C,T,rs493760,C,0.33 9 | 5,56511543,C,T,rs464605,T,0.0774 10 | 6,20686342,C,A,rs7766070,A,0.0794 11 | 6,39048860,C,T,rs10305420,C,0.1423 12 | 6,42910968,C,T,rs112504542,T,0.0665 13 | 6,43850987,T,C,rs9472141,C,0.0629 14 | 7,15019701,T,C,rs12540947,T,0.1256 15 | 7,15025271,T,G,rs4721401,G,0.1012 16 | 
7,157233358,A,G,rs1182443,G,0.0645 17 | 7,28140937,T,C,rs864745,T,0.0825 18 | 7,44145489,G,A,rs2908274,A,0.089 19 | 7,44184122,G,A,rs730497,A,0.0918 20 | 7,50819477,T,C,rs7781440,T,0.0861 21 | 7,53139155,A,C,rs10259582,C,0.0756 22 | 7,56156535,G,T,rs34224159,G,0.1109 23 | 7,6694510,T,G,rs3801034,G,0.078 24 | 8,13987532,T,G,rs6991416,G,0.0697 25 | 8,41651740,G,A,rs12549902,A,0.1138 26 | 10,112992744,T,C,rs17747324,C,0.2021 27 | 10,112998590,C,T,rs7903146,T,0.226 28 | 10,113014674,A,C,rs36090025,C,0.109 29 | 11,120779344,T,C,rs2846098,C,0.3094 30 | 11,2175119,A,G,rs4930044,A,0.0709 31 | 11,2190447,G,A,rs7396447,G,0.097 32 | 11,2670270,G,A,rs231361,A,0.0798 33 | 11,4671034,G,T,rs10836447,G,0.0782 34 | 12,41574480,G,A,rs10880103,G,0.0772 35 | 12,55216782,G,T,rs12314766,G,0.1298 36 | 12,55662031,C,T,rs11614818,C,0.087 37 | 12,55786980,G,A,rs1681087,A,0.0657 38 | 12,57574955,G,A,rs11172254,G,0.0974 39 | 12,65788675,G,A,rs971779,A,0.1251 40 | 12,75039749,A,G,rs10879874,A,0.1001 41 | 12,80592093,G,T,rs1528287,T,0.4939 42 | 12,80922459,A,G,rs7306751,G,0.0647 43 | 12,85437387,G,A,rs1533223,A,0.066 44 | 13,31135669,A,C,rs9538367,C,0.3072 45 | 13,68824414,T,C,rs60108948,T,0.0867 46 | 16,53777876,A,G,rs62033400,G,0.1506 47 | 17,30816009,C,T,rs112944916,T,0.0999 48 | 17,77902588,A,G,rs9899517,A,0.0652 49 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AFR/liftover_map.txt: -------------------------------------------------------------------------------- 1 | VAR_ID_hg19,rsID,VAR_ID_hg38,nearest_gene 2 | 7_53206848_A_C,rs10259582,7_53139155_A_C,POM121L12 3 | 6_39016636_C_T,rs10305420,6_39048860_C_T,GLP1R 4 | 12_75455390_C_T,rs10785171,12_75061610_C_T,KCNC2 5 | 1_229024237_G_A,rs10799493,1_228888490_G_A,RHOU 6 | 11_4692264_G_T,rs10836447,11_4671034_G_T,OR51E2 7 | 12_75433529_A_G,rs10879874,12_75039749_A_G,KCNC2 8 | 12_41968282_G_A,rs10880103,12_41574480_G_A,PDZRN4 9 | 
11_12352989_C_T,rs11022305,11_12331442_C_T,11:12352989 10 | 12_57968738_G_A,rs11172254,12_57574955_G_A,KIF5A 11 | 12_64151880_T_C,rs111784736,12_63758100_T_C,RXYLT1 12 | 6_42878706_C_T,rs112504542,6_42910968_C_T,PTCRA 13 | 17_29143027_C_T,rs112944916,17_30816009_C_T,CRLF3 14 | 21_38438951_C_T,rs116043344,21_37066651_C_T,PIGP 15 | 12_56055815_C_T,rs11614818,12_55662031_C_T,METTL7B 16 | 3_123065778_A_G,rs11708067,3_123346931_A_G,ADCY5 17 | 7_157026052_A_G,rs1182443,7_157233358_A_G,UBE3C 18 | 12_55610566_G_T,rs12314766,12_55216782_G_T,OR10A7 19 | 12_61844049_T_C,rs12370484,12_61450268_T_C,TAFA2 20 | 3_76570616_G_A,rs12497354,3_76521465_G_A,ROBO2 21 | 7_15059326_T_C,rs12540947,7_15019701_T_C,DGKB 22 | 8_41509259_G_A,rs12549902,8_41651740_G_A,NKX6-3 23 | 6_22207900_T_C,rs13193644,6_22207671_T_C,CASC15 24 | 12_80985872_G_T,rs1528287,12_80592093_G_T,PTPRQ 25 | 12_85831165_G_A,rs1533223,12_85437387_G_A,ALX1 26 | 12_56180764_G_A,rs1681087,12_55786980_G_A,DNAJC14 27 | 9_135474131_G_A,rs17149330,9_132598744_G_A,DDX31 28 | 10_114752503_T_C,rs17747324,10_112992744_T_C,TCF7L2 29 | 5_60930584_C_T,rs1910026,5_61634757_C_T,C5orf64 30 | 12_128224722_A_G,rs191362921,12_127740177_A_G,LINC02393 31 | 12_82080177_A_G,rs1922403,12_81686398_A_G,PPFIA2 32 | 16_52901915_G_A,rs2052267,16_52868003_G_A,CHD9 33 | 13_97758894_C_T,rs2055423,13_97106640_C_T,MBNL2 34 | 13_106074896_T_C,rs2147479,13_105422547_T_C,DAOA-AS1 35 | 6_9065677_C_T,rs2224791,6_9065444_C_T,LOC100506207 36 | 11_2691500_G_A,rs231361,11_2670270_G_A,KCNQ1 37 | 21_40295064_T_C,rs2410066,21_38923140_T_C,ETS2 38 | 11_2641129_A_G,rs2412058,11_2619899_A_G,KCNQ1 39 | 11_120650053_T_C,rs2846098,11_120779344_T_C,GRIK4 40 | 7_44185088_G_A,rs2908274,7_44145489_G_A,GCK 41 | 7_56224228_G_T,rs34224159,7_56156535_G_T,PSPH 42 | 3_193938796_G_A,rs35402322,3_194221007_G_A,LINC00887 43 | 10_114774433_A_C,rs36090025,10_113014674_A_C,TCF7L2 44 | 7_6734141_T_G,rs3801034,7_6694510_T_G,ZNF12 45 | 12_70850622_T_C,rs380835,12_70456842_T_C,KCNMB4 46 | 
12_88335960_G_A,rs4146751,12_87942183_G_A,C12orf50 47 | 5_55807370_C_T,rs464605,5_56511543_C_T,ANKRD55 48 | 7_15064896_T_G,rs4721401,7_15025271_T_G,DGKB 49 | 12_54339052_C_A,rs4759058,12_53945268_C_A,HOXC13 50 | 12_52220968_C_T,rs4762019,12_51827184_C_T,FIGNL2 51 | 3_127710138_C_T,rs4857925,3_127991295_C_T,KBTBD12 52 | 11_2196349_A_G,rs4930044,11_2175119_A_G,MIR4686 53 | 5_31437040_C_T,rs493760,5_31436933_C_T,DROSHA 54 | 2_16663741_C_T,rs5023163,2_16482473_C_T,CYRIA 55 | 7_67444857_T_C,rs538285390,7_67979870_T_C,STAG3L4 56 | 12_77581589_C_A,rs55695739,12_77187809_C_A,E2F7 57 | 12_54527168_C_T,rs55730794,12_54133384_C_T,LINC02381 58 | 8_11505266_A_C,rs56965246,8_11647757_A_C,GATA4 59 | 13_69398546_T_C,rs60108948,13_68824414_T_C,LINC00550 60 | 2_159879142_C_T,rs60436412,2_159022630_C_T,TANC1 61 | 15_57042759_T_G,rs62022933,15_56750561_T_G,ZNF280D 62 | 16_53811788_A_G,rs62033400,16_53777876_A_G,FTO 63 | 3_190459116_A_G,rs6792341,3_190741327_A_G,IL1RAP 64 | 13_91908725_C_T,rs68126334,13_91256471_C_T,LINC00379 65 | 6_15275305_A_G,rs6921502,6_15275074_A_G,JARID2 66 | 7_40618995_A_G,rs6948900,7_40579396_A_G,SUGCT 67 | 8_13845041_T_G,rs6991416,8_13987532_T_G,SGCZ 68 | 7_44223721_G_A,rs730497,7_44184122_G_A,GCK 69 | 12_81316238_A_G,rs7306751,12_80922459_A_G,LIN7A 70 | 12_38710523_G_A,rs7315028,12_38316721_G_A,ALG10B 71 | 11_2211677_G_A,rs7396447,11_2190447_G_A,MIR4686 72 | 12_66242327_A_G,rs74234985,12_65848547_A_G,HMGA2 73 | 3_187644695_C_T,rs74802573,3_187926907_C_T,BCL6 74 | 11_2187477_C_T,rs7483805,11_2166247_C_T,TH 75 | 5_55780388_C_T,rs7730776,5_56484561_C_T,ANKRD55 76 | 6_20686573_C_A,rs7766070,6_20686342_C_A,CDKAL1 77 | 7_50887174_T_C,rs7781440,7_50819477_T_C,GRB10 78 | 7_113340067_A_G,rs7802710,7_113700012_A_G,PPP1R3A 79 | 10_114758349_C_T,rs7903146,10_112998590_C_T,TCF7L2 80 | 1_51209148_T_C,rs79090772,1_50743476_T_C,FAF1 81 | 11_121266882_A_G,rs7938784,11_121396173_A_G,SORL1 82 | 12_66431690_G_A,rs7965495,12_66037910_G_A,HMGA2 83 | 
12_33653477_T_C,rs7972688,12_33500542_T_C,SYT10 84 | 1_158640034_A_G,rs857682,1_158670244_A_G,SPTA1 85 | 7_28180556_T_C,rs864745,7_28140937_T_C,JAZF1 86 | 3_197268603_C_T,rs9325395,3_197541732_C_T,BDH1 87 | 6_165014556_C_A,rs9347878,6_164593523_C_A,C6orf118 88 | 6_8874553_T_C,rs9393070,6_8874320_T_C,LOC100506207 89 | 6_43818724_T_C,rs9472141,6_43850987_T_C,LINC01512 90 | 13_31709806_A_C,rs9538367,13_31135669_A_C,HSPH1 91 | 7_43320431_A_G,rs9648079,7_43280832_A_G,HECW1 92 | 12_66182455_G_A,rs971779,12_65788675_G_A,RPSAP52 93 | 3_193811168_A_G,rs9823161,3_194093379_A_G,HES1 94 | 3_185534482_G_A,rs9859406,3_185816694_G_A,IGF2BP2 95 | 17_75898670_A_G,rs9899517,17_77902588_A_G,LINC01973 96 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/ASAT Pos.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,185796605,T,C,rs6444081,C,1.85709728620563 3 | 4,156902837,A,G,rs36029656,G,1.6925373099693 4 | 5,56558326,C,T,rs6867983,T,5.02054454459969 5 | 7,28156603,A,G,rs849134,A,1.36149359686958 6 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Beta Cell 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,185796605,T,C,rs6444081,C,1.49929836175987 3 | 7,28156603,A,G,rs849134,A,1.71723423601776 4 | 10,112990398,C,T,rs11196182,C,1.81731405933532 5 | 10,112998590,C,T,rs7903146,T,5.4918602156638 6 | 10,12211598,G,A,rs12221133,A,1.32097143637614 7 | 11,2828300,A,C,rs2283228,A,1.12295327353213 8 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Lipodystrophy.csv: -------------------------------------------------------------------------------- 1 | 
Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 7,130765628,G,A,rs172306,A,4.53455920944434 3 | 10,121164303,C,T,rs7087606,C,1.77462714072911 4 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Obesity.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 10,113315384,C,T,rs4918811,C,0.962879223117557 3 | 10,79182863,C,T,rs810517,C,0.95425477256916 4 | 13,97061795,T,C,rs7333342,C,0.974049344424921 5 | 16,53794154,C,T,rs17817964,T,6.34437002752861 6 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Total_GRS.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,185796605,T,C,rs6444081,C,0.1504 3 | 4,130073468,G,A,rs55653883,G,0.1658 4 | 4,152146894,A,C,rs35752631,A,0.1404 5 | 4,156902837,A,G,rs36029656,G,0.1277 6 | 5,56558326,C,T,rs6867983,T,0.1457 7 | 7,130765628,G,A,rs172306,A,0.1241 8 | 7,28156603,A,G,rs849134,A,0.0894 9 | 7,94409622,C,T,rs42518,C,0.0817 10 | 9,28773206,A,C,rs824250,C,0.0709 11 | 10,112990398,C,T,rs11196182,C,0.252 12 | 10,112998590,C,T,rs7903146,T,0.325 13 | 10,113315384,C,T,rs4918811,C,0.1009 14 | 10,121164303,C,T,rs7087606,C,0.1708 15 | 10,12211598,G,A,rs12221133,A,0.1247 16 | 10,79182863,C,T,rs810517,C,0.1058 17 | 11,2821317,G,A,rs12294675,A,0.1879 18 | 11,2828300,A,C,rs2283228,A,0.2936 19 | 12,4226777,T,C,rs67904513,T,0.1979 20 | 13,111640797,G,T,rs113507970,T,1.3373 21 | 13,97061795,T,C,rs7333342,C,0.0878 22 | 16,53794154,C,T,rs17817964,T,0.0945 23 | 17,7042968,T,C,rs13342692,C,0.1737 24 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Unknown.csv: 
-------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 4,130073468,G,A,rs55653883,G,1.22345648743075 3 | 4,152146894,A,C,rs35752631,A,1.55505592177207 4 | 7,94409622,C,T,rs42518,C,1.13529799606247 5 | 9,28773206,A,C,rs824250,C,1.07511172465863 6 | 11,2821317,G,A,rs12294675,A,1.53046203465743 7 | 11,2828300,A,C,rs2283228,A,1.12914340155893 8 | 12,4226777,T,C,rs67904513,T,1.17350682402633 9 | 13,111640797,G,T,rs113507970,T,1.13530179286828 10 | 17,7042968,T,C,rs13342692,C,1.05499505510544 11 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/AMR/liftover_map.txt: -------------------------------------------------------------------------------- 1 | VAR_ID_hg19,rsID,VAR_ID_hg38,nearest_gene 2 | 4_105844273_T_G,rs10005603,4_104923116_T_G,TET2 3 | 9_11606348_T_C,rs10511567,9_11606348_T_C,PTPRD 4 | 10_114750157_C_T,rs11196182,10_112990398_C_T,TCF7L2 5 | 10_119713515_T_C,rs11198201,10_117954004_T_C,RAB11FIP2 6 | 13_112293144_G_T,rs113507970,13_111640797_G_T,TEX29 7 | 3_103220188_G_A,rs113829802,3_103501344_G_A,MIR548A3 8 | 14_68010986_C_T,rs118148639,14_67544269_C_T,PLEKHH1 9 | 4_1291058_C_T,rs11931251,4_1297270_C_T,MAEA 10 | 10_12253597_G_A,rs12221133,10_12211598_G_A,CDC123 11 | 11_2842547_G_A,rs12294675,11_2821317_G_A,KCNQ1 12 | 8_106035307_A_C,rs12548220,8_105023079_A_C,ZFPM2 13 | 16_78332174_T_G,rs12596093,16_78298277_T_G,WWOX 14 | 11_2800669_C_T,rs12794000,11_2779439_C_T,KCNQ1 15 | 9_13972415_G_A,rs13283809,9_13972416_G_A,LINC00583 16 | 17_6946287_T_C,rs13342692,17_7042968_T_C,SLC16A11 17 | 2_59182401_G_A,rs17049846,2_58955266_G_A,LINC01122 18 | 7_130450387_G_A,rs172306,7_130765628_G_A,KLF14 19 | 16_53828066_C_T,rs17817964,16_53794154_C_T,FTO 20 | 10_119418104_G_A,rs2184898,10_117658593_G_A,EMX2 21 | 11_2849530_A_C,rs2283228,11_2828300_A_C,KCNQ1 22 | 10_122347276_T_C,rs2291314,10_120587764_T_C,PLPP4 23 | 
11_2901222_C_T,rs2411873,11_2879992_C_T,CDKN1C 24 | 5_79087302_G_A,rs259097,5_79791479_G_A,CMYA5 25 | 6_88418077_C_T,rs2787932,6_87708359_C_T,AKIRIN2 26 | 9_94521525_G_A,rs34134725,9_91759243_G_A,ROR2 27 | 4_153068046_A_C,rs35752631,4_152146894_A_C,FBXW7 28 | 4_157823989_A_G,rs36029656,4_156902837_A_G,PDGFC 29 | 7_94038934_C_T,rs42518,7_94409622_C_T,COL1A2 30 | 13_80628580_G_A,rs4885691,13_80054445_G_A,SPRY2 31 | 10_115075143_C_T,rs4918811,10_113315384_C_T,TCF7L2 32 | 11_2197286_A_G,rs4929965,11_2176056_A_G,MIR4686 33 | 11_2878935_C_T,rs4930016,11_2857705_C_T,KCNQ1 34 | 11_71318195_G_A,rs4945066,11_71607149_G_A,KRTAP5-11 35 | 4_130994623_G_A,rs55653883,4_130073468_G_A,C4orf33 36 | 10_71830776_C_T,rs55919533,10_70071020_C_T,MACROH2A2 37 | 22_32205632_A_C,rs5998135,22_31809646_A_C,DEPDC5 38 | 4_55048912_T_C,rs61650959,4_54182745_T_C,PDGFRA 39 | 1_3459867_A_G,rs61762173,1_3543303_A_G,MEGF6 40 | 1_82903595_G_A,rs61765084,1_82437912_G_A,ADGRL2 41 | 22_44998325_G_A,rs62228440,22_44602445_G_A,LINC00229 42 | 6_97177101_C_T,rs62412972,6_96729225_C_T,GPR63 43 | 3_185514393_T_C,rs6444081,3_185796605_T_C,IGF2BP2 44 | 1_82027976_G_A,rs6674509,1_81562291_G_A,ADGRL2 45 | 12_4335943_T_C,rs67904513,12_4226777_T_C,CCND2 46 | 5_55854153_C_T,rs6867983,5_56558326_C_T,MAP3K1 47 | 10_122923817_C_T,rs7087606,10_121164303_C_T,WDR11 48 | 18_24153466_C_A,rs7243826,18_26573502_C_A,KCTD1 49 | 4_707600_G_A,rs73221106,4_713811_G_A,PCGF3 50 | 13_97714049_T_C,rs7333342,13_97061795_T_C,OXGR1 51 | 11_2063846_C_T,rs73398031,11_2042616_C_T,H19 52 | 10_116080773_G_A,rs758212,10_114321014_G_A,AFAP1L2 53 | 4_117300599_A_G,rs7675851,4_116379443_A_G,MIR1973 54 | 4_154911022_A_G,rs7688570,4_153989870_A_G,SFRP2 55 | 7_843157_G_A,rs7801525,7_803520_G_A,SUN1 56 | 10_123932642_T_G,rs7897826,10_122173127_T_G,TACC2 57 | 10_114758349_C_T,rs7903146,10_112998590_C_T,TCF7L2 58 | 11_5751135_C_T,rs7935270,11_5729905_C_T,TRIM5 59 | 15_45072082_G_T,rs8023560,15_44779884_G_T,TRIM69 60 | 
11_107253239_C_A,rs80288792,11_107382513_C_A,CWF19L2 61 | 10_80942620_C_T,rs810517,10_79182863_C_T,ZMIZ1 62 | 9_28773204_A_C,rs824250,9_28773206_A_C,LINGO2 63 | 7_28196222_A_G,rs849134,7_28156603_A_G,JAZF1 64 | 6_42128717_G_A,rs9471790,6_42160979_G_A,GUCA1A 65 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/ALP Neg.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 9,133274084,T,C,rs529565,C,8.36178385315219 3 | 12,120994499,T,G,rs1169302,T,1.4493184052045 4 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Beta Cell 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,45796261,A,G,rs7551025,A,1.05897449154299 3 | 2,148810827,C,T,rs4499362,C,1.20214580073032 4 | 2,233388510,T,C,rs838719,C,1.65890007058726 5 | 2,58694642,G,A,rs13417036,A,1.1884555300111601 6 | 3,152664563,A,C,rs1850421,A,1.0407632959667 7 | 3,186943483,G,T,rs12494144,G,2.21150069431789 8 | 6,127095785,A,G,rs2800733,A,1.04440549728214 9 | 6,20682933,G,A,rs9350271,A,2.2011593661603 10 | 6,20743721,T,C,rs9358363,C,1.46659086596583 11 | 6,20926938,T,G,rs9465936,T,1.21580166598046 12 | 7,127580864,T,C,rs989100,C,0.947532312659824 13 | 7,157231816,A,G,rs1182444,G,1.46035575247526 14 | 7,44226585,C,T,rs35452727,T,1.25959114181733 15 | 8,117172544,C,T,rs13266634,C,1.24175705038725 16 | 8,36974792,G,A,rs56687477,A,1.11760420250299 17 | 8,38485494,T,C,rs328301,T,1.21832381259362 18 | 8,41648861,A,G,rs12549294,G,2.92003539595794 19 | 8,41661944,A,G,rs515071,G,1.13571219509328 20 | 9,136350937,G,A,rs78270318,A,1.13082004056711 21 | 9,22132879,T,C,rs10965248,T,1.4445767083959 22 | 9,81359621,A,G,rs10125947,A,1.15780696586224 23 | 
10,112994329,T,C,rs7901695,C,1.14634451292131 24 | 10,12265895,C,T,rs11257655,T,1.49618088251067 25 | 10,92782853,G,A,rs7896332,G,1.23305976294232 26 | 10,97259387,C,T,rs3740522,C,1.18676481411811 27 | 11,17391413,A,G,rs7124355,A,1.13791609454109 28 | 11,35417556,G,T,rs1923293,G,1.1133954528047 29 | 11,92964815,C,T,rs10466351,T,2.69364990189508 30 | 12,108236003,G,A,rs1426371,G,1.60776816602254 31 | 13,32988367,T,C,rs7997912,C,1.67924628477302 32 | 13,91295793,G,A,rs9523295,G,0.920252193799057 33 | 15,62104190,A,G,rs7172432,A,1.90805714438906 34 | 15,89885662,A,C,rs10852123,A,1.33017201375341 35 | 15,90979023,G,A,rs8026714,A,1.02450165378977 36 | 17,67645535,C,T,rs2706710,T,0.977179999261815 37 | 19,45654658,G,A,rs7507912,G,1.01121814162339 38 | 19,45856295,A,G,rs12609371,G,0.953844507475111 39 | 20,44366172,T,C,rs12625671,C,0.963386102400816 40 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Beta Cell 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,27508073,T,C,rs1260326,C,0.938834406752605 3 | 2,44961214,C,T,rs895636,T,1.98631732420755 4 | 2,60358671,T,C,rs243019,C,0.954279346393862 5 | 6,20682933,G,A,rs9350271,A,1.70867475612877 6 | 7,14858657,C,T,rs17168486,T,1.7427604052071 7 | 7,15024298,T,G,rs10950550,T,1.99450729695613 8 | 8,117172544,C,T,rs13266634,C,3.64925590051388 9 | 9,22132879,T,C,rs10965248,T,1.32250264059456 10 | 9,22137686,T,G,rs7018475,G,0.934217933381001 11 | 9,4292083,G,A,rs10758593,A,1.32408451085596 12 | 10,112994329,T,C,rs7901695,C,2.66742274538238 13 | 10,12265895,C,T,rs11257655,T,0.923058952262586 14 | 11,2836003,G,A,rs60808706,G,0.938558688818934 15 | 11,92964815,C,T,rs10466351,T,3.41810346728567 16 | 13,32988367,T,C,rs7997912,C,1.02831632190351 17 | 15,62104190,A,G,rs7172432,A,1.93563411356787 18 | 15,77454848,A,G,rs7178572,G,1.00567163395728 19 | 
-------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Lipodystrophy 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,39478577,T,C,rs61779313,C,1.8505544202947 3 | 1,50728312,A,G,rs12090545,A,1.70980451666358 4 | 2,164745671,A,G,rs3769891,A,2.82807773456512 5 | 3,185777532,G,A,rs13092876,A,1.55038216609814 6 | 6,126643364,A,G,rs4273712,G,1.34061525226534 7 | 6,34246893,A,G,rs4711389,A,1.43776952084951 8 | 7,93477781,A,C,rs2074120,A,0.957966020800177 9 | 10,121207656,G,A,rs10788149,G,1.26026948350704 10 | 10,92782853,G,A,rs7896332,G,0.964758195142507 11 | 12,71055741,G,T,rs7313668,T,1.02078514149782 12 | 16,81501185,T,C,rs2925979,T,3.63678616571073 13 | 18,63178651,T,C,rs12454712,T,1.92434210253456 14 | 19,33396499,T,C,rs7250869,T,1.12505423482159 15 | 19,45654658,G,A,rs7507912,G,1.16217682759086 16 | 19,7293108,T,C,rs8101064,T,1.39614440545665 17 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Lipodystrophy 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,164745671,A,G,rs3769891,A,0.972242711885497 3 | 3,187980545,T,C,rs13086331,T,1.13875980113942 4 | 3,23055584,C,T,rs861983,T,1.2694221182209 5 | 5,134528909,G,A,rs329122,A,1.37438593786571 6 | 5,56510924,A,G,rs459193,G,4.20300059160372 7 | 6,34246893,A,G,rs4711389,A,0.923917693997165 8 | 8,131867530,T,C,rs10505581,C,0.932698668823286 9 | 9,1032567,G,A,rs1016565,A,1.63852517573449 10 | 10,121169979,T,C,rs10886863,C,1.48873464298509 11 | 12,117963051,G,A,rs111246699,A,1.02260481232994 12 | 12,120994499,T,G,rs1169302,T,1.93277518295661 13 | 17,31315412,T,C,rs7502556,T,1.49419289027242 14 | 18,63178651,T,C,rs12454712,T,1.42356374304542 15 | 
19,33396499,T,C,rs7250869,T,2.82777959216272 16 | 19,7293108,T,C,rs8101064,T,0.983107797973924 17 | 20,58890516,G,A,rs6123837,A,0.929247394504364 18 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Liver Lipid.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,27508073,T,C,rs1260326,C,8.70714920205364 3 | 6,7231610,G,A,rs9379084,G,0.954621381002466 4 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Obesity.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,177909798,G,A,rs532504,A,2.38687601806275 3 | 1,39478577,T,C,rs61779313,C,1.20426863670431 4 | 2,630662,A,C,rs6731688,C,2.41359285170887 5 | 4,45184122,G,A,rs10938398,A,2.3625225272987 6 | 5,75279159,A,G,rs2126736,A,1.26239274968625 7 | 5,96508059,C,T,rs261982,T,1.07415276743519 8 | 6,50819746,G,T,rs62405419,T,1.71330188212786 9 | 7,70231919,A,G,rs12698877,G,0.97916886732597 10 | 11,27678578,C,T,rs7103411,T,2.30465226379423 11 | 16,20243775,C,T,rs4238585,T,1.19815302452613 12 | 16,3597097,G,A,rs2240885,A,0.917582164874537 13 | 16,53767042,T,C,rs1421085,C,5.36461255586884 14 | 18,60185354,T,C,rs476828,C,3.02194014380283 15 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Proinsulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 8,41661944,A,G,rs515071,G,1.10184318280305 3 | 11,72752390,G,A,rs7109575,G,6.12083008155497 4 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Total_GRS.csv: 
-------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,177909798,G,A,rs532504,A,0.0547999999999974 3 | 1,39478577,T,C,rs61779313,C,0.0603000000000033 4 | 1,45796261,A,G,rs7551025,A,0.0490999999999999 5 | 1,50728312,A,G,rs12090545,A,0.0883000000000005 6 | 2,148810827,C,T,rs4499362,C,0.0435 7 | 2,164745671,A,G,rs3769891,A,0.0549999999999998 8 | 2,233388510,T,C,rs838719,C,0.0452999999999993 9 | 2,27508073,T,C,rs1260326,C,0.0632000000000013 10 | 2,44961214,C,T,rs895636,T,0.0523000000000021 11 | 2,58694642,G,A,rs13417036,A,0.0496999999999998 12 | 2,60358671,T,C,rs243019,C,0.0553000000000006 13 | 2,630662,A,C,rs6731688,C,0.0982000000000037 14 | 3,122246352,T,G,rs9859381,G,0.0391999999999962 15 | 3,152664563,A,C,rs1850421,A,0.0442000000000003 16 | 3,170953520,T,G,rs7642480,T,0.0584 17 | 3,185777532,G,A,rs13092876,A,0.125699999999997 18 | 3,186943483,G,T,rs12494144,G,0.0442999999999999 19 | 3,187980545,T,C,rs13086331,T,0.0508999999999999 20 | 3,23055584,C,T,rs861983,T,0.136900000000004 21 | 4,1260747,C,T,rs7656416,C,0.1002 22 | 4,45184122,G,A,rs10938398,A,0.0457000000000029 23 | 5,134528909,G,A,rs329122,A,0.0386000000000032 24 | 5,56510924,A,G,rs459193,G,0.0736999999999982 25 | 5,75279159,A,G,rs2126736,A,0.0376999999999998 26 | 5,96508059,C,T,rs261982,T,0.039699999999998 27 | 6,126643364,A,G,rs4273712,G,0.0468999999999971 28 | 6,127095785,A,G,rs2800733,A,0.0640000000000005 29 | 6,20682933,G,A,rs9350271,A,0.193300000000001 30 | 6,20743721,T,C,rs9358363,C,0.0600000000000004 31 | 6,20926938,T,G,rs9465936,T,0.0540999999999997 32 | 6,34246893,A,G,rs4711389,A,0.0926000000000004 33 | 6,39078868,A,C,rs742762,A,0.0750999999999999 34 | 6,50819746,G,T,rs62405419,T,0.0441999999999981 35 | 6,7231610,G,A,rs9379084,G,0.0717999999999996 36 | 7,127580864,T,C,rs989100,C,0.0662999999999975 37 | 7,14858657,C,T,rs17168486,T,0.0643000000000005 38 | 7,15024298,T,G,rs10950550,T,0.0650000000000005 39 | 
7,157231816,A,G,rs1182444,G,0.047300000000003 40 | 7,44226585,C,T,rs35452727,T,0.0559999999999987 41 | 7,70231919,A,G,rs12698877,G,0.0672999999999987 42 | 7,93477781,A,C,rs2074120,A,0.0409000000000003 43 | 8,117172544,C,T,rs13266634,C,0.116 44 | 8,131867530,T,C,rs10505581,C,0.0416000000000018 45 | 8,36974792,G,A,rs56687477,A,0.0461000000000004 46 | 8,38485494,T,C,rs328301,T,0.0395000000000002 47 | 8,41648861,A,G,rs12549294,G,0.0706000000000022 48 | 8,41661944,A,G,rs515071,G,0.0653999999999977 49 | 8,74290985,A,G,rs185063984,G,0.1324 50 | 9,1032567,G,A,rs1016565,A,0.0375000000000047 51 | 9,133274084,T,C,rs529565,C,0.0428000000000014 52 | 9,136350937,G,A,rs78270318,A,0.067499999999998 53 | 9,22132879,T,C,rs10965248,T,0.1829 54 | 9,22137686,T,G,rs7018475,G,0.0593999999999996 55 | 9,4292083,G,A,rs10758593,A,0.0633000000000042 56 | 9,81359621,A,G,rs10125947,A,0.0432999999999998 57 | 10,112994329,T,C,rs7901695,C,0.275400000000002 58 | 10,121169979,T,C,rs10886863,C,0.0595000000000034 59 | 10,121207656,G,A,rs10788149,G,0.0521999999999996 60 | 10,12265895,C,T,rs11257655,T,0.113999999999997 61 | 10,92782853,G,A,rs7896332,G,0.0570000000000003 62 | 10,97259387,C,T,rs3740522,C,0.0459999999999996 63 | 11,17391413,A,G,rs7124355,A,0.0741999999999996 64 | 11,27678578,C,T,rs7103411,T,0.0413000000000032 65 | 11,2836003,G,A,rs60808706,G,0.2394 66 | 11,35417556,G,T,rs1923293,G,0.0548999999999997 67 | 11,72752390,G,A,rs7109575,G,0.141 68 | 11,92964815,C,T,rs10466351,T,0.037100000000001 69 | 12,108236003,G,A,rs1426371,G,0.0479000000000003 70 | 12,117963051,G,A,rs111246699,A,0.0613999999999981 71 | 12,120994499,T,G,rs1169302,T,0.0500999999999995 72 | 12,71055741,G,T,rs7313668,T,0.0446999999999987 73 | 13,22015744,A,G,rs9316706,A,0.0414 74 | 13,32988367,T,C,rs7997912,C,0.078799999999996 75 | 13,91295793,G,A,rs9523295,G,0.0774999999999998 76 | 15,62104190,A,G,rs7172432,A,0.0792000000000003 77 | 15,75444946,T,C,rs7171507,T,0.0457000000000001 78 | 
15,77454848,A,G,rs7178572,G,0.0717000000000007 79 | 15,89885662,A,C,rs10852123,A,0.0601 80 | 15,90979023,G,A,rs8026714,A,0.0657000000000009 81 | 16,20243775,C,T,rs4238585,T,0.0470000000000041 82 | 16,3597097,G,A,rs2240885,A,0.0421999999999966 83 | 16,53767042,T,C,rs1421085,C,0.130299999999999 84 | 16,81501185,T,C,rs2925979,T,0.0423000000000003 85 | 17,31315412,T,C,rs7502556,T,0.0442999999999999 86 | 17,37741595,A,C,rs8064454,A,0.1202 87 | 17,67645535,C,T,rs2706710,T,0.0711000000000007 88 | 18,60185354,T,C,rs476828,C,0.0840000000000036 89 | 18,63178651,T,C,rs12454712,T,0.0543000000000005 90 | 19,33396499,T,C,rs7250869,T,0.0557999999999999 91 | 19,45654658,G,A,rs7507912,G,0.0932000000000006 92 | 19,45856295,A,G,rs12609371,G,0.0394000000000014 93 | 19,7293108,T,C,rs8101064,T,0.0634 94 | 20,44366172,T,C,rs12625671,C,0.0655000000000018 95 | 20,51538847,T,C,rs6021276,T,0.0426000000000005 96 | 20,58890516,G,A,rs6123837,A,0.0387000000000039 97 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Unknown.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,233388510,T,C,rs838719,C,1.36292318377145 3 | 2,44961214,C,T,rs895636,T,1.36471787570867 4 | 3,122246352,T,G,rs9859381,G,0.925934188534673 5 | 3,170953520,T,G,rs7642480,T,1.00385231931068 6 | 4,1260747,C,T,rs7656416,C,1.33469344718539 7 | 6,20682933,G,A,rs9350271,A,1.20245846308046 8 | 6,39078868,A,C,rs742762,A,1.09343062879404 9 | 7,15024298,T,G,rs10950550,T,1.10075181627983 10 | 8,74290985,A,G,rs185063984,G,1.03203544993695 11 | 9,22132879,T,C,rs10965248,T,1.09197796768426 12 | 11,17391413,A,G,rs7124355,A,0.911107861026035 13 | 11,2836003,G,A,rs60808706,G,2.00966819176367 14 | 13,22015744,A,G,rs9316706,A,0.903453718276932 15 | 15,75444946,T,C,rs7171507,T,1.3029635126487 16 | 15,77454848,A,G,rs7178572,G,0.91899776059859 17 | 
17,37741595,A,C,rs8064454,A,1.12401219294317 18 | 20,51538847,T,C,rs6021276,T,0.976659512991192 19 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EAS/liftover_map.txt: -------------------------------------------------------------------------------- 1 | VAR_ID_hg19,rsID,VAR_ID_hg38,nearest_gene 2 | 4_153520279_G_A,rs10011838,4_152599127_G_A,TMEM154 3 | 9_83974536_A_G,rs10125947,9_81359621_A_G,TLE1 4 | 9_1032567_G_A,rs1016565,9_1032567_G_A,DMRT2 5 | 11_92697981_C_T,rs10466351,11_92964815_C_T,MTNR1B 6 | 8_132879777_T_C,rs10505581,8_131867530_T_C,EFR3A 7 | 13_26781528_G_A,rs10507349,13_26207391_G_A,RNF6 8 | 9_4292083_G_A,rs10758593,9_4292083_G_A,GLIS3 9 | 10_122967170_G_A,rs10788149,10_121207656_G_A,FGFR2 10 | 10_124149917_T_C,rs10788284,10_122390401_T_C,PLEKHA1 11 | 15_90428894_A_C,rs10852123,15_89885662_A_C,AP3S2 12 | 12_97850215_C_A,rs10860209,12_97456437_C_A,RMST 13 | 10_114711755_C_T,rs10885396,10_112951996_C_T,TCF7L2 14 | 10_122929493_T_C,rs10886863,10_121169979_T_C,WDR11 15 | 4_45186139_G_A,rs10938398,4_45184122_G_A,GNPDA2 16 | 5_51787940_A_G,rs10940253,5_52492106_A_G,ITGA1 17 | 7_15063923_T_G,rs10950550,7_15024298_T_G,DGKB 18 | 9_22132878_T_C,rs10965248,9_22132879_T_C,CDKN2B-AS1 19 | 11_2203154_T_C,rs11043003,11_2181924_T_C,MIR4686 20 | 12_118400856_G_A,rs111246699,12_117963051_G_A,KSR2 21 | 10_94479070_C_T,rs111268405,10_92719313_C_T,HHEX 22 | 11_2292242_C_T,rs111826047,11_2271012_C_T,ASCL2 23 | 10_122834572_G_T,rs11199753,10_121075059_G_T,WDR11 24 | 10_12307894_C_T,rs11257655,10_12265895_C_T,CDC123 25 | 9_98278413_C_T,rs113154802,9_95516131_C_T,PTCH1 26 | 17_4024824_G_A,rs113547729,17_4121530_G_A,ZZEF1 27 | 14_77320585_C_T,rs11620646,14_76854242_C_T,LRRC74A 28 | 12_121432302_T_G,rs1169302,12_120994499_T_G,HNF1A 29 | 16_20323168_G_A,rs117267808,16_20311846_G_A,GP2 30 | 6_39283083_G_A,rs11753141,6_39315307_G_A,KCNK16 31 | 11_128389391_C_T,rs11819995,11_128519496_C_T,ETS1 32 | 
7_157024510_A_G,rs1182444,7_157231816_A_G,UBE3C 33 | 3_23258614_G_A,rs11926494,3_23217123_G_A,UBE2E2 34 | 1_51193984_A_G,rs12090545,1_50728312_A_G,FAF1 35 | 13_80707429_A_G,rs1215468,13_80133294_A_G,SPRY2 36 | 6_20586039_A_G,rs12213132,6_20585808_A_G,CDKAL1 37 | 13_51088809_G_A,rs123378,13_50514673_G_A,DLEU1 38 | 10_89684214_C_A,rs1236816,10_87924457_C_A,PTEN 39 | 1_214155398_C_A,rs12403994,1_213982055_C_A,PROX1 40 | 14_24878370_C_T,rs12437434,14_24409164_C_T,NYNRIN 41 | 17_36056192_T_G,rs12452659,17_37696186_T_G,HNF1B 42 | 18_60845884_T_C,rs12454712,18_63178651_T_C,BCL2 43 | 3_186661271_G_T,rs12494144,3_186943483_G_T,ST6GAL1 44 | 3_124921920_G_A,rs12497133,3_125203076_G_A,SLC12A8 45 | 8_41506380_A_G,rs12549294,8_41648861_A_G,NKX6-3 46 | 16_72022534_C_T,rs12600132,16_71988635_C_T,PKD1L3 47 | 2_27730940_T_C,rs1260326,2_27508073_T_C,GCKR 48 | 19_46359553_A_G,rs12609371,19_45856295_A_G,SYMPK 49 | 20_42994812_T_C,rs12625671,20_44366172_T_C,HNF4A 50 | 7_69696905_A_G,rs12698877,7_70231919_A_G,AUTS2 51 | 12_114124239_G_A,rs12816687,12_113686434_G_A,RBM19 52 | 3_187698333_T_C,rs13086331,3_187980545_T_C,LPP-AS2 53 | 3_185495320_G_A,rs13092876,3_185777532_G_A,IGF2BP2 54 | 8_118184783_C_T,rs13266634,8_117172544_C_T,SLC30A8 55 | 9_81917127_T_C,rs1328412,9_79302212_T_C,TLE4 56 | 2_58921777_G_A,rs13417036,2_58694642_G_A,LINC01122 57 | 22_50439430_A_G,rs137845,22_50001001_A_G,IL17REL 58 | 16_53800954_T_C,rs1421085,16_53767042_T_C,FTO 59 | 12_108629780_G_A,rs1426371,12_108236003_G_A,WSCD2 60 | 3_114960798_A_C,rs1459513,3_115241951_A_C,ZBTB20 61 | 10_63717113_C_T,rs146716733,10_61957354_C_T,ARID5B 62 | 11_2767262_G_A,rs149658560,11_2746032_G_A,KCNQ1 63 | 6_20455564_C_T,rs16883824,6_20455333_C_T,E2F3 64 | 10_64903204_G_A,rs16913026,10_63143444_G_A,NRBF2 65 | 15_28506833_G_A,rs16950949,15_28261687_G_A,HERC2 66 | 7_14898282_C_T,rs17168486,7_14858657_C_T,DGKB 67 | 10_71321279_A_G,rs177045,10_69561523_A_G,NEUROG3 68 | 8_118028794_T_G,rs17744945,8_117016555_T_G,SLC30A8 69 | 
1_22068326_A_G,rs1825307,1_21741833_A_G,USP48 70 | 3_152382352_A_C,rs1850421,3_152664563_A_C,P2RY1 71 | 8_75203220_A_G,rs185063984,8_74290985_A_G,JPH1 72 | 17_36051372_C_T,rs1859211,17_37691367_C_T,HNF1B 73 | 17_6953781_C_T,rs186568031,17_7050462_C_T,SLC16A11 74 | 8_37380139_T_C,rs1892609,8_37522621_T_C,ZNF703 75 | 11_35439103_G_T,rs1923293,11_35417556_G_T,SLC1A2 76 | 10_71273357_G_A,rs1955163,10_69513601_G_A,TSPAN15 77 | 5_52097183_A_G,rs2059202,5_52801349_A_G,PELO 78 | 7_93107093_A_C,rs2074120,7_93477781_A_C,CALCR 79 | 6_33550025_A_G,rs210148,6_33582248_A_G,GGNBP1 80 | 5_74574984_A_G,rs2126736,5_75279159_A_G,ANKRD31 81 | 11_2863820_A_C,rs2237898,11_2842590_A_C,KCNQ1 82 | 16_3647098_G_A,rs2240885,16_3597097_G_A,SLX4 83 | 1_64107893_G_A,rs2269245,1_63642222_G_A,PGM1 84 | 1_184020945_G_A,rs2274432,1_184051811_G_A,TSEN15 85 | 11_2569903_C_T,rs2283159,11_2548673_C_T,KCNQ1 86 | 11_2745107_C_T,rs231917,11_2723877_C_T,KCNQ1 87 | 6_137293227_T_C,rs2327777,6_136972090_T_C,NHEG1 88 | 2_60585806_T_C,rs243019,2_60358671_T_C,MIR4432 89 | 12_66232810_G_T,rs2583934,12_65839030_G_T,HMGA2 90 | 5_95843763_C_T,rs261982,5_96508059_C_T,PCSK1 91 | 17_65641651_C_T,rs2706710,17_67645535_C_T,PITPNC1 92 | 9_84308948_G_A,rs2796441,9_81694033_G_A,TLE1 93 | 6_127416930_A_G,rs2800733,6_127095785_A_G,RSPO3 94 | 10_12331586_A_G,rs2801473,10_12289587_A_G,CDC123 95 | 4_71844118_G_A,rs28599782,4_70978401_G_A,MOB1B 96 | 22_46313618_G_T,rs28637892,22_45917738_G_T,WNT7B 97 | 7_44174857_T_G,rs2908279,7_44135258_T_G,MYL7 98 | 16_81534790_T_C,rs2925979,16_81501185_T_C,CMIP 99 | 5_176513896_C_A,rs3135911,5_177086895_C_A,FGFR4 100 | 7_127793861_G_A,rs322728,7_128153809_G_A,MIR129-1 101 | 8_38343012_T_C,rs328301,8_38485494_T_C,FGFR1 102 | 5_133864599_G_A,rs329122,5_134528909_G_A,JADE2 103 | 8_17927609_C_T,rs34642578,8_18070100_C_T,ASAH1 104 | 1_229672955_G_A,rs348330,1_229537208_G_A,ABCB10 105 | 8_73503743_A_C,rs349359,8_72591508_A_C,KCNB2 106 | 7_44266184_C_T,rs35452727,7_44226585_C_T,CAMK2B 107 | 
10_94435673_G_A,rs35906730,10_92675916_G_A,HHEX 108 | 10_99019144_C_T,rs3740522,10_97259387_C_T,ARHGAP19 109 | 15_40616742_G_A,rs3743140,15_40324541_G_A,CCDC9B 110 | 12_27963402_G_A,rs3751236,12_27810469_G_A,KLHL42 111 | 2_165602181_A_G,rs3769891,2_164745671_A_G,COBLL1 112 | 10_12401811_T_C,rs4073527,10_12359812_T_C,CAMK1D 113 | 16_20255097_C_T,rs4238585,16_20243775_C_T,GP2 114 | 6_126964510_A_G,rs4273712,6_126643364_A_G,CENPW 115 | 4_6289986_T_G,rs4458523,4_6288259_T_G,WFS1 116 | 2_149568396_C_T,rs4499362,2_148810827_C_T,EPC2 117 | 7_126864679_T_G,rs4532535,7_127224625_T_G,GRM8 118 | 9_4309006_C_T,rs4567095,9_4309006_C_T,GLIS3 119 | 5_55806751_A_G,rs459193,5_56510924_A_G,ANKRD55 120 | 6_34214670_A_G,rs4711389,6_34246893_A_G,SMIM29 121 | 18_57852587_T_C,rs476828,18_60185354_T_C,MC4R 122 | 8_41519462_A_G,rs515071,8_41661944_A_G,ANK1 123 | 9_136149500_T_C,rs529565,9_133274084_T_C,ABO 124 | 1_177878933_G_A,rs532504,1_177909798_G_A,SEC16B 125 | 14_103237952_G_A,rs55700915,14_102771615_G_A,TRAF3 126 | 8_36832310_G_A,rs56687477,8_36974792_G_A,KCNU1 127 | 14_77382503_A_G,rs58524310,14_76916160_A_G,LRRC74A 128 | 11_3116024_A_G,rs59772385,11_3094794_A_G,OSBPL5 129 | 15_93825358_C_A,rs59876980,15_93282129_C_A,RGMA 130 | 3_123174832_C_T,rs60054445,3_123455985_C_T,ADCY5 131 | 8_126471274_G_A,rs60089934,8_125459032_G_A,TRIB1 132 | 20_48830265_T_C,rs6012876,20_50213728_T_C,CEBPB 133 | 20_50155386_T_C,rs6021276,20_51538847_T_C,NFATC2 134 | 3_23551971_G_A,rs60410861,3_23510480_G_A,UBE2E2 135 | 1_20688352_C_T,rs60573766,1_20361859_C_T,LINC01141 136 | 11_2857233_G_A,rs60808706,11_2836003_G_A,KCNQ1 137 | 20_57465571_G_A,rs6123837,20_58890516_G_A,GNAS 138 | 1_39944249_T_C,rs61779313,1_39478577_T_C,MACF1 139 | 12_4289091_C_A,rs61910828,12_4179925_C_A,CCND2 140 | 14_38809661_A_G,rs61975988,14_38340457_A_G,CLEC14A 141 | 6_50787459_G_T,rs62405419,6_50819746_G_T,TFAP2B 142 | 17_36124082_A_G,rs6607292,17_37764098_A_G,HNF1B 143 | 2_630662_A_C,rs6731688,2_630662_A_C,TMEM18 144 | 
9_22137685_T_G,rs7018475,9_22137686_T_G,CDKN2B-AS1 145 | 10_80943841_G_A,rs703980,10_79184084_G_A,ZMIZ1 146 | 9_22288404_G_A,rs7045760,9_22288405_G_A,DMRTA1 147 | 11_27700125_C_T,rs7103411,11_27678578_C_T,BDNF 148 | 11_72463435_G_A,rs7109575,11_72752390_G_A,ARAP1 149 | 11_17412960_A_G,rs7124355,11_17391413_A_G,ABCC8 150 | 15_75737287_T_C,rs7171507,15_75444946_T_C,SIN3A 151 | 15_62396389_A_G,rs7172432,15_62104190_A_G,C2CD4A 152 | 15_77747190_A_G,rs7178572,15_77454848_A_G,HMG20A 153 | 19_33887405_T_C,rs7250869,19_33396499_T_C,PEPD 154 | 11_2853163_T_C,rs72844296,11_2831933_T_C,KCNQ1 155 | 22_46484465_A_G,rs7289813,22_46088585_A_G,MIRLET7BHG 156 | 20_22430241_G_A,rs73085586,20_22449603_G_A,LOC284788 157 | 12_71449521_G_T,rs7313668,12_71055741_G_T,TSPAN8 158 | 6_39046644_A_C,rs742762,6_39078868_A_C,GLP1R 159 | 17_29642430_T_C,rs7502556,17_31315412_T_C,NF1 160 | 19_46157916_G_A,rs7507912,19_45654658_G_A,EML2 161 | 6_39289871_G_A,rs75343229,6_39322095_G_A,KCNK16 162 | 1_46261933_A_G,rs7551025,1_45796261_A_G,MAST2 163 | 3_170671309_T_G,rs7642480,3_170953520_T_G,SLC2A2 164 | 4_1254535_C_T,rs7656416,4_1260747_C_T,CTBP1-DT 165 | 6_131954797_T_G,rs7739842,6_131633657_T_G,ENPP3 166 | 7_13886654_C_T,rs7787720,7_13847029_C_T,ETV1 167 | 9_139245389_G_A,rs78270318,9_136350937_G_A,GPSM1 168 | 10_112678657_G_T,rs7895872,10_110918899_G_T,BBIP1 169 | 10_94542610_G_A,rs7896332,10_92782853_G_A,EXOC6 170 | 10_114754088_T_C,rs7901695,10_112994329_T_C,TCF7L2 171 | 10_77303697_T_C,rs7906280,10_75543939_T_C,ZNF503-AS2 172 | 10_94316828_C_A,rs7917163,10_92557071_C_A,IDE 173 | 11_2626334_A_G,rs7947981,11_2605104_A_G,KCNQ1 174 | 13_33562505_T_C,rs7997912,13_32988367_T_C,KL 175 | 7_140651523_G_A,rs801089,7_140951723_G_A,BRAF 176 | 6_117996631_T_C,rs80196932,6_117675468_T_C,NUS1 177 | 15_91522253_G_A,rs8026714,15_90979023_G_A,PRC1 178 | 15_38828140_G_T,rs8043085,15_38535939_G_T,RASGRP1 179 | 17_36101586_A_C,rs8064454,17_37741595_A_C,HNF1B 180 | 19_7293119_T_C,rs8101064,19_7293108_T_C,INSR 181 | 
2_234297156_T_C,rs838719,2_233388510_T_C,DGKD 182 | 3_23097075_C_T,rs861983,3_23055584_C_T,UBE2E2 183 | 2_45188353_C_T,rs895636,2_44961214_C_T,SIX3 184 | 8_95960886_G_T,rs896852,8_94948658_G_T,TP53INP1 185 | 9_4243162_G_A,rs911490,9_4243162_G_A,GLIS3 186 | 5_14777799_A_G,rs9312873,5_14777690_A_G,ANKH 187 | 13_22589883_A_G,rs9316706,13_22015744_A_G,LINC00424 188 | 6_20683164_G_A,rs9350271,6_20682933_G_A,CDKAL1 189 | 6_20743952_T_C,rs9358363,6_20743721_T_C,CDKAL1 190 | 6_139205386_T_C,rs9376382,6_138884249_T_C,ECT2L 191 | 6_7231843_G_A,rs9379084,6_7231610_G_A,RREB1 192 | 6_143056556_T_C,rs9390022,6_142735419_T_C,HIVEP2 193 | 6_20509339_A_G,rs942041,6_20509108_A_G,E2F3 194 | 6_20927169_T_G,rs9465936,6_20926938_T_G,CDKAL1 195 | 13_91948047_G_A,rs9523295,13_91295793_G_A,MIR17HG 196 | 3_121965199_T_G,rs9859381,3_122246352_T_G,CASR 197 | 3_63891105_T_C,rs9870576,3_63905429_T_C,ATXN7 198 | 7_127220918_T_C,rs989100,7_127580864_T_C,GCC1 199 | 17_40913366_C_T,rs9892728,17_42761348_C_T,RAMP2 200 | 18_7076836_C_T,rs9948462,18_7076837_C_T,LAMA1 201 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/ALP Neg.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,196083316,T,C,rs7619708,T,0.811268374976863 3 | 6,139515991,T,C,rs11155073,T,1.70122184458857 4 | 6,43834141,C,T,rs2894536,C,0.848664475361421 5 | 6,43843237,G,A,rs9369425,G,1.155447956349 6 | 7,100715101,A,G,rs534043,G,0.764667990679758 7 | 9,133269828,C,T,rs495203,T,5.324443416831 8 | 10,69340132,G,A,rs10159477,G,3.69108483702209 9 | 11,61798436,T,C,rs174541,T,2.17170144914074 10 | 12,48342520,C,A,rs2732480,C,1.41878190679511 11 | 16,29946895,G,A,rs8054556,A,0.74715798473066 12 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/ASAT Pos.csv: 
-------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,11239454,C,T,rs7544489,T,0.840788000437575 3 | 1,155299985,A,G,rs3020781,G,0.823621158224411 4 | 1,50972693,C,T,rs3176466,C,1.03655309336363 5 | 2,18540396,G,A,rs7558413,A,0.954927244263011 6 | 3,46948414,G,A,rs12491473,G,0.856540760181833 7 | 4,156731601,C,A,rs28819812,C,1.17684262294889 8 | 4,18023861,G,A,rs2011603,A,1.55586100743138 9 | 5,158599736,C,T,rs890940,T,0.838594967456831 10 | 5,53478680,G,A,rs62370480,A,1.74052293502224 11 | 5,53976834,G,A,rs4865796,A,1.08742124424837 12 | 5,56566067,G,A,rs9687846,A,2.09045528273711 13 | 6,43790159,C,A,rs998584,A,0.97430556694122 14 | 7,15886603,T,C,rs38221,T,0.735247066717914 15 | 7,40777054,C,T,rs17439448,T,0.83480231990919 16 | 7,4643627,G,A,rs62450857,A,0.911446672696138 17 | 8,26015118,A,G,rs17818197,G,0.736947611830721 18 | 9,94180777,G,A,rs10821311,A,0.80088704874938 19 | 10,122433665,T,G,rs2280141,T,0.940558954640188 20 | 10,68583018,C,T,rs10998304,C,0.835291215123657 21 | 11,17388025,T,C,rs5219,T,0.891800699436919 22 | 12,12718165,T,G,rs2066827,G,0.738867415588662 23 | 12,4275678,T,G,rs76895963,T,1.12801582156112 24 | 12,65965972,C,A,rs8756,A,1.53597273874399 25 | 15,40106553,T,C,rs484943,T,1.16059228847848 26 | 15,75522047,C,T,rs6495182,C,1.1139220422407 27 | 16,28906323,T,C,rs7188071,T,0.781186930213114 28 | 17,67853811,T,G,rs2046323,G,0.82708525898556 29 | 18,55383415,A,C,rs72926932,C,0.766823157775034 30 | 18,63178651,T,C,rs12454712,T,1.25235887701051 31 | 20,33720469,C,T,rs67611724,T,0.880527157857956 32 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Beta Cell 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,20404624,T,C,rs10916785,T,0.741404206287625 3 | 
1,26156065,T,C,rs10794522,C,0.805212778329622 4 | 1,6612669,A,C,rs11583755,C,1.25357097726457 5 | 2,120568332,G,A,rs9784137,G,0.820403466645448 6 | 2,212971253,A,G,rs17354348,A,1.01652074032952 7 | 2,233414615,A,G,rs838733,G,0.96775916799516 8 | 2,59084401,G,A,rs980183,G,0.750786192301261 9 | 2,67651196,T,C,rs1430780,T,0.953168180513232 10 | 3,12443429,A,C,rs60710264,C,1.04050302917565 11 | 3,152675088,T,G,rs9828639,T,0.903080832873115 12 | 3,186927836,C,T,rs13095782,T,0.752962034876219 13 | 3,186947857,C,T,rs3887925,T,1.52187930901443 14 | 3,23414091,T,C,rs35352848,T,1.21643075310443 15 | 3,35626269,A,G,rs10490871,G,0.895805899824739 16 | 3,9472332,G,A,rs3872707,A,1.27207396389584 17 | 4,105127134,T,C,rs17035289,C,1.50731554574906 18 | 4,20209330,A,G,rs11940813,G,0.99297172533189 19 | 4,751184,G,T,rs1531583,T,1.12056880681728 20 | 5,14788235,T,C,rs31931,T,1.45529760950762 21 | 5,158599736,C,T,rs890940,T,0.773259704059205 22 | 6,153106967,A,G,rs9383649,G,0.812825108274305 23 | 6,20679478,A,G,rs7756992,G,2.4158851314958 24 | 6,21008508,G,A,rs12192642,G,1.0179259075545 25 | 6,21079126,A,G,rs11967298,A,0.786669456973421 26 | 6,41044666,A,G,rs4714422,G,0.785686443600627 27 | 6,43790159,C,A,rs998584,A,1.36807730642919 28 | 6,7231610,G,A,rs9379084,G,0.777504300052759 29 | 7,102849009,A,G,rs56269620,A,0.841545229287769 30 | 7,30688836,C,T,rs917195,C,1.25275539527058 31 | 7,74693803,G,A,rs67755137,A,1.00561625403194 32 | 8,117173494,A,G,rs11558471,A,1.57021450268091 33 | 8,14267300,A,C,rs17294565,C,0.772226910587326 34 | 8,41643342,C,T,rs59191643,T,1.58981160160702 35 | 8,41672271,C,T,rs80105613,T,0.836685731976527 36 | 9,114181077,A,G,rs1431819,G,0.91771140542696 37 | 9,123351733,T,C,rs4838049,C,0.988468432674962 38 | 9,19080354,A,G,rs10963942,G,1.20289365831587 39 | 9,22003368,G,A,rs1063192,A,1.4671350314182 40 | 9,22134095,T,C,rs10811661,T,1.35935876634001 41 | 10,112989975,G,A,rs35011184,A,1.09498269557365 42 | 10,112999686,G,A,rs11196187,A,0.99848985803851 43 | 
10,122433665,T,G,rs2280141,T,0.771338774506973 44 | 10,12265895,C,T,rs11257655,T,0.754770534442257 45 | 10,69692529,T,C,rs2812533,T,0.848763744071059 46 | 10,92105487,C,T,rs72807217,T,1.21863058213275 47 | 10,92703125,C,T,rs1111875,C,2.19850973740941 48 | 10,92740354,G,A,rs11187152,G,0.919680097388424 49 | 10,97299888,C,A,rs10882891,C,0.921215873844895 50 | 11,1675619,C,T,rs2334499,T,1.16349952469302 51 | 11,17388025,T,C,rs5219,T,0.751504045374263 52 | 11,2080053,G,A,rs11600952,A,1.07055990704825 53 | 11,2176056,A,G,rs4929965,A,1.13083153510612 54 | 11,2835964,A,C,rs2237895,C,0.885974644001596 55 | 11,2887735,A,G,rs450563,G,1.10786668524409 56 | 11,43855148,C,T,rs11555762,T,0.94971081927632 57 | 11,45834447,C,T,rs6485644,C,1.26005361446507 58 | 11,61798436,T,C,rs174541,T,1.59006227316493 59 | 11,92983658,G,A,rs11020132,A,0.775554598095402 60 | 12,108224853,C,T,rs3764002,C,1.55270649486474 61 | 12,121017786,G,A,rs56158042,G,1.08594768430691 62 | 12,121483887,C,T,rs7977709,C,1.25973625944639 63 | 12,21628312,G,T,rs10841868,G,1.14015482497832 64 | 12,26138216,C,T,rs56008051,C,0.830199393097582 65 | 12,6572620,A,G,rs67013744,G,0.809827435852229 66 | 13,32980164,G,A,rs576674,G,0.769566338445002 67 | 14,22819744,A,G,rs17122776,G,0.820396791694036 68 | 14,76834520,C,T,rs2056857,C,0.793738038091963 69 | 15,62099409,C,T,rs7163757,C,2.47960484122266 70 | 15,89836982,C,A,rs8031576,C,1.01613891151716 71 | 15,90968837,G,A,rs2290203,A,1.05530558031703 72 | 16,250389,T,C,rs55857387,T,1.0330434186243 73 | 17,36486677,A,G,rs4796224,G,0.762622711438937 74 | 17,37740776,A,G,rs11657964,A,0.806954666517041 75 | 17,48046280,A,C,rs9895554,A,0.990992463718845 76 | 17,9884641,G,A,rs7219033,A,1.53943529075368 77 | 18,57013486,G,A,rs12969494,G,0.966317439945646 78 | 19,45653979,G,A,rs10407429,G,1.14864321922069 79 | 20,41203988,T,C,rs17265513,C,0.986958226806199 80 | 20,62649310,C,T,rs2427363,C,1.10892377064271 81 | 
-------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Beta Cell 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,117605762,G,A,rs41276588,A,0.748458007347135 3 | 1,213985913,T,C,rs340874,C,1.50768670293772 4 | 1,229537208,G,A,rs348330,G,0.761342766656576 5 | 2,27508073,T,C,rs1260326,C,0.785198311558966 6 | 2,43444037,C,T,rs13414140,C,1.32635211961424 7 | 3,123346931,A,G,rs11708067,A,2.19165673547182 8 | 3,125202613,C,T,rs9873519,T,0.790351147190566 9 | 3,141415727,C,T,rs73872717,C,0.892883540217284 10 | 3,171015287,G,A,rs9873618,G,2.0205957229683 11 | 3,185803160,A,G,rs9854769,G,0.918261576228066 12 | 3,188024054,A,G,rs6777684,G,1.01919813122508 13 | 4,184792454,G,A,rs72695645,G,0.821010028344514 14 | 4,6291188,A,G,rs10010131,G,0.743525812495279 15 | 5,77131486,G,A,rs6878122,G,1.14389148167193 16 | 6,20679478,A,G,rs7756992,G,1.28626568929162 17 | 6,7275708,T,C,rs11243150,T,0.928903505017285 18 | 7,14858657,C,T,rs17168486,T,1.57737700043141 19 | 7,15024684,G,T,rs2191349,T,2.31443791524668 20 | 7,28160478,C,T,rs1513272,C,0.796075848951056 21 | 7,44161285,T,C,rs2041547,C,1.79951127425708 22 | 7,44195138,C,T,rs2908286,T,3.96133647523467 23 | 7,44325950,C,T,rs116913033,C,0.790033277345729 24 | 8,117173494,A,G,rs11558471,A,2.6136452141312 25 | 8,144285362,T,C,rs7828303,T,0.830068773578635 26 | 8,94672919,A,C,rs11786992,A,0.813994537199422 27 | 9,22134095,T,C,rs10811661,T,1.38544207775808 28 | 9,22137686,T,G,rs7018475,G,1.06669271435742 29 | 9,4291928,A,C,rs10974438,C,1.37497690065346 30 | 10,111288563,C,T,rs10885123,C,1.28455257339798 31 | 10,112989975,G,A,rs35011184,A,2.62472848176576 32 | 10,112999686,G,A,rs11196187,A,0.793197108006303 33 | 10,113064714,G,A,rs10885410,G,0.757624060468176 34 | 10,113101545,A,G,rs10885414,A,0.756106327158006 35 | 10,12265895,C,T,rs11257655,T,0.963638260283328 36 | 
10,69340132,G,A,rs10159477,G,1.24198869315766 37 | 11,2671019,C,T,rs231360,T,0.741101217668221 38 | 11,2835964,A,C,rs2237895,C,0.856599374358303 39 | 11,45834447,C,T,rs6485644,C,0.893196006972234 40 | 11,72721940,G,A,rs11603334,G,1.39565082078472 41 | 12,132493708,C,T,rs11614914,T,0.895318119515759 42 | 12,4275678,T,G,rs76895963,T,0.8896763935819 43 | 12,97454449,C,T,rs113036477,C,1.05579805257086 44 | 13,32980164,G,A,rs576674,G,1.18368599542561 45 | 15,62099409,C,T,rs7163757,C,0.814269257625806 46 | 15,77489993,A,G,rs12910361,G,0.964617012002076 47 | 20,58812207,A,G,rs911300,G,0.791727162282264 48 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Bilirubin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,233414615,A,G,rs838733,G,1.5238203831105 3 | 2,233660318,C,T,rs2602374,T,12.179891843664 4 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Cholesterol.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 5,75660016,G,A,rs253412,A,0.783887678173866 3 | 11,61798436,T,C,rs174541,T,0.73969606852391 4 | 19,19268740,C,T,rs58542926,T,1.8861528421618 5 | 19,44854120,T,C,rs4803764,T,2.88351258749252 6 | 19,44908684,T,C,rs429358,T,7.76835350626357 7 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Hyper Insulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,117544762,G,A,rs2767323,G,1.06850776010211 3 | 1,147649198,G,A,rs79489938,G,0.758406347323169 4 | 1,205075959,G,A,rs1572993,A,1.5781307777311 5 | 2,164838388,C,T,rs355799,C,0.995210581126864 
6 | 2,25412902,A,G,rs34845373,A,1.39200791735617 7 | 2,60328479,A,G,rs35824707,G,0.792662982698415 8 | 2,60357684,G,A,rs243021,A,0.865499331610328 9 | 2,60491212,C,T,rs7599488,T,0.988819438304368 10 | 2,65049318,T,C,rs1009358,T,1.22892931755776 11 | 3,12070747,C,T,rs12489177,T,0.812081073731205 12 | 3,141415727,C,T,rs73872717,C,0.822404304660683 13 | 3,15699882,T,C,rs4465929,T,0.888925538334548 14 | 3,36828739,A,G,rs11129735,A,1.45011873486622 15 | 3,49943163,C,T,rs4688760,T,1.05742855785914 16 | 3,71606894,C,T,rs844215,C,1.18706070735424 17 | 4,156731601,C,A,rs28819812,C,1.32965890684405 18 | 4,48892964,G,A,rs2289065,G,1.21424859452003 19 | 5,134529762,A,G,rs329124,G,1.00130765376286 20 | 5,54029680,A,G,rs31229,G,0.965661922747786 21 | 5,56503357,C,A,rs157843,A,1.02135333483549 22 | 5,56569218,C,T,rs66485956,T,0.946921308917374 23 | 5,68418419,A,G,rs4976033,G,1.04637295206295 24 | 5,88370824,G,T,rs2410763,T,1.1199061768283 25 | 6,39048860,C,T,rs10305420,C,0.963576281930092 26 | 6,39089153,T,C,rs9296291,T,0.964564282312171 27 | 7,150840547,G,A,rs62492368,A,0.956430787690121 28 | 7,23523649,C,T,rs76365198,C,1.02125747627562 29 | 7,2721116,C,A,rs798549,C,1.05903011044634 30 | 7,50514274,A,G,rs2876826,G,0.754799359198824 31 | 7,50960703,T,C,rs1018942,T,1.18560012451619 32 | 8,10342743,C,T,rs73195303,C,0.839306814640715 33 | 8,10928869,C,A,rs28566988,C,1.19498619603307 34 | 8,144285362,T,C,rs7828303,T,1.03031649213372 35 | 8,9324101,G,A,rs2126263,G,1.07378550456749 36 | 8,9404091,G,A,rs62493853,A,0.765689614863192 37 | 9,1039939,G,A,rs756145,A,0.852062118451503 38 | 9,94180777,G,A,rs10821311,A,0.758765388754933 39 | 10,63545492,C,T,rs12263348,T,1.33440212033686 40 | 10,68583018,C,T,rs10998304,C,1.11279715156381 41 | 10,73839369,A,G,rs2675662,A,0.88512430991802 42 | 11,32439327,C,T,rs7943101,T,0.882817814928975 43 | 11,65527328,C,T,rs1783541,T,0.758466325974499 44 | 11,76528106,C,T,rs2513523,T,0.73522821888135 45 | 12,117974568,G,A,rs34965774,A,1.00621468152664 46 | 
12,123973455,C,A,rs12823740,C,1.2112050480976 47 | 12,95534337,C,A,rs11108094,A,1.03981494779862 48 | 13,41114265,A,G,rs4397977,A,0.786186211380992 49 | 13,58562303,C,T,rs4886092,C,0.744234357277956 50 | 14,102910984,G,A,rs10133111,A,1.27553182821105 51 | 14,29275326,A,G,rs8005994,A,0.959587336467102 52 | 15,39354022,A,G,rs17622532,A,0.997518161298793 53 | 15,52807109,T,C,rs75332279,C,1.0533509136327 54 | 16,81501185,T,C,rs2925979,T,0.827427027290875 55 | 16,81576875,G,A,rs11642655,A,0.858903664903455 56 | 16,918292,G,A,rs4984980,A,0.771560526620746 57 | 17,31310290,G,A,rs12602834,G,1.25445170819062 58 | 17,4149155,T,G,rs11652572,T,0.795253836224119 59 | 17,77390827,A,G,rs1656794,G,1.50510338691836 60 | 17,78765957,G,A,rs62075585,G,0.928655402338812 61 | 18,13566625,C,T,rs113780182,T,0.854306244187556 62 | 18,42507133,G,T,rs1431841,T,0.965322255476429 63 | 19,4949909,G,A,rs12977104,A,0.745338342172867 64 | 19,7903283,G,A,rs2115107,A,1.38635303732756 65 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Lipodystrophy 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,11239454,C,T,rs7544489,T,1.062952593457 3 | 1,172391892,G,T,rs4916253,G,1.500246740117 4 | 1,219568478,G,A,rs2820444,G,2.17849243941319 5 | 1,39354638,C,T,rs61779275,T,0.905656530439659 6 | 2,145590469,G,A,rs7609422,G,0.787122369319404 7 | 2,164672366,C,T,rs13389219,C,2.85569882509556 8 | 2,164838388,C,T,rs355799,C,0.959670396756692 9 | 2,226087049,A,G,rs17231538,A,1.12833654721908 10 | 2,226236593,T,C,rs2972145,C,2.56106579833414 11 | 2,25312674,T,C,rs34048824,T,0.977629369910311 12 | 2,65434334,A,C,rs12185610,A,0.786637992723619 13 | 3,12252703,A,G,rs9826367,A,0.846334489463612 14 | 3,12288284,C,T,rs17036160,C,1.46090054738595 15 | 3,12443429,A,C,rs60710264,C,1.11936111015193 16 | 
3,185159838,A,G,rs10937208,G,0.971056303335598 17 | 3,185701201,A,G,rs73175562,A,0.86637571501157 18 | 3,185803160,A,G,rs9854769,G,1.15644342672621 19 | 3,46948414,G,A,rs12491473,G,1.2111506687999 20 | 3,47652174,C,T,rs62262091,T,1.01388288647304 21 | 3,53089257,A,G,rs891368,G,0.898953307776491 22 | 3,64722438,C,T,rs4132228,C,1.74665906967227 23 | 4,156731601,C,A,rs28819812,C,1.28080873150431 24 | 4,88818328,C,T,rs13131633,T,1.91915702228461 25 | 4,88830707,C,T,rs7660000,C,1.04564316356407 26 | 5,158599736,C,T,rs890940,T,0.759547663080347 27 | 5,56566067,G,A,rs9687846,A,1.53921910750697 28 | 5,68418419,A,G,rs4976033,G,0.970214727491333 29 | 6,139515991,T,C,rs11155073,T,2.74724318371907 30 | 6,163711969,C,T,rs4709746,C,1.16406196348262 31 | 6,34279270,C,T,rs77136196,T,1.82551588591726 32 | 6,43790159,C,A,rs998584,A,3.28618195974912 33 | 7,130768623,A,G,rs1596972,G,0.805694198492982 34 | 7,28160478,C,T,rs1513272,C,0.802795869424777 35 | 8,19973410,C,T,rs10096633,C,1.33295639349242 36 | 10,63545492,C,T,rs12263348,T,0.875831654885228 37 | 11,64263769,C,T,rs35169799,T,2.10171892843801 38 | 11,65638129,G,T,rs2306363,G,1.12035933468448 39 | 12,121222984,T,C,rs25643,C,0.789629638410106 40 | 12,123008576,A,G,rs12820906,A,0.994715586903685 41 | 12,123863518,C,T,rs11057376,T,1.39159451808673 42 | 12,123973455,C,A,rs12823740,C,2.9872520617637 43 | 12,124025844,C,T,rs10773051,C,2.3417345784784 44 | 12,26138216,C,T,rs56008051,C,0.907557365091295 45 | 12,26312652,C,T,rs11048458,T,2.20455814061685 46 | 12,65965972,C,A,rs8756,A,0.83117843176366 47 | 12,71129263,C,A,rs1705263,C,0.808572961413311 48 | 15,39354022,A,G,rs17622532,A,0.74772412052731 49 | 15,40088651,G,A,rs2242186,A,0.883526716548812 50 | 16,53395261,C,T,rs2908797,C,0.832470382095215 51 | 16,81501185,T,C,rs2925979,T,2.09487020762773 52 | 16,81576875,G,A,rs11642655,A,0.775114585172354 53 | 17,17807956,A,G,rs4925114,A,1.14223858165401 54 | 17,7646363,C,T,rs1641523,C,0.740951629444799 55 | 
18,63178651,T,C,rs12454712,T,0.811591683451073 56 | 19,7235135,A,G,rs17175860,G,0.754031453229013 57 | 19,8364439,G,A,rs116843064,G,0.83176142135224 58 | 20,34008898,G,A,rs2268078,A,0.886389370907059 59 | 20,52383088,T,C,rs2426439,C,0.87250292231854 60 | 20,64080106,C,T,rs8126001,C,0.837788372743015 61 | 22,38204535,C,T,rs2267373,T,1.23484184455549 62 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Lipodystrophy 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,26958422,C,T,rs79598313,T,0.888015136432011 3 | 2,164672366,C,T,rs13389219,C,1.14063104078903 4 | 2,226236593,T,C,rs2972145,C,1.89015160924628 5 | 2,65049318,T,C,rs1009358,T,0.862733719467642 6 | 3,12252703,A,G,rs9826367,A,0.92830771830743 7 | 3,12288284,C,T,rs17036160,C,2.02911253590886 8 | 3,135906656,C,T,rs9852406,T,1.22995669281764 9 | 5,53976834,G,A,rs4865796,A,0.929463278915424 10 | 5,56503357,C,A,rs157843,A,1.51438235945776 11 | 5,68418419,A,G,rs4976033,G,0.927069934053808 12 | 7,130768623,A,G,rs1596972,G,0.998770029334821 13 | 8,10138879,A,G,rs34990153,A,0.789724773751201 14 | 8,19973410,C,T,rs10096633,C,1.47334889996029 15 | 8,8342448,G,A,rs62496027,A,0.749405889748518 16 | 8,9324101,G,A,rs2126263,G,1.17044882791933 17 | 9,1039939,G,A,rs756145,A,0.78001272191752 18 | 10,100152307,T,C,rs2862954,T,2.45036397686714 19 | 11,47870031,T,C,rs11604324,C,0.824748485040415 20 | 15,60646617,C,A,rs8033609,A,0.751884212448393 21 | 18,63178651,T,C,rs12454712,T,0.845392522961538 22 | 19,19268740,C,T,rs58542926,T,1.82222860052993 23 | 19,33444196,C,T,rs2287821,C,0.745393407987891 24 | 19,7903283,G,A,rs2115107,A,0.813833592469361 25 | 20,33720469,C,T,rs67611724,T,0.775716633669226 26 | 20,44413724,C,T,rs1800961,T,1.11008238172632 27 | 20,46966072,G,A,rs6066138,G,0.91985248758557 28 | 22,43928975,G,A,rs3747207,A,3.84592717411791 29 | 
-------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Liver-Lipid.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,39354638,C,T,rs61779275,T,0.745213024949263 3 | 2,27508073,T,C,rs1260326,C,7.41431942024394 4 | 6,126470949,A,G,rs11759026,G,0.782090004684815 5 | 8,115627247,T,C,rs2737226,T,0.746153081743436 6 | 8,9324101,G,A,rs2126263,G,2.39961390738293 7 | 11,61798436,T,C,rs174541,T,1.50296407059363 8 | 12,120979061,C,T,rs1800574,T,0.913991554177009 9 | 12,121017786,G,A,rs56158042,G,1.35618944275262 10 | 20,44413724,C,T,rs1800961,T,1.13321439663169 11 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Obesity.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,111749681,T,C,rs197379,C,0.906686937859058 3 | 1,177904706,G,A,rs490689,A,1.7232856957805 4 | 1,201794371,C,T,rs567185,T,0.836071559295798 5 | 1,62114219,G,T,rs12140153,G,0.806830055368476 6 | 1,72374091,C,A,rs2613503,A,1.04128134326271 7 | 2,160289101,A,G,rs6432613,G,0.862847696815228 8 | 2,174332817,C,A,rs12992995,C,0.83215696271914 9 | 2,180753927,A,G,rs6741676,A,1.11660443016913 10 | 2,228108944,A,G,rs7561798,G,0.8976388932199 11 | 2,422144,T,C,rs62107261,T,1.98298069078581 12 | 2,58706456,C,T,rs1861410,C,1.43799570093576 13 | 2,59084401,G,A,rs980183,G,1.14525943633913 14 | 2,653874,C,T,rs10188334,C,1.78172903748151 15 | 3,131926093,A,G,rs1225052,G,0.94314067454209 16 | 3,15699882,T,C,rs4465929,T,0.966285047478682 17 | 3,173401978,A,G,rs686998,G,1.01381352259294 18 | 3,35626269,A,G,rs10490871,G,1.15278867636112 19 | 3,47652174,C,T,rs62262091,T,0.851811627881173 20 | 3,49943163,C,T,rs4688760,T,1.26999868676804 21 | 3,89937130,T,C,rs11716527,C,0.804234986757685 22 
| 3,9472332,G,A,rs3872707,A,0.780789589036755 23 | 4,136162038,A,C,rs1296328,A,1.05797603780357 24 | 4,139958775,G,T,rs2604918,G,0.859608018407233 25 | 4,45184122,G,A,rs10938398,A,1.69150857286116 26 | 4,90322714,G,A,rs7656001,A,0.832106478481 27 | 5,75660016,G,A,rs253412,A,1.15329123782529 28 | 5,79141082,C,A,rs12519500,C,0.877663146272336 29 | 5,88370824,G,T,rs2410763,T,0.817225490295907 30 | 6,125731213,A,G,rs2008027,G,0.77606784551336 31 | 6,40441504,T,C,rs34298980,T,1.05321873125 32 | 6,50821065,A,C,rs3798519,C,1.3835338791592 33 | 7,30688836,C,T,rs917195,C,0.778750849720167 34 | 7,50514274,A,G,rs2876826,G,0.78236721159589 35 | 7,70320618,G,A,rs1880368,A,0.763004684876532 36 | 8,115627247,T,C,rs2737226,T,0.866919137696046 37 | 8,115876242,A,C,rs1569339,C,0.789808624557141 38 | 8,30996517,A,G,rs2725371,A,1.17904485455876 39 | 9,123824284,C,A,rs1752169,A,1.02267196970459 40 | 9,28410685,T,C,rs1412234,C,1.28360402194409 41 | 10,33729802,T,C,rs36051838,C,0.834743288007079 42 | 10,75887349,G,A,rs7099048,A,0.880274244123929 43 | 11,43855148,C,T,rs11555762,T,1.24429723286945 44 | 11,47659818,T,G,rs1056387,T,0.866516283324162 45 | 11,65527328,C,T,rs1783541,T,0.78347857436091 46 | 11,8632981,T,C,rs10769936,C,0.916255412126923 47 | 12,41444433,T,C,rs2733289,C,0.894063867574765 48 | 12,49869365,G,A,rs7132908,A,1.59394719860947 49 | 13,30468315,G,T,rs11842871,G,1.02320162169402 50 | 13,58082465,T,C,rs9563574,T,1.0218543205169 51 | 14,102910984,G,A,rs10133111,A,1.0774972017038 52 | 14,103393972,T,G,rs12890750,G,0.926623481724374 53 | 14,32833676,G,T,rs17522122,T,0.951612312983814 54 | 14,46844338,G,A,rs2933211,A,0.814660384231547 55 | 14,69059590,C,T,rs4899280,T,0.904985376354747 56 | 14,79318962,A,G,rs10498536,A,0.825727843742782 57 | 14,79473182,C,T,rs10145154,T,1.24531115706448 58 | 16,15059860,T,C,rs9927842,T,0.748716106908275 59 | 16,28906323,T,C,rs7188071,T,1.67962453569743 60 | 16,29946895,G,A,rs8054556,A,1.31040484740787 61 | 
16,53767042,T,C,rs1421085,C,4.06996774228195 62 | 16,53794125,C,T,rs73612051,C,1.01961505829021 63 | 16,53832063,C,T,rs9302652,C,1.12501007204517 64 | 16,69534400,G,T,rs2032912,G,1.34594325721168 65 | 17,36486677,A,G,rs4796224,G,0.891280448890062 66 | 17,48982960,C,A,rs35895680,C,0.760610664245459 67 | 17,67853811,T,G,rs2046323,G,0.908559679700781 68 | 18,23503774,C,T,rs303760,T,1.11100743455543 69 | 18,60178844,C,T,rs663640,T,2.12136439115411 70 | 18,60383735,C,T,rs79688165,C,1.5795552438972 71 | 19,18723704,C,T,rs10404726,C,1.16284431655652 72 | 19,44908684,T,C,rs429358,T,1.00539943715663 73 | 19,47076928,A,G,rs11667244,G,0.877008931378147 74 | 20,52383088,T,C,rs2426439,C,0.967245182676206 75 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Proinsulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,117605762,G,A,rs41276588,A,1.20856189101032 3 | 1,6612669,A,C,rs11583755,C,1.05467736031182 4 | 1,72374091,C,A,rs2613503,A,0.817293538109438 5 | 2,157478828,G,A,rs7568172,G,0.77594787045051306 6 | 2,212971253,A,G,rs17354348,A,0.732103309684394 7 | 2,262553,T,C,rs7596678,T,1.011889366208 8 | 2,43202667,G,A,rs10174764,A,0.91526286448804 9 | 2,58706456,C,T,rs1861410,C,1.15013355887749 10 | 3,186947857,C,T,rs3887925,T,1.34817619568173 11 | 3,35626269,A,G,rs10490871,G,1.22625669641648 12 | 4,184792454,G,A,rs72695645,G,0.749231424200112 13 | 5,103250706,G,A,rs75432112,A,1.34673511603681 14 | 5,75660016,G,A,rs253412,A,0.785428511797446 15 | 6,20679478,A,G,rs7756992,G,1.05783101135753 16 | 6,20736006,C,A,rs9358361,A,0.979907797810997 17 | 6,43843237,G,A,rs9369425,G,1.43699979087686 18 | 7,157147880,A,G,rs7798816,A,1.04550740766286 19 | 8,10138879,A,G,rs34990153,A,0.779160010387261 20 | 8,41643342,C,T,rs59191643,T,1.9137822511338 21 | 8,41651058,C,A,rs13262861,C,0.752403657263152 22 | 
9,133269828,C,T,rs495203,T,0.979071076171371 23 | 9,136355594,T,C,rs78503878,C,0.861585019257345 24 | 9,19080354,A,G,rs10963942,G,0.788975886826281 25 | 10,112989259,G,A,rs11196181,G,1.32323478820699 26 | 10,112989975,G,A,rs35011184,A,0.909537760703753 27 | 10,112999686,G,A,rs11196187,A,0.977230036516327 28 | 10,12265895,C,T,rs11257655,T,1.4133021037223 29 | 10,70888579,C,T,rs827237,T,0.762494267706961 30 | 10,75887349,G,A,rs7099048,A,0.999874939413022 31 | 10,92196795,T,G,rs7071943,G,0.990836509897838 32 | 10,92703125,C,T,rs1111875,C,1.34288155106174 33 | 10,97299888,C,A,rs10882891,C,0.768376635725208 34 | 11,17388025,T,C,rs5219,T,1.06653488409401 35 | 11,2835964,A,C,rs2237895,C,1.2430368002492 36 | 11,45834447,C,T,rs6485644,C,1.24648413352621 37 | 11,69234580,C,T,rs55974245,C,0.878253973987723 38 | 11,72721940,G,A,rs11603334,G,1.20194371183669 39 | 11,93280221,T,G,rs7943372,G,0.82025363003658 40 | 12,121017786,G,A,rs56158042,G,1.34790357076606 41 | 12,4275530,C,T,rs3217792,C,1.36302549295646 42 | 13,32980164,G,A,rs576674,G,1.3948446847927 43 | 13,50519978,T,G,rs9316500,T,1.01955253387866 44 | 13,91302938,G,A,rs9523299,G,0.734808662675904 45 | 14,76834520,C,T,rs2056857,C,0.821735190803588 46 | 15,89836982,C,A,rs8031576,C,0.978977844759263 47 | 16,53767042,T,C,rs1421085,C,0.7641299913021 48 | 17,42544897,C,T,rs684214,T,0.872152701713213 49 | 17,48982960,C,A,rs35895680,C,0.956498495420977 50 | 20,44366172,T,C,rs12625671,C,0.795998517724989 51 | 20,63691346,A,G,rs6011033,G,0.733908926452423 52 | 20,63839432,G,A,rs4809369,G,0.801811875421717 53 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/SHBG-LpA.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,26958422,C,T,rs79598313,T,1.76988798874768 3 | 6,160354985,C,A,rs543159,C,2.58672811443119 4 | 10,63545492,C,T,rs12263348,T,2.20027042335222 5 
| 12,120979061,C,T,rs1800574,T,0.765632283390125 6 | 12,121017786,G,A,rs56158042,G,0.748315576819208 7 | 12,4275678,T,G,rs76895963,T,1.04077446531587 8 | 17,7646363,C,T,rs1641523,C,5.63473967075984 9 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Total_GRS.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,111749681,T,C,rs197379,C,0.0269 3 | 1,11239454,C,T,rs7544489,T,0.027 4 | 1,117544762,G,A,rs2767323,G,0.0339 5 | 1,117605762,G,A,rs41276588,A,0.0375 6 | 1,147649198,G,A,rs79489938,G,0.1131 7 | 1,155299985,A,G,rs3020781,G,0.0333 8 | 1,172391892,G,T,rs4916253,G,0.0274 9 | 1,177904706,G,A,rs490689,A,0.0335 10 | 1,201794371,C,T,rs567185,T,0.037 11 | 1,20404624,T,C,rs10916785,T,0.0271 12 | 1,205075959,G,A,rs1572993,A,0.031 13 | 1,213985913,T,C,rs340874,C,0.0668 14 | 1,219568478,G,A,rs2820444,G,0.0472 15 | 1,229537208,G,A,rs348330,G,0.0525 16 | 1,26156065,T,C,rs10794522,C,0.0261 17 | 1,26958422,C,T,rs79598313,T,0.084 18 | 1,39354638,C,T,rs61779275,T,0.0753 19 | 1,50972693,C,T,rs3176466,C,0.0642 20 | 1,62114219,G,T,rs12140153,G,0.0461 21 | 1,6612669,A,C,rs11583755,C,0.0387 22 | 1,72374091,C,A,rs2613503,A,0.0392 23 | 2,120568332,G,A,rs9784137,G,0.0608 24 | 2,144503607,A,C,rs2033159,C,0.0352 25 | 2,145590469,G,A,rs7609422,G,0.0295 26 | 2,157478828,G,A,rs7568172,G,0.0674 27 | 2,160289101,A,G,rs6432613,G,0.0393 28 | 2,164672366,C,T,rs13389219,C,0.0648 29 | 2,164838388,C,T,rs355799,C,0.0439 30 | 2,174332817,C,A,rs12992995,C,0.0312 31 | 2,180753927,A,G,rs6741676,A,0.0324 32 | 2,18540396,G,A,rs7558413,A,0.0286 33 | 2,212971253,A,G,rs17354348,A,0.0294 34 | 2,226087049,A,G,rs17231538,A,0.076 35 | 2,226236593,T,C,rs2972145,C,0.09 36 | 2,226340889,A,G,rs12694695,A,0.0291 37 | 2,228108944,A,G,rs7561798,G,0.0281 38 | 2,233414615,A,G,rs838733,G,0.0289 39 | 2,233660318,C,T,rs2602374,T,0.0291 40 | 
2,25312674,T,C,rs34048824,T,0.0323 41 | 2,25412902,A,G,rs34845373,A,0.0372 42 | 2,262553,T,C,rs7596678,T,0.0271 43 | 2,27508073,T,C,rs1260326,C,0.0638 44 | 2,422144,T,C,rs62107261,T,0.0749 45 | 2,43202667,G,A,rs10174764,A,0.0392 46 | 2,43444037,C,T,rs13414140,C,0.1183 47 | 2,58706456,C,T,rs1861410,C,0.0353 48 | 2,59084401,G,A,rs980183,G,0.0363 49 | 2,60328479,A,G,rs35824707,G,0.0685 50 | 2,60357684,G,A,rs243021,A,0.054 51 | 2,60491212,C,T,rs7599488,T,0.0255 52 | 2,65049318,T,C,rs1009358,T,0.0493 53 | 2,653874,C,T,rs10188334,C,0.0503 54 | 2,65434334,A,C,rs12185610,A,0.0451 55 | 2,67651196,T,C,rs1430780,T,0.0273 56 | 3,12070747,C,T,rs12489177,T,0.0745 57 | 3,12252703,A,G,rs9826367,A,0.0402 58 | 3,12288284,C,T,rs17036160,C,0.1028 59 | 3,123346931,A,G,rs11708067,A,0.0782 60 | 3,12443429,A,C,rs60710264,C,0.0374 61 | 3,125202613,C,T,rs9873519,T,0.0375 62 | 3,131926093,A,G,rs1225052,G,0.0271 63 | 3,135906656,C,T,rs9852406,T,0.0375 64 | 3,141415727,C,T,rs73872717,C,0.0856 65 | 3,152675088,T,G,rs9828639,T,0.0204 66 | 3,15699882,T,C,rs4465929,T,0.0304 67 | 3,171015287,G,A,rs9873618,G,0.0581 68 | 3,173401978,A,G,rs686998,G,0.0267 69 | 3,185159838,A,G,rs10937208,G,0.0443 70 | 3,185701201,A,G,rs73175562,A,0.0531 71 | 3,185803160,A,G,rs9854769,G,0.1075 72 | 3,186927836,C,T,rs13095782,T,0.0285 73 | 3,186947857,C,T,rs3887925,T,0.0424 74 | 3,188024054,A,G,rs6777684,G,0.0569 75 | 3,196083316,T,C,rs7619708,T,0.0334 76 | 3,23414091,T,C,rs35352848,T,0.0623 77 | 3,35626269,A,G,rs10490871,G,0.0272 78 | 3,36828739,A,G,rs11129735,A,0.026 79 | 3,46948414,G,A,rs12491473,G,0.0215 80 | 3,47652174,C,T,rs62262091,T,0.0557 81 | 3,49943163,C,T,rs4688760,T,0.034 82 | 3,53089257,A,G,rs891368,G,0.0247 83 | 3,64722438,C,T,rs4132228,C,0.0471 84 | 3,71606894,C,T,rs844215,C,0.0262 85 | 3,89937130,T,C,rs11716527,C,0.0492 86 | 3,9472332,G,A,rs3872707,A,0.0446 87 | 4,105127134,T,C,rs17035289,C,0.0428 88 | 4,136162038,A,C,rs1296328,A,0.0251 89 | 4,139958775,G,T,rs2604918,G,0.0313 90 | 
4,152583143,C,T,rs6819331,C,0.0404 91 | 4,156731601,C,A,rs28819812,C,0.0384 92 | 4,18023861,G,A,rs2011603,A,0.0383 93 | 4,184792454,G,A,rs72695645,G,0.0611 94 | 4,20209330,A,G,rs11940813,G,0.0371 95 | 4,45184122,G,A,rs10938398,A,0.0425 96 | 4,48892964,G,A,rs2289065,G,0.0272 97 | 4,6291188,A,G,rs10010131,G,0.0816 98 | 4,751184,G,T,rs1531583,T,0.0994 99 | 4,88818328,C,T,rs13131633,T,0.0231 100 | 4,88830707,C,T,rs7660000,C,0.0309 101 | 4,90322714,G,A,rs7656001,A,0.026 102 | 5,103250706,G,A,rs75432112,A,0.1344 103 | 5,134529762,A,G,rs329124,G,0.0259 104 | 5,14788235,T,C,rs31931,T,0.038 105 | 5,158599736,C,T,rs890940,T,0.0484 106 | 5,53478680,G,A,rs62370480,A,0.0335 107 | 5,53976834,G,A,rs4865796,A,0.0466 108 | 5,54029680,A,G,rs31229,G,0.0391 109 | 5,56503357,C,A,rs157843,A,0.0688 110 | 5,56566067,G,A,rs9687846,A,0.0659 111 | 5,56569218,C,T,rs66485956,T,0.0385 112 | 5,68418419,A,G,rs4976033,G,0.028 113 | 5,75660016,G,A,rs253412,A,0.0459 114 | 5,77131486,G,A,rs6878122,G,0.0554 115 | 5,79141082,C,A,rs12519500,C,0.038 116 | 5,88370824,G,T,rs2410763,T,0.031 117 | 6,125731213,A,G,rs2008027,G,0.0272 118 | 6,126470949,A,G,rs11759026,G,0.0646 119 | 6,127093693,A,G,rs719727,A,0.056 120 | 6,139515991,T,C,rs11155073,T,0.0304 121 | 6,153106967,A,G,rs9383649,G,0.0329 122 | 6,160354985,C,A,rs543159,C,0.0324 123 | 6,163711969,C,T,rs4709746,C,0.0581 124 | 6,20679478,A,G,rs7756992,G,0.1217 125 | 6,20736006,C,A,rs9358361,A,0.043 126 | 6,21008508,G,A,rs12192642,G,0.0545 127 | 6,21079126,A,G,rs11967298,A,0.0378 128 | 6,34279270,C,T,rs77136196,T,0.0662 129 | 6,39048860,C,T,rs10305420,C,0.0316 130 | 6,39089153,T,C,rs9296291,T,0.0325 131 | 6,40441504,T,C,rs34298980,T,0.0381 132 | 6,41044666,A,G,rs4714422,G,0.0293 133 | 6,43790159,C,A,rs998584,A,0.0365 134 | 6,43834141,C,T,rs2894536,C,0.0384 135 | 6,43843237,G,A,rs9369425,G,0.0384 136 | 6,50821065,A,C,rs3798519,C,0.0495 137 | 6,7084624,G,T,rs4959424,G,0.0343 138 | 6,7231610,G,A,rs9379084,G,0.0747 139 | 6,7275708,T,C,rs11243150,T,0.0447 140 | 
7,100715101,A,G,rs534043,G,0.0447 141 | 7,102849009,A,G,rs56269620,A,0.0316 142 | 7,130768623,A,G,rs1596972,G,0.0403 143 | 7,14858657,C,T,rs17168486,T,0.069 144 | 7,15024684,G,T,rs2191349,T,0.0661 145 | 7,150840547,G,A,rs62492368,A,0.0342 146 | 7,157147880,A,G,rs7798816,A,0.0489 147 | 7,15886603,T,C,rs38221,T,0.033 148 | 7,23523649,C,T,rs76365198,C,0.0496 149 | 7,2721116,C,A,rs798549,C,0.0295 150 | 7,28160478,C,T,rs1513272,C,0.0811 151 | 7,30688836,C,T,rs917195,C,0.0467 152 | 7,40777054,C,T,rs17439448,T,0.0404 153 | 7,44161285,T,C,rs2041547,C,0.0385 154 | 7,44195138,C,T,rs2908286,T,0.0683 155 | 7,44325950,C,T,rs116913033,C,0.0408 156 | 7,4643627,G,A,rs62450857,A,0.0388 157 | 7,50514274,A,G,rs2876826,G,0.0306 158 | 7,50960703,T,C,rs1018942,T,0.0711 159 | 7,70320618,G,A,rs1880368,A,0.0311 160 | 7,74693803,G,A,rs67755137,A,0.0333 161 | 8,10138879,A,G,rs34990153,A,0.0384 162 | 8,10342743,C,T,rs73195303,C,0.0314 163 | 8,10928869,C,A,rs28566988,C,0.0392 164 | 8,115627247,T,C,rs2737226,T,0.0383 165 | 8,115876242,A,C,rs1569339,C,0.0347 166 | 8,117173494,A,G,rs11558471,A,0.1031 167 | 8,14267300,A,C,rs17294565,C,0.0274 168 | 8,144285362,T,C,rs7828303,T,0.0427 169 | 8,19973410,C,T,rs10096633,C,0.0404 170 | 8,26015118,A,G,rs17818197,G,0.0346 171 | 8,26031751,C,T,rs7834679,C,0.0306 172 | 8,30996517,A,G,rs2725371,A,0.0371 173 | 8,41643342,C,T,rs59191643,T,0.0444 174 | 8,41651058,C,A,rs13262861,C,0.1016 175 | 8,41672271,C,T,rs80105613,T,0.0641 176 | 8,8342448,G,A,rs62496027,A,0.0276 177 | 8,9324101,G,A,rs2126263,G,0.0608 178 | 8,9404091,G,A,rs62493853,A,0.0563 179 | 8,94672919,A,C,rs11786992,A,0.0319 180 | 8,94919116,C,T,rs75912292,T,0.0783 181 | 9,1039939,G,A,rs756145,A,0.0286 182 | 9,114181077,A,G,rs1431819,G,0.0293 183 | 9,123351733,T,C,rs4838049,C,0.027 184 | 9,123824284,C,A,rs1752169,A,0.0317 185 | 9,133269828,C,T,rs495203,T,0.0493 186 | 9,136355594,T,C,rs78503878,C,0.0614 187 | 9,19080354,A,G,rs10963942,G,0.0371 188 | 9,22003368,G,A,rs1063192,A,0.0434 189 | 
9,22134095,T,C,rs10811661,T,0.1379 190 | 9,22137686,T,G,rs7018475,G,0.1094 191 | 9,28410685,T,C,rs1412234,C,0.044 192 | 9,4291928,A,C,rs10974438,C,0.047 193 | 9,94180777,G,A,rs10821311,A,0.0358 194 | 10,100152307,T,C,rs2862954,T,0.0291 195 | 10,111288563,C,T,rs10885123,C,0.0316 196 | 10,112989259,G,A,rs11196181,G,0.1381 197 | 10,112989975,G,A,rs35011184,A,0.2608 198 | 10,112999686,G,A,rs11196187,A,0.1943 199 | 10,113064714,G,A,rs10885410,G,0.0895 200 | 10,113101545,A,G,rs10885414,A,0.0787 201 | 10,122433665,T,G,rs2280141,T,0.0453 202 | 10,12265895,C,T,rs11257655,T,0.0907 203 | 10,33729802,T,C,rs36051838,C,0.044 204 | 10,63545492,C,T,rs12263348,T,0.0282 205 | 10,68583018,C,T,rs10998304,C,0.0306 206 | 10,69340132,G,A,rs10159477,G,0.0408 207 | 10,69692529,T,C,rs2812533,T,0.0383 208 | 10,70888579,C,T,rs827237,T,0.0365 209 | 10,73839369,A,G,rs2675662,A,0.0267 210 | 10,75887349,G,A,rs7099048,A,0.0282 211 | 10,92105487,C,T,rs72807217,T,0.0478 212 | 10,92196795,T,G,rs7071943,G,0.0443 213 | 10,92703125,C,T,rs1111875,C,0.0923 214 | 10,92740354,G,A,rs11187152,G,0.0537 215 | 10,97299888,C,A,rs10882891,C,0.0313 216 | 11,1675619,C,T,rs2334499,T,0.0283 217 | 11,17388025,T,C,rs5219,T,0.0685 218 | 11,2080053,G,A,rs11600952,A,0.0356 219 | 11,2176056,A,G,rs4929965,A,0.0619 220 | 11,2671019,C,T,rs231360,T,0.0506 221 | 11,2835964,A,C,rs2237895,C,0.0725 222 | 11,28513351,G,A,rs4923543,A,0.0238 223 | 11,2887735,A,G,rs450563,G,0.0341 224 | 11,32439327,C,T,rs7943101,T,0.0358 225 | 11,43855148,C,T,rs11555762,T,0.0407 226 | 11,45834447,C,T,rs6485644,C,0.0294 227 | 11,47659818,T,G,rs1056387,T,0.0352 228 | 11,47870031,T,C,rs11604324,C,0.0736 229 | 11,61798436,T,C,rs174541,T,0.0292 230 | 11,64263769,C,T,rs35169799,T,0.0504 231 | 11,65527328,C,T,rs1783541,T,0.0484 232 | 11,65638129,G,T,rs2306363,G,0.0438 233 | 11,69234580,C,T,rs55974245,C,0.037 234 | 11,72721940,G,A,rs11603334,G,0.0899 235 | 11,76528106,C,T,rs2513523,T,0.0255 236 | 11,8632981,T,C,rs10769936,C,0.0349 237 | 
11,92983658,G,A,rs11020132,A,0.1424 238 | 11,93280221,T,G,rs7943372,G,0.0382 239 | 12,108224853,C,T,rs3764002,C,0.0396 240 | 12,117974568,G,A,rs34965774,A,0.0521 241 | 12,120979061,C,T,rs1800574,T,0.147 242 | 12,121017786,G,A,rs56158042,G,0.0566 243 | 12,121222984,T,C,rs25643,C,0.0248 244 | 12,121483887,C,T,rs7977709,C,0.0261 245 | 12,123008576,A,G,rs12820906,A,0.0433 246 | 12,123863518,C,T,rs11057376,T,0.0394 247 | 12,123973455,C,A,rs12823740,C,0.041 248 | 12,124025844,C,T,rs10773051,C,0.035 249 | 12,12718165,T,G,rs2066827,G,0.0351 250 | 12,132493708,C,T,rs11614914,T,0.0389 251 | 12,21628312,G,T,rs10841868,G,0.0315 252 | 12,26138216,C,T,rs56008051,C,0.032 253 | 12,26312652,C,T,rs11048458,T,0.0458 254 | 12,41444433,T,C,rs2733289,C,0.0301 255 | 12,4265207,A,G,rs11063069,G,0.0458 256 | 12,4275530,C,T,rs3217792,C,0.1108 257 | 12,4275678,T,G,rs76895963,T,0.4388 258 | 12,48342520,C,A,rs2732480,C,0.0337 259 | 12,49869365,G,A,rs7132908,A,0.033 260 | 12,6572620,A,G,rs67013744,G,0.035 261 | 12,65965972,C,A,rs8756,A,0.0432 262 | 12,71129263,C,A,rs1705263,C,0.0398 263 | 12,95534337,C,A,rs11108094,A,0.0601 264 | 12,97454449,C,T,rs113036477,C,0.0715 265 | 13,30468315,G,T,rs11842871,G,0.0299 266 | 13,32980164,G,A,rs576674,G,0.0612 267 | 13,41114265,A,G,rs4397977,A,0.0291 268 | 13,50519978,T,G,rs9316500,T,0.0473 269 | 13,58082465,T,C,rs9563574,T,0.0406 270 | 13,58562303,C,T,rs4886092,C,0.0198 271 | 13,80143021,G,A,rs1359790,G,0.0795 272 | 13,91302938,G,A,rs9523299,G,0.0418 273 | 14,102910984,G,A,rs10133111,A,0.0341 274 | 14,103393972,T,G,rs12890750,G,0.0284 275 | 14,22819744,A,G,rs17122776,G,0.029 276 | 14,29275326,A,G,rs8005994,A,0.0265 277 | 14,32833676,G,T,rs17522122,T,0.0343 278 | 14,46844338,G,A,rs2933211,A,0.0268 279 | 14,69059590,C,T,rs4899280,T,0.0276 280 | 14,76834520,C,T,rs2056857,C,0.0261 281 | 14,79318962,A,G,rs10498536,A,0.043 282 | 14,79473182,C,T,rs10145154,T,0.0546 283 | 15,39354022,A,G,rs17622532,A,0.026 284 | 15,40088651,G,A,rs2242186,A,0.0643 285 | 
15,40106553,T,C,rs484943,T,0.0332 286 | 15,52807109,T,C,rs75332279,C,0.0559 287 | 15,60646617,C,A,rs8033609,A,0.0265 288 | 15,62099409,C,T,rs7163757,C,0.0405 289 | 15,75522047,C,T,rs6495182,C,0.0406 290 | 15,77489993,A,G,rs12910361,G,0.0716 291 | 15,89836982,C,A,rs8031576,C,0.057 292 | 15,90968837,G,A,rs2290203,A,0.0557 293 | 16,15059860,T,C,rs9927842,T,0.0375 294 | 16,250389,T,C,rs55857387,T,0.0524 295 | 16,28906323,T,C,rs7188071,T,0.0292 296 | 16,29946895,G,A,rs8054556,A,0.0364 297 | 16,53395261,C,T,rs2908797,C,0.0363 298 | 16,53767042,T,C,rs1421085,C,0.1177 299 | 16,53794125,C,T,rs73612051,C,0.0565 300 | 16,53832063,C,T,rs9302652,C,0.044 301 | 16,69534400,G,T,rs2032912,G,0.0421 302 | 16,81501185,T,C,rs2925979,T,0.0448 303 | 16,81576875,G,A,rs11642655,A,0.0288 304 | 16,918292,G,A,rs4984980,A,0.0351 305 | 17,17807956,A,G,rs4925114,A,0.033 306 | 17,31310290,G,A,rs12602834,G,0.0289 307 | 17,36486677,A,G,rs4796224,G,0.0252 308 | 17,37703678,G,A,rs2189301,G,0.0462 309 | 17,37740776,A,G,rs11657964,A,0.0589 310 | 17,4149155,T,G,rs11652572,T,0.1151 311 | 17,42544897,C,T,rs684214,T,0.0424 312 | 17,48046280,A,C,rs9895554,A,0.0549 313 | 17,48982960,C,A,rs35895680,C,0.0559 314 | 17,578364,C,T,rs11870735,T,0.0339 315 | 17,67853811,T,G,rs2046323,G,0.0484 316 | 17,7646363,C,T,rs1641523,C,0.0255 317 | 17,77390827,A,G,rs1656794,G,0.0311 318 | 17,78765957,G,A,rs62075585,G,0.0295 319 | 17,9884641,G,A,rs7219033,A,0.0288 320 | 18,13566625,C,T,rs113780182,T,0.0309 321 | 18,23503774,C,T,rs303760,T,0.0342 322 | 18,42507133,G,T,rs1431841,T,0.0309 323 | 18,55383415,A,C,rs72926932,C,0.0749 324 | 18,57013486,G,A,rs12969494,G,0.0292 325 | 18,60178844,C,T,rs663640,T,0.0503 326 | 18,60383735,C,T,rs79688165,C,0.1216 327 | 18,63178651,T,C,rs12454712,T,0.0412 328 | 19,18723704,C,T,rs10404726,C,0.0277 329 | 19,19268740,C,T,rs58542926,T,0.089 330 | 19,33401503,C,T,rs889138,C,0.0312 331 | 19,33444196,C,T,rs2287821,C,0.0239 332 | 19,44854120,T,C,rs4803764,T,0.0342 333 | 
19,44908684,T,C,rs429358,T,0.073 334 | 19,45653979,G,A,rs10407429,G,0.053900000000000003 335 | 19,47076928,A,G,rs11667244,G,0.0354 336 | 19,4949909,G,A,rs12977104,A,0.0412 337 | 19,7235135,A,G,rs17175860,G,0.0445 338 | 19,7903283,G,A,rs2115107,A,0.0382 339 | 19,8364439,G,A,rs116843064,G,0.0965 340 | 20,33720469,C,T,rs67611724,T,0.0443 341 | 20,34008898,G,A,rs2268078,A,0.0387 342 | 20,41203988,T,C,rs17265513,C,0.0325 343 | 20,44366172,T,C,rs12625671,C,0.0653 344 | 20,44413724,C,T,rs1800961,T,0.1077 345 | 20,46966072,G,A,rs6066138,G,0.0451 346 | 20,52383088,T,C,rs2426439,C,0.0365 347 | 20,58812207,A,G,rs911300,G,0.0348 348 | 20,62649310,C,T,rs2427363,C,0.027 349 | 20,63691346,A,G,rs6011033,G,0.0338 350 | 20,63839432,G,A,rs4809369,G,0.0337 351 | 20,64080106,C,T,rs8126001,C,0.0281 352 | 22,38204535,C,T,rs2267373,T,0.0284 353 | 22,43928975,G,A,rs3747207,A,0.0471 354 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/VAT Neg.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,144503607,A,C,rs2033159,C,0.807589704525295 3 | 2,226340889,A,G,rs12694695,A,1.05866105098753 4 | 3,12288284,C,T,rs17036160,C,0.969348804976055 5 | 3,171015287,G,A,rs9873618,G,0.830976371594637 6 | 4,152583143,C,T,rs6819331,C,0.821925217224353 7 | 5,134529762,A,G,rs329124,G,0.933352162688514 8 | 5,158599736,C,T,rs890940,T,0.76040754992184 9 | 5,56503357,C,A,rs157843,A,1.88634886045515 10 | 6,126470949,A,G,rs11759026,G,1.10123173241437 11 | 6,127093693,A,G,rs719727,A,1.13808379282431 12 | 6,160354985,C,A,rs543159,C,1.59514925179458 13 | 6,20736006,C,A,rs9358361,A,0.964379582992306 14 | 6,43843237,G,A,rs9369425,G,1.43338766867525 15 | 6,7084624,G,T,rs4959424,G,0.883157584356791 16 | 7,130768623,A,G,rs1596972,G,1.03325167441624 17 | 8,26031751,C,T,rs7834679,C,1.21208127201891 18 | 8,94919116,C,T,rs75912292,T,0.734691383702554 19 
| 9,1039939,G,A,rs756145,A,0.851590615916217 20 | 11,1675619,C,T,rs2334499,T,0.80137380662311 21 | 11,28513351,G,A,rs4923543,A,0.924268516061603 22 | 11,72721940,G,A,rs11603334,G,1.77517413008634 23 | 12,4265207,A,G,rs11063069,G,0.776917146095885 24 | 13,80143021,G,A,rs1359790,G,0.806173341281547 25 | 15,62099409,C,T,rs7163757,C,0.766860570318094 26 | 17,37703678,G,A,rs2189301,G,0.890959695586082 27 | 17,578364,C,T,rs11870735,T,0.803746718455257 28 | 19,33401503,C,T,rs889138,C,1.78066725909926 29 | 19,33444196,C,T,rs2287821,C,0.845286039772586 30 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/ALP Neg.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,27508073,T,C,rs1260326,C,0.810841422500325 3 | 6,139515991,T,C,rs11155073,T,1.27808171314604 4 | 6,43846888,G,A,rs10456526,A,0.783560908019758 5 | 9,133274084,T,C,rs529565,C,7.5980383148254 6 | 11,61798436,T,C,rs174541,T,1.97989469578866 7 | 19,19268740,C,T,rs58542926,T,1.23063141179833 8 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Beta Cell 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,117626841,G,A,rs2282456,A,0.790593075157008 3 | 1,147257439,A,G,rs3766524,G,0.785182467856915 4 | 1,201880798,A,G,rs41304257,A,0.786528889062839 5 | 1,21767674,C,T,rs12122128,C,1.23150726989541 6 | 1,46072648,T,C,rs785513,C,0.957021530441465 7 | 1,6612669,A,C,rs11583755,C,1.37031857925042 8 | 2,120579620,A,G,rs9308614,A,0.794281728405421 9 | 2,148675516,A,G,rs7558502,A,0.788501335050814 10 | 2,15958271,A,G,rs6750986,G,1.06347249633723 11 | 2,212964997,T,C,rs4673712,T,1.1692230131902 12 | 2,233395552,A,G,rs838722,G,1.25460247019725 13 | 
2,67651196,T,C,rs1430780,T,0.943112677071227 14 | 3,186947857,C,T,rs3887925,T,1.75581631491611 15 | 3,23415589,T,C,rs13094957,T,1.21016941892711 16 | 3,28690319,G,A,rs9869477,G,0.825613746430223 17 | 3,35628658,G,A,rs1470560,A,1.07868296862592 18 | 3,9472332,G,A,rs3872707,A,1.29719164698694 19 | 4,105127134,T,C,rs17035289,C,1.60712536039401 20 | 4,155776632,T,C,rs2125799,C,1.04059530188695 21 | 4,726892,G,A,rs73221116,A,1.17145106687441 22 | 5,151945039,T,G,rs302395,G,0.862187129765993 23 | 5,158602726,G,A,rs1650505,A,0.801378806152145 24 | 6,138887596,C,A,rs1188832,C,0.895490707331789 25 | 6,20686765,C,A,rs9368222,A,2.49835691340481 26 | 6,41044378,A,C,rs9367093,C,0.780216120296282 27 | 6,43790159,C,A,rs998584,A,1.34310954078254 28 | 7,102793231,A,G,rs112613078,A,0.823742646266636 29 | 7,30688836,C,T,rs917195,C,1.29952150472681 30 | 7,36703281,C,T,rs6978327,T,0.806240575751213 31 | 7,74662156,A,G,rs13238568,G,1.32515294362145 32 | 7,90174320,T,C,rs6956980,C,0.781549811483159 33 | 8,117172544,C,T,rs13266634,C,1.53444442512662 34 | 8,134763303,C,T,rs4294149,T,1.25838719583287 35 | 8,14291481,A,C,rs35753840,C,0.819747448050646 36 | 8,38485494,T,C,rs328301,T,1.07567462759089 37 | 9,114181077,A,G,rs1431819,G,0.854451885941679 38 | 9,123331143,C,T,rs10739629,C,1.38927139145449 39 | 9,136353630,A,G,rs28642213,G,0.807196935889104 40 | 9,136360621,G,A,rs11145958,A,1.15528474059506 41 | 9,19074540,A,G,rs12380322,G,1.36006311916359 42 | 9,22134095,T,C,rs10811661,T,1.36383382139055 43 | 10,114054165,C,T,rs10787516,C,0.822099927248126 44 | 10,122433665,T,G,rs2280141,T,0.83747716545817 45 | 10,12265895,C,T,rs11257655,T,0.962620651889189 46 | 10,13498869,A,C,rs11258422,C,0.964423963268342 47 | 10,92703125,C,T,rs1111875,C,2.35424533937019 48 | 10,97331612,G,A,rs945187,G,1.01454202820145 49 | 11,1675619,C,T,rs2334499,T,1.22212251401522 50 | 11,17396930,C,A,rs757110,C,0.812644009592181 51 | 11,30586586,G,A,rs11031140,A,1.14106624642707 52 | 
11,43856909,G,A,rs35251247,A,1.02641196863762 53 | 11,61798436,T,C,rs174541,T,1.52395304158992 54 | 11,69071719,T,C,rs3750957,C,1.410036755746 55 | 11,8655516,C,A,rs7941510,C,0.834137037387623 56 | 12,108236003,G,A,rs1426371,G,1.60222736004863 57 | 12,121017786,G,A,rs56158042,G,1.35511608016868 58 | 12,121544257,A,G,rs4981013,G,0.947263136685008 59 | 12,21690642,C,T,rs11046164,C,1.64158566799243 60 | 12,57574955,G,A,rs11172254,G,0.826223443528635 61 | 12,6582286,G,A,rs7316626,A,0.787098928431362 62 | 12,80923335,A,G,rs11114655,G,0.9771018307699 63 | 14,22819980,T,C,rs17122782,C,0.786334241817425 64 | 14,76834520,C,T,rs2056857,C,0.924686226918637 65 | 15,62099409,C,T,rs7163757,C,2.397207778958 66 | 15,83878470,C,T,rs1812707,T,1.03292104876316 67 | 15,89838046,C,T,rs893617,C,1.07569550074564 68 | 15,90968837,G,A,rs2290203,A,1.02709147510979 69 | 16,250389,T,C,rs55857387,T,1.10768821641707 70 | 16,78955,C,T,rs1013358,C,0.880647004389011 71 | 16,89564222,C,T,rs12932337,T,1.15003648357614 72 | 17,36506381,T,C,rs1109442,C,0.847605408953373 73 | 17,37739849,C,T,rs11651755,C,0.832093852088434 74 | 17,48101312,G,A,rs3744347,A,0.858605143270236 75 | 17,67649976,G,A,rs11658220,A,1.2145404391293 76 | 17,9884528,A,G,rs17810376,G,1.4907865287698 77 | 18,34002926,T,C,rs17747955,C,1.1213800974715 78 | 19,1646713,C,T,rs4807125,T,0.907214707837185 79 | 19,45655159,G,A,rs8107527,A,0.793392743864734 80 | 20,2119449,G,A,rs6137042,G,1.10261097898468 81 | 20,33848172,C,T,rs7274168,T,0.821809381249592 82 | 20,62649310,C,T,rs2427363,C,1.08007592675979 83 | 22,35309366,A,G,rs138771,A,1.37711769959318 84 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Beta Cell 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,117626841,G,A,rs2282456,A,0.93032919071777 3 | 1,213985913,T,C,rs340874,C,1.6762660744519 4 | 
1,229537208,G,A,rs348330,G,0.851039929325742 5 | 3,123346931,A,G,rs11708067,A,2.46744557309717 6 | 3,125202613,C,T,rs9873519,T,0.917203516585986 7 | 3,141382997,A,C,rs56243018,A,0.937403954956027 8 | 3,160434511,C,A,rs13403,C,0.877194450937002 9 | 3,171007094,T,C,rs8192675,T,2.55532848821731 10 | 3,185816694,G,A,rs9859406,A,1.02159387010531 11 | 3,188024054,A,G,rs6777684,G,1.18297263294537 12 | 5,77139179,G,A,rs7732130,G,1.11690554030416 13 | 6,20686765,C,A,rs9368222,A,1.45549628524273 14 | 6,7231610,G,A,rs9379084,G,0.975638874942903 15 | 6,7275693,A,G,rs11243149,A,0.934005086925184 16 | 7,14858657,C,T,rs17168486,T,1.50947366516019 17 | 7,15024630,G,T,rs2191348,T,2.30492784565317 18 | 7,28154778,C,A,rs860262,C,1.01713840341402 19 | 7,44184122,G,A,rs730497,A,4.56659061893173 20 | 7,44322946,C,T,rs11772021,C,0.904170136644996 21 | 7,90174320,T,C,rs6956980,C,0.849658415498303 22 | 8,117172544,C,T,rs13266634,C,2.56574126174382 23 | 8,94672919,A,C,rs11786992,A,0.960998820377985 24 | 8,94955144,A,G,rs10808671,A,0.785767900374254 25 | 9,136353630,A,G,rs28642213,G,0.980326704726829 26 | 9,22134095,T,C,rs10811661,T,1.44895718662821 27 | 9,22137686,T,G,rs7018475,G,0.931188856932379 28 | 9,4290541,C,A,rs10116772,A,1.33235499392144 29 | 10,111279909,C,T,rs11195502,C,1.39063168449064 30 | 10,112989975,G,A,rs35011184,A,2.66073173372723 31 | 10,12265895,C,T,rs11257655,T,1.28382554339938 32 | 11,2836003,G,A,rs60808706,G,1.08302359948751 33 | 11,45837033,G,A,rs12419690,G,0.815094115612855 34 | 11,61798436,T,C,rs174541,T,0.879486392485044 35 | 11,72721940,G,A,rs11603334,G,1.51934651195302 36 | 12,132493708,C,T,rs11614914,T,0.92752312567208 37 | 12,48342520,C,A,rs2732480,C,0.859374774395466 38 | 13,32988367,T,C,rs7997912,C,1.17886278318088 39 | 15,77489993,A,G,rs12910361,G,1.033996056745 40 | 16,250389,T,C,rs55857387,T,0.838644792384136 41 | 18,55383415,A,C,rs72926932,C,0.889741746333478 42 | -------------------------------------------------------------------------------- 
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Bilirubin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,233395552,A,G,rs838722,G,1.27131145158399 3 | 2,233759924,C,T,rs887829,T,14.3539837522901 4 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Cholesterol.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 5,75581978,T,C,rs5744672,T,1.08710434354778 3 | 15,58384622,G,A,rs11858759,G,0.946624771314544 4 | 19,19268740,C,T,rs58542926,T,1.96176642040467 5 | 19,44854120,T,C,rs4803764,T,2.86428721519638 6 | 19,44908684,T,C,rs429358,T,7.71724647555388 7 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Hyper Insulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,205075211,A,G,rs11240351,G,1.38958402750427 3 | 2,211410212,A,G,rs3828242,A,0.926494175774555 4 | 2,25412902,A,G,rs34845373,A,1.29579628911929 5 | 2,60357684,G,A,rs243021,A,1.11076642972781 6 | 3,36828739,A,G,rs11129735,A,1.12517756322701 7 | 3,49958085,T,C,rs6792892,C,1.76664185358465 8 | 3,88080986,C,T,rs73146095,C,1.32242042514351 9 | 4,44501486,G,A,rs34617913,A,0.948875110203746 10 | 4,51932458,T,C,rs1996617,C,1.17388003767607 11 | 5,134525973,C,T,rs329118,T,0.913841075351517 12 | 5,68418419,A,G,rs4976033,G,1.07548669590129 13 | 5,88401716,C,T,rs6870983,C,1.09002759214344 14 | 6,39048860,C,T,rs10305420,C,1.0232830052884 15 | 7,131889849,A,C,rs12667919,A,1.10651010188789 16 | 7,150840547,G,A,rs62492368,A,0.829251653337375 17 | 7,50510270,T,C,rs73121277,C,0.907315236654959 18 | 7,55734370,T,C,rs6972291,C,0.840283826446018 19 | 
8,125618324,G,T,rs72724622,T,1.1973906210298 20 | 8,144301594,C,T,rs13268508,T,1.14805880248489 21 | 8,56583505,G,A,rs3887059,A,0.856144433795765 22 | 9,95033139,G,A,rs6479591,A,0.943154663623236 23 | 10,68622422,A,G,rs10998338,A,1.29665590452712 24 | 10,86357561,C,A,rs11201992,C,0.823809510571429 25 | 11,20930691,G,A,rs16907058,A,1.15171901943159 26 | 11,65558683,G,A,rs12789028,A,0.979598620500255 27 | 12,123943784,T,C,rs4930726,T,1.08154227375956 28 | 12,132060098,C,T,rs11830241,T,1.30895050959448 29 | 12,20438398,G,A,rs7134150,A,1.9221010387598 30 | 12,50489375,T,C,rs4519166,C,0.965545904776462 31 | 14,102909694,C,T,rs4906272,T,1.44933193891777 32 | 14,29257736,C,A,rs2333486,C,0.963458317119539 33 | 15,49501823,T,C,rs7169799,C,0.985610373873054 34 | 15,52790274,G,T,rs2440317,T,0.874242975234916 35 | 15,58384622,G,A,rs11858759,G,0.97286839395891 36 | 15,98733292,G,T,rs59646751,T,1.16109993287073 37 | 16,56437498,T,C,rs7189122,C,0.794036018387664 38 | 16,917241,A,G,rs12918782,G,0.880733714884338 39 | 17,31301531,C,A,rs2040792,C,1.32680097001544 40 | 17,77377118,T,C,rs312827,C,0.927961953624098 41 | 19,13065206,G,T,rs76567647,G,0.824970466633634 42 | 19,7903283,G,A,rs2115107,A,1.26973952886874 43 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Lipodystrophy 1.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,11257875,T,C,rs7554251,C,1.12657196230347 3 | 1,172399170,A,G,rs7546252,A,1.6480616896423 4 | 1,205075211,A,G,rs11240351,G,0.829713414422773 5 | 1,219577375,G,T,rs4846567,G,2.24461761071688 6 | 1,50743476,T,C,rs79090772,T,0.87587766177136706 7 | 2,111496274,T,C,rs1345203,T,1.40731557289665 8 | 2,164651879,C,T,rs10184004,C,2.85459330697619 9 | 2,218303709,G,A,rs1877712,G,0.812724620542421 10 | 2,226241205,C,T,rs2943650,T,2.34046222220295 11 | 
2,65434334,A,C,rs12185610,A,0.800130752673367 12 | 3,12288284,C,T,rs17036160,C,1.32664508848256 13 | 3,129301935,A,G,rs6765930,A,0.793869800817226 14 | 3,185816694,G,A,rs9859406,A,1.20220197510624 15 | 3,64717718,G,T,rs66815886,G,1.98375391538072 16 | 4,144692655,G,A,rs6812830,G,1.2017416879526 17 | 4,156731601,C,A,rs28819812,C,1.42558363265606 18 | 4,88791970,C,T,rs9991328,T,1.95389709474007 19 | 5,158602726,G,A,rs1650505,A,0.848002161817244 20 | 5,56564954,A,G,rs3936511,G,1.9015115805769 21 | 5,68418419,A,G,rs4976033,G,0.930751438979016 22 | 6,139515991,T,C,rs11155073,T,2.85825785444522 23 | 6,160349886,T,G,rs501470,T,0.975867970776777 24 | 6,163711969,C,T,rs4709746,C,1.1573485541831 25 | 6,34276355,T,C,rs115245297,C,2.07064637140282 26 | 6,43790159,C,A,rs998584,A,3.58426411434303 27 | 7,130748625,C,T,rs4731702,C,0.817693983117954 28 | 7,25931533,G,A,rs6951827,G,0.835501837852443 29 | 7,28154778,C,A,rs860262,C,0.905773237340398 30 | 8,125618324,G,T,rs72724622,T,0.804415283565313 31 | 8,19973410,C,T,rs10096633,C,1.24960854946381 32 | 8,71507263,T,G,rs10096191,G,1.07475948476029 33 | 11,64333304,A,G,rs1662185,A,1.41127179514205 34 | 12,123943784,T,C,rs4930726,T,3.14739730540445 35 | 12,20317265,A,G,rs11045171,A,1.08591356581262 36 | 12,26310241,G,A,rs11048457,G,2.19887439257176 37 | 12,65965972,C,A,rs8756,A,0.950683509338436 38 | 12,71126981,T,G,rs10879261,G,0.859815702817528 39 | 15,58384622,G,A,rs11858759,G,0.884350253265618 40 | 15,63579093,C,T,rs7178762,C,0.928750119188819 41 | 16,81500184,C,A,rs56823429,C,2.0551961455962 42 | 17,17804815,T,C,rs11654081,T,1.30611229962082 43 | 18,63178651,T,C,rs12454712,T,0.862867594267646 44 | 19,33405526,A,C,rs4805881,A,1.00166925096288 45 | 19,7293108,T,C,rs8101064,T,0.898086333992418 46 | 20,52417142,G,A,rs4809906,G,0.935518590529031 47 | 20,64060707,A,C,rs6090040,A,0.818319211773247 48 | 22,38204535,C,T,rs2267373,T,1.17890503451983 49 | -------------------------------------------------------------------------------- 
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Lipodystrophy 2.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 2,120691632,C,T,rs62167015,C,0.797398616538251 3 | 2,164651879,C,T,rs10184004,C,1.29449369783007 4 | 2,226241205,C,T,rs2943650,T,2.13241645864393 5 | 2,65052280,A,G,rs2723065,A,1.08764223224661 6 | 3,12288284,C,T,rs17036160,C,2.22637661452041 7 | 3,136350630,G,T,rs667920,T,1.2460137382691 8 | 4,156731601,C,A,rs28819812,C,0.829824733359207 9 | 5,158602726,G,A,rs1650505,A,0.815428674289436 10 | 5,53976834,G,A,rs4865796,A,1.37888331795972 11 | 5,56511543,C,T,rs464605,T,1.94940914048045 12 | 5,56564954,A,G,rs3936511,G,0.960913815167076 13 | 5,68418419,A,G,rs4976033,G,1.11410145533654 14 | 6,163711969,C,T,rs4709746,C,0.946221868858236 15 | 7,130748625,C,T,rs4731702,C,1.04704307479767 16 | 8,10117074,A,G,rs60384372,A,0.839422035421327 17 | 8,19973410,C,T,rs10096633,C,1.12398176692462 18 | 8,37000965,A,G,rs13365225,G,0.860056466802468 19 | 9,1036552,C,T,rs7856320,T,0.863105213838364 20 | 9,134025582,G,A,rs379417,A,0.820532423562114 21 | 9,95516131,C,T,rs113154802,C,1.031773196178 22 | 10,100152437,C,T,rs1408579,C,1.83862303926759 23 | 12,65965972,C,A,rs8756,A,0.871596104642325 24 | 15,60646617,C,A,rs8033609,A,0.860433493797303 25 | 15,73681321,T,C,rs57909886,C,1.08695886258769 26 | 18,63178651,T,C,rs12454712,T,1.45357229233753 27 | 19,19268740,C,T,rs58542926,T,1.72865989211715 28 | 19,33405526,A,C,rs4805881,A,1.96702300087785 29 | 20,33710480,C,T,rs13042148,T,1.20194405879043 30 | 22,43928975,G,A,rs3747207,A,3.15066074218633 31 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Liver-Lipid.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 
2,27508073,T,C,rs1260326,C,7.65474930376802 3 | 6,126470949,A,G,rs11759026,G,0.879411210989312 4 | 11,61798436,T,C,rs174541,T,1.5345778005327 5 | 12,121017786,G,A,rs56158042,G,1.29916653195848 6 | 15,43557688,A,G,rs2470134,G,1.31302163060977 7 | 15,58384622,G,A,rs11858759,G,1.23315729718332 8 | 15,62099409,C,T,rs7163757,C,0.850792322763108 9 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Obesity.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,111747361,C,T,rs197374,T,0.944977870373138 3 | 1,177919890,A,C,rs539515,C,1.77618274987658 4 | 1,72285869,A,G,rs2613499,A,1.07509681204075 5 | 2,160477361,A,C,rs6710938,A,1.10942198764396 6 | 2,174332817,C,A,rs12992995,C,0.888638338965529 7 | 2,180753927,A,G,rs6741676,A,1.12851090982705 8 | 2,204511186,C,A,rs4482463,C,0.907626096223435 9 | 2,228107168,T,C,rs13415288,C,0.868571244585216 10 | 2,58748008,T,C,rs12986742,C,1.11690750554327 11 | 2,59084401,G,A,rs980183,G,1.09369174741551 12 | 2,60057965,T,C,rs980329,C,0.869351547833271 13 | 2,653874,C,T,rs10188334,C,1.81854752841946 14 | 3,132032000,G,A,rs9857204,A,1.10991821922017 15 | 3,15664617,G,A,rs924753,G,1.05467560029361 16 | 3,173389653,T,C,rs247975,C,1.03582004807104 17 | 3,173992905,C,T,rs59489841,C,0.80456892598911 18 | 3,35628658,G,A,rs1470560,A,1.10218002850575 19 | 3,49958085,T,C,rs6792892,C,1.53620660614253 20 | 3,94262216,G,T,rs978444,G,0.998587558821075 21 | 3,9472332,G,A,rs3872707,A,0.785057832791062 22 | 4,136162038,A,C,rs1296328,A,1.03256581572728 23 | 4,45184122,G,A,rs10938398,A,1.68364771299316 24 | 4,90328687,T,C,rs17227797,C,0.887036500437549 25 | 5,154167849,G,A,rs7701886,G,0.869335843477036 26 | 5,79250470,G,A,rs2591392,G,0.839894318205273 27 | 5,88401716,C,T,rs6870983,C,0.908904807832778 28 | 5,88647527,C,T,rs13162708,T,0.851267963168398 29 | 
5,96514546,A,C,rs261967,C,0.876782871919807 30 | 6,154033965,A,G,rs6936615,G,0.881816019309221 31 | 6,40441504,T,C,rs34298980,T,1.02735434805859 32 | 6,50821065,A,C,rs3798519,C,1.37909130885911 33 | 7,147961447,G,A,rs1922879,G,0.807265898202514 34 | 7,36703281,C,T,rs6978327,T,0.807848279510678 35 | 7,70184697,C,A,rs6975279,A,0.80829482242000406 36 | 7,78199422,G,A,rs3779274,G,0.803907382534401 37 | 8,115553138,T,C,rs3802219,T,0.789739152097947 38 | 8,30995310,T,C,rs2725370,T,1.14259309273326 39 | 8,34645053,A,C,rs4463416,A,0.829601893953105 40 | 9,130911265,C,T,rs6597649,T,0.922589397655005 41 | 9,28410685,T,C,rs1412234,C,1.24339704923915 42 | 10,33708299,A,C,rs71495046,C,0.840352973389949 43 | 10,75799410,T,C,rs11001500,C,0.898393574857749 44 | 11,27664649,G,T,rs10767659,G,1.78723583091228 45 | 11,43856909,G,A,rs35251247,A,1.26178987772888 46 | 11,47589600,C,T,rs11039307,T,1.13128399924468 47 | 11,8655516,C,A,rs7941510,C,0.915074992836882 48 | 12,105894667,G,A,rs12825669,G,0.797567267463839 49 | 12,41469591,C,T,rs2730827,T,0.812427188756336 50 | 12,49869365,G,A,rs7132908,A,1.66007378335109 51 | 12,60857620,T,C,rs12372209,T,0.862095802597203 52 | 13,30443131,G,A,rs12856169,G,1.02779513580648 53 | 13,53533448,G,T,rs9568868,T,0.852299702586262 54 | 13,58112626,T,C,rs7988244,T,1.0455155197036 55 | 14,102909694,C,T,rs4906272,T,0.930476424007877 56 | 14,103493689,A,G,rs56365443,A,0.812300807455789 57 | 14,32834334,C,T,rs12883788,T,0.929601085192436 58 | 15,83878470,C,T,rs1812707,T,0.963076983625625 59 | 16,15059860,T,C,rs9927842,T,0.80581894840142 60 | 16,20359659,A,C,rs9929710,A,0.803482209794921 61 | 16,287691,C,T,rs34665498,C,0.801311307225093 62 | 16,28886131,G,A,rs8056890,A,1.73326630512239 63 | 16,29946895,G,A,rs8054556,A,1.27926519064507 64 | 16,53767042,T,C,rs1421085,C,3.93479302022367 65 | 16,69632780,G,A,rs244415,G,1.16597243080594 66 | 17,36506381,T,C,rs1109442,C,0.860462806182295 67 | 17,67829132,T,C,rs12603589,C,1.02962326612309 68 | 
17,80783826,A,G,rs11150745,A,0.930964254926842 69 | 18,23503774,C,T,rs303760,T,1.08178030111587 70 | 18,60161902,T,C,rs6567160,C,2.22011682246798 71 | 18,65759743,G,A,rs2032217,A,0.809834682625973 72 | 19,18723704,C,T,rs10404726,C,1.17123647879523 73 | 19,44908684,T,C,rs429358,T,0.859499197841359 74 | 19,47093845,T,C,rs10408163,C,0.835333895244067 75 | 20,52417142,G,A,rs4809906,G,0.88512812164516 76 | 22,41197577,C,T,rs11913442,T,0.845173784382441 77 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Proinsulin.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 1,117626841,G,A,rs2282456,A,0.781777368119471 3 | 4,152599323,C,T,rs6813195,C,0.972242142406481 4 | 5,134525973,C,T,rs329118,T,1.12383552565093 5 | 5,56511543,C,T,rs464605,T,0.860181484456669 6 | 6,126470949,A,G,rs11759026,G,0.785301222900212 7 | 6,127091583,G,A,rs12192275,G,1.01236936342738 8 | 6,160349886,T,G,rs501470,T,2.0120436508473 9 | 6,43846888,G,A,rs10456526,A,2.16139981815442 10 | 8,25607154,G,T,rs73221948,T,1.81084320190896 11 | 8,41651058,C,A,rs13262861,C,0.926474260567206 12 | 8,94672919,A,C,rs11786992,A,0.85945345793197 13 | 9,95033139,G,A,rs6479591,A,0.816311385231373 14 | 11,32459631,C,T,rs7927401,T,0.914822126342247 15 | 11,72721940,G,A,rs11603334,G,2.57463496862983 16 | 13,80141758,G,A,rs1215451,G,0.871278308670939 17 | 19,33405526,A,C,rs4805881,A,0.825142311008953 18 | -------------------------------------------------------------------------------- /Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/SHBG-LpA.csv: -------------------------------------------------------------------------------- 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight 2 | 3,136350630,G,T,rs667920,T,1.36652016101725 3 | 6,160349886,T,G,rs501470,T,2.13268926815861 4 | 17,7628647,T,C,rs858519,T,6.61457407629734 5 | 
--------------------------------------------------------------------------------
/doc/Variant clustering preprocessing pipeline_plan_KW.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gwas-partitioning/bnmf-clustering/8f0590e82dff74655450cc87bf3f62c00e074545/doc/Variant clustering preprocessing pipeline_plan_KW.docx
--------------------------------------------------------------------------------
/example_data/clustering_data_sources_example.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gwas-partitioning/bnmf-clustering/8f0590e82dff74655450cc87bf3f62c00e074545/example_data/clustering_data_sources_example.xlsx
--------------------------------------------------------------------------------
/scripts/.Rhistory:
--------------------------------------------------------------------------------
1 | start=Sys.time()
2 | # load requires packages
3 | install.packages("pacman")
4 | pacman::p_load(tidyverse, data.table, readxl, magrittr, dplyr, strex,
5 |                rstudioapi, DT, kableExtra, GenomicRanges)
6 | if (!require("BiocManager", quietly = TRUE))
7 |   install.packages("BiocManager")
8 | BiocManager::install("GenomicRanges")
9 | # SECTION 1: PULL IN GWAS INFORMATION
10 | data_dir = "../example_data/"
11 | rsID_map_file <- file.path(data_dir, "rsID_map_example.txt") # From dbSNP v1.38 -- maps positional IDs to rsIDs
12 | # GWAS for main trait
13 | gwas <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
14 |                    sheet="main_gwas") %>%
15 |   data.frame()
16 | setwd("~/Partners HealthCare Dropbox/Kirk Smith/MGH/bnmf-clustering/scripts")
17 | data_dir = "../example_data/"
18 | rsID_map_file <- file.path(data_dir, "rsID_map_example.txt") # From dbSNP v1.38 -- maps positional IDs to rsIDs
19 | # GWAS for main trait
20 | gwas <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
21 |                    sheet="main_gwas") %>%
22 |   data.frame()
23 | # GWAS for clustering traits
24 | gwas_traits <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
25 |                           sheet="trait_gwas")
26 | # GWAS to be used for final allele alignment
27 | main_ss_filepath <- gwas %>% filter(largest=="Yes") %>% pull(full_path)
28 | gwas_ss_files <- setNames(gwas$full_path, gwas$study)
29 | trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait)
30 | trait_ss_size <- setNames(gwas_traits$sample_size, gwas_traits$trait)
31 | View(gwas)
32 | View(gwas_traits)
33 | # GWAS to be used for final allele alignment
34 | main_ss_filepath <- gwas %>% filter(largest=="Yes") %>% pull(full_path)
35 | gwas_ss_files <- setNames(gwas$full_path, gwas$study)
36 | trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait)
37 | trait_ss_size <- setNames(gwas_traits$sample_size, gwas_traits$trait)
38 | # SECTION 2: PULL SIGNIFICANT VARIANTS FROM MAIN TRAIT GWAS
39 | # P-value threshold for variants in main trait
40 | PVCUTOFF = 5e-8
41 | n_gwas <- length(gwas_ss_files)
42 | vars_sig = data.frame(VAR_ID = as.character(),
43 |                       P_VALUE = as.numeric(),
44 |                       Risk_Allele=as.character(),
45 |                       GWAS=as.character())
46 | print(sprintf("Pulling significant SNPs w/ pval<%.1e from %i T2D GWAS...", PVCUTOFF, n_gwas))
47 | for(i in 1:n_gwas) {
48 |   print(paste0("...Reading ", names(gwas_ss_files)[i], "..."))
49 |   vars <- fread(gwas_ss_files[i], data.table = F, stringsAsFactors=F)
50 |   if (!"BETA" %in% colnames(vars)){
51 |     print("Converting Odds Ratio to Log Odds Ratio...")
52 |     vars <- vars %>%
53 |       mutate(BETA = log(as.numeric(ODDS_RATIO)))
54 |   }
55 |   vars <- vars %>%
56 |     filter(as.numeric(P_VALUE) <= PVCUTOFF) %>%
57 |     subset(grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]", VAR_ID)) %>%
58 |     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_", remove = F) %>%
59 |     mutate(Risk_Allele = ifelse(BETA>=0, ALT, REF)) %>%
60 |     mutate(GWAS = gwas$study[i]) %>%
61 |     select(VAR_ID, P_VALUE, Risk_Allele, GWAS)
62 |   print(nrow(vars))
63 |   vars_sig = rbind(vars_sig, vars)
64 | }
65 | print(paste("No. total SNPs below pval cutoff:",nrow(vars_sig)))
66 | # remove duplicates
67 | vars_sig_uniq <- vars_sig %>%
68 |   arrange(VAR_ID, P_VALUE) %>%
69 |   filter(!duplicated(VAR_ID)) %>% # so we remove duplicates with the higher pvalue
70 |   rename(PVALUE = P_VALUE)
71 |
--------------------------------------------------------------------------------
/scripts/archive/choose_variants_2021.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(data.table)
3 |
4 |
5 | # CURRENT ASSUMPTIONS ABOUT FORMATTING:
6 | # - Genome build is hg19/GrCh37
7 | # - Summary statistic datasets are whitespace-delimited with columns: VAR_ID, BETA, SE, N_PH
8 | # - Variant IDs are all of the format: CHR_POS_REF_ALT
9 |
10 |
11 | ld_pruning <- function(gwas_variants, rsID_map_file, r2=0.1,
12 |                        pop="CEU",
13 |                        token=Sys.getenv("LDLINK_TOKEN", unset="20ff5a8454d7")) {
14 |
15 |   # Given a data frame of original GWAS variants (VAR_ID) and p-values (PVALUE),
16 |   # prune to a set of independent variants based on some LD threshold
17 |   # Leverage the LDlinkR package to fetch LD relationships for a set of input SNPs
18 |   # Arguments:
19 |   #   gwas_variants: data frame with VAR_ID and PVALUE columns
20 |   #   rsID_map_file: whitespace-delimited file with VAR_ID and rsID columns
21 |   #   r2: variants with r^2 >= r2 to an already-retained variant are dropped
22 |   #   pop: reference population passed to LDlinkR::LDmatrix
23 |   #   token: LDlink API token. Each user should supply their own (retrieve at:
24 |   #     https://ldlink.nci.nih.gov/?tab=apiaccess), directly or through the
25 |   #     LDLINK_TOKEN environment variable; the historical shared token is only
26 |   #     a fallback so that existing calls keep working
27 |   # Returns the rsID-annotated rows of the variants retained after pruning
28 |
29 |   # The temp file lives in the session temp dir and is removed even on error.
30 |   # Paths are quoted (after expanding "~") so spaces do not break the command.
31 |   varid_file <- tempfile(fileext=".tmp")
32 |   on.exit(unlink(varid_file), add=TRUE)
33 |   write(gwas_variants$VAR_ID, varid_file)
34 |   all_var_df <- fread(cmd=paste("grep -wFf", shQuote(varid_file),
35 |                                 shQuote(path.expand(rsID_map_file))),
36 |                       header=FALSE, col.names=c("VAR_ID", "rsID"),
37 |                       data.table=FALSE, stringsAsFactors=FALSE) %>%
38 |     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
39 |              sep="_", remove=FALSE) %>%
40 |     inner_join(gwas_variants, by="VAR_ID") %>%
41 |     arrange(PVALUE)  # This ordering is important for the pruning steps below!
42 |
43 |   pruned_vars <- c()
44 |
45 |   for (i in 1:22) {
46 |     print(paste0("Pruning chromosome ", i, "..."))
47 |     var_df <- filter(all_var_df, CHR == i)
48 |
49 |     if (nrow(var_df) == 0) {
50 |       next
51 |     } else if (nrow(var_df) == 1) {
52 |       pruned_vars <- c(pruned_vars, var_df$rsID)
53 |       next
54 |     }
55 |
56 |     ld_mat <- LDlinkR::LDmatrix(snps=var_df$rsID,
57 |                                 pop=pop,
58 |                                 r2d="r2",
59 |                                 token=token)
60 |     rownames(ld_mat) <- ld_mat$RS_number
61 |     ld_mat$RS_number <- NULL
62 |     ld_mat <- as.matrix(ld_mat)
63 |     # Drop variants without any LD information; drop=FALSE keeps a matrix
64 |     # (with dimnames) even when only one variant is left
65 |     ld_mat <- ld_mat[rowSums(is.na(ld_mat)) != ncol(ld_mat),
66 |                      colSums(is.na(ld_mat)) != nrow(ld_mat),
67 |                      drop=FALSE]
68 |
69 |     remaining_snps <- var_df$rsID
70 |
71 |     while(length(remaining_snps) > 0) {
72 |       index_snp <- remaining_snps[1]
73 |       pruned_vars <- c(pruned_vars, index_snp)
74 |       in_ld <- c()
75 |       if (index_snp %in% rownames(ld_mat)) {
76 |         # which() skips pairs with a missing r^2 instead of yielding NA names
77 |         in_ld <- rownames(ld_mat)[which(ld_mat[, index_snp] >= r2)]
78 |       }
79 |       # Always drop the index variant itself so the loop is guaranteed to
80 |       # terminate, even if its own r^2 entry is missing
81 |       remaining_snps <- setdiff(remaining_snps, c(index_snp, in_ld))
82 |     }
83 |   }
84 |
85 |   print(paste0(length(pruned_vars), " variants remain after pruning."))
86 |   filter(all_var_df, rsID %in% pruned_vars)
87 | }
88 |
89 |
90 | count_traits_per_variant <- function(gwas_variants, ss_files) {
91 |
92 |   # Given a vector of variants and a named vector of summary statistics files
93 |   # for traits to be clustered, output a vector of non-missing trait fractions
94 |   # per variant (named by variant, in the order of gwas_variants)
95 |
96 |   print("Assessing variant missingness across traits...")
97 |
98 |   if (length(ss_files) == 0) {
99 |     # No traits at all: the non-missing fraction is 0 for every variant
100 |     return(setNames(rep(0, length(gwas_variants)), gwas_variants))
101 |   }
102 |
103 |   variant_df_list <- lapply(seq_along(ss_files), function(i) {
104 |     print(paste0("...Reading ", names(ss_files)[i], "..."))
105 |     fread(ss_files[i], data.table=FALSE, stringsAsFactors=FALSE) %>%
106 |       filter(VAR_ID %in% gwas_variants) %>%
107 |       distinct(VAR_ID)  # Count each trait once, even if a variant is repeated
108 |   })
109 |   variant_counts_df <- bind_rows(variant_df_list) %>%
110 |     group_by(VAR_ID) %>%
111 |     summarise(frac=n() / length(ss_files))
112 |   variant_counts <- variant_counts_df$frac[match(gwas_variants,
113 |                                                  variant_counts_df$VAR_ID)]
114 |   variant_counts[is.na(variant_counts)] <- 0  # Not found in any trait file
115 |   setNames(variant_counts, gwas_variants)
116 | }
117 |
118 |
119 | find_variants_needing_proxies <- function(gwas_variant_df, var_nonmissingness,
120 |                                           rsID_map_file) {
121 |
122 |   # Given a data frame containing GWAS variants and alleles as well as a vector
123 |   # of trait missingness fractions per variant (from count_traits_per_variant),
124 |   # output a data frame (VAR_ID, rsID) of the variants that need proxies
125 |   # Criteria (any of the following):
126 |   #   Strand-ambiguous (AT or GC)
127 |   #   Multi-allelic
128 |   #   Low-count (available in < 80% of traits)
129 |   # rsID_map_file should point to a whitespace-delimited file with columns
130 |   # corresponding to VAR_ID and rsID
131 |   # rsID is NA for variants that are absent from the map file
132 |
133 |   print("Choosing variants in need of proxies...")
134 |
135 |   gwas_variant_df <- gwas_variant_df %>%
136 |     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
137 |              sep="_", remove=FALSE)
138 |
139 |   need_proxies_varid <- with(gwas_variant_df, {
140 |     strand_ambig <- VAR_ID[paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")]
141 |     print(paste0("...", length(strand_ambig), " strand-ambiguous variants"))
142 |
143 |     # i.e. ALT allele has one or more commas (any number of alternate alleles)
144 |     multi_allelic <- grep("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+(,[ACGT]+)+$",
145 |                           VAR_ID, value=TRUE)
146 |     print(paste0("...", length(multi_allelic), " multi-allelic variants"))
147 |
148 |     # Variants without a (non-NA) fraction count as fully missing
149 |     frac_nonmissing <- unname(var_nonmissingness[VAR_ID])
150 |     low_cnt <- VAR_ID[is.na(frac_nonmissing) | frac_nonmissing < 0.8]
151 |     print(paste0("...", length(low_cnt), " variants with excessive missingness"))
152 |
153 |     unique(c(strand_ambig, multi_allelic, low_cnt))
154 |   })
155 |   print(paste0("...", length(need_proxies_varid), " unique variants in total"))
156 |
157 |   # Typed empty columns: tibble() drops NULL (i.e. c()) columns, which would
158 |   # leave callers without the VAR_ID and rsID columns they expect
159 |   if (length(need_proxies_varid) == 0) {
160 |     return(tibble(VAR_ID=character(), rsID=character()))
161 |   }
162 |
163 |   varid_file <- tempfile(fileext=".tmp")
164 |   on.exit(unlink(varid_file), add=TRUE)
165 |   write(need_proxies_varid, varid_file)
166 |   varid_rsid_map <- fread(cmd=paste("grep -wFf", shQuote(varid_file),
167 |                                     shQuote(path.expand(rsID_map_file))),
168 |                           header=FALSE, col.names=c("VAR_ID", "rsID"),
169 |                           data.table=FALSE, stringsAsFactors=FALSE)
170 |   print(paste0("...", length(unique(varid_rsid_map$rsID)),
171 |                " of these are mapped to rsIDs"))
172 |
173 |   left_join(tibble(VAR_ID=need_proxies_varid), varid_rsid_map, by="VAR_ID")
174 | }
175 |
176 |
177 | .find_best_proxies <- function(query_rsids, tabix_path, ld_file,
178 |                                rsID_map_file, trait_ss_files) {
179 |
180 |   # Helper for choose_proxies: query the LD reference for each rsID and return
181 |   # one row per rsID holding its best eligible proxy (columns include rsID
182 |   # and proxy_VAR_ID); rsIDs without an eligible proxy are absent
183 |
184 |   no_proxies <- tibble(rsID=character(), proxy_VAR_ID=character())
185 |
186 |   ld_ref_file <- tempfile(fileext=".tmp")
187 |   proxy_rsid_file <- tempfile(fileext=".tmp")
188 |   on.exit(unlink(c(ld_ref_file, proxy_rsid_file)), add=TRUE)
189 |
190 |   # First, run "/path/to/tabix /path/to/LDfile rsID_1 rsID_2 ..."
191 |   # (tabix_path is the command itself and is passed through unquoted)
192 |   system(paste(tabix_path, shQuote(path.expand(ld_file)),
193 |                paste(shQuote(query_rsids), collapse=" "),
194 |                ">", shQuote(ld_ref_file)))
195 |   # Nothing returned by tabix: no proxy candidates for any queried variant
196 |   if (!isTRUE(file.size(ld_ref_file) > 0)) return(no_proxies)
197 |
198 |   proxy_df <- read_tsv(ld_ref_file, col_names=c("rsID", "proxy_data")) %>%
199 |     separate_rows(proxy_data, sep=";") %>%
200 |     separate(proxy_data, into=c("proxy_rsID", "r2", "D"), sep=",") %>%
201 |     mutate(r2=as.numeric(r2))  # separate() yields text; compare r^2 as a number
202 |
203 |   write(proxy_df$proxy_rsID, proxy_rsid_file)
204 |   potential_proxies_map <- fread(cmd=paste("grep -wFf",
205 |                                            shQuote(proxy_rsid_file),
206 |                                            shQuote(path.expand(rsID_map_file))),
207 |                                  header=FALSE,
208 |                                  col.names=c("proxy_VAR_ID", "proxy_rsID"),
209 |                                  data.table=FALSE, stringsAsFactors=FALSE)
210 |
211 |   proxy_missingness <- count_traits_per_variant(
212 |     unique(potential_proxies_map$proxy_VAR_ID),
213 |     trait_ss_files
214 |   )
215 |   proxy_missingness_df <- tibble(
216 |     proxy_VAR_ID=as.character(names(proxy_missingness)),
217 |     frac_nonmissing=unname(proxy_missingness)
218 |   )
219 |
220 |   proxy_df %>%
221 |     inner_join(potential_proxies_map, by="proxy_rsID") %>%
222 |     separate(proxy_VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
223 |              sep="_", remove=FALSE) %>%
224 |     inner_join(proxy_missingness_df, by="proxy_VAR_ID") %>%
225 |     filter(
226 |       !(paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")),  # Not strand-ambiguous
227 |       !grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+(,[ACGT]+)+$", proxy_VAR_ID),  # Not multi-allelic
228 |       frac_nonmissing >= 0.8,  # Sufficient fraction of traits non-missing
229 |       r2 >= 0.8  # Sufficient LD with the proxied variant
230 |     ) %>%
231 |     group_by(rsID) %>%
232 |     arrange(desc(frac_nonmissing),
233 |             desc(r2),
234 |             CHR) %>%  # Arbitrary sort for reproducibility in case of missingness + r2 ties
235 |     dplyr::slice(1) %>%
236 |     ungroup()
237 | }
238 |
239 |
240 | choose_proxies <- function(need_proxies,
241 |                            tabix_path, ld_file,
242 |                            rsID_map_file, trait_ss_files,
243 |                            pruned_variants) {
244 |
245 |   # Given a data frame of variants (VAR_ID, rsID) needing proxies
246 |   # (from find_variants_needing_proxies) and an LD reference file,
247 |   # choose one proxy per variant and output the final vector of variant IDs:
248 |   # pruned variants that need no proxy plus the chosen proxies
249 |   # Criteria for eligibility:
250 |   #   Not strand-ambiguous
251 |   #   Not multi-allelic
252 |   #   Trait fraction >= 80%
253 |   #   r^2 >= 0.8 with the index variant
254 |   # Choose based on first trait count, then r^2
255 |   # Side effect: writes no_proxies_found.txt when some variants get no proxy
256 |
257 |   no_proxy_needed <- setdiff(pruned_variants$VAR_ID, need_proxies$VAR_ID)
258 |
259 |   # Variants without an rsID cannot be looked up in the LD reference
260 |   query_rsids <- unique(need_proxies$rsID[!is.na(need_proxies$rsID)])
261 |
262 |   if (length(query_rsids) > 0) {
263 |     final_proxy_df <- .find_best_proxies(query_rsids, tabix_path, ld_file,
264 |                                          rsID_map_file, trait_ss_files)
265 |   } else {
266 |     # Skip the lookup entirely: a tabix call without regions is not a query
267 |     final_proxy_df <- tibble(rsID=character(), proxy_VAR_ID=character())
268 |   }
269 |
270 |   proxies_found <- final_proxy_df$rsID
271 |   no_proxies_found <- setdiff(need_proxies$rsID, proxies_found)
272 |   print(paste0("No proxies needed for ",
273 |                length(no_proxy_needed),
274 |                " variants."))
275 |   print(paste0("Proxies found for ", length(proxies_found), " variants."))
276 |   print(paste0("No adequate proxies found for ", length(no_proxies_found),
277 |                " variants."))
278 |   if (length(no_proxies_found) > 0) {
279 |     write(no_proxies_found, "no_proxies_found.txt")
280 |     print("See no_proxies_found.txt for a list of these variants.")
281 |   }
282 |
283 |   final_variant_set <- c(
284 |     no_proxy_needed,  # Original pruned variants that don't need proxies
285 |     final_proxy_df$proxy_VAR_ID  # Proxy variants fulfilling the necessary criteria
286 |   )
287 |   unique(final_variant_set)
288 |   # NOTE: the unique() above simply discards duplicate proxies
289 |   # (same proxy for multiple variants and/or proxy variant that is already a primary GWAS variant).
290 |   # There may be a better way to deal with this.
291 |
292 | }
293 |
294 |
295 |
--------------------------------------------------------------------------------
/scripts/archive/gwas_variant_selection.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 |
3 |
4 | read_portal_gwas <- function(path, n_col="N_PH") {
5 |
6 |   # Read one portal-format summary statistics file (columns VAR_ID, P_VALUE
7 |   # and a sample size column named by n_col), keep variants with p < 5e-8,
8 |   # and split the variant ID into chr, pos, EA and NEA
9 |
10 |   read_tsv(path) %>%
11 |     select(varID=VAR_ID, p=P_VALUE, N=all_of(n_col)) %>%
12 |     filter(p < 5e-8) %>%
13 |     separate(varID, into=c("chr", "pos", "EA", "NEA"), sep="_", remove=FALSE)
14 | }
15 |
16 | diamante_gwas <- read_portal_gwas(
17 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAMANTE_eu_UKBB_dv2/T2D/DATA/GWAS_DIAMANTE_eu_UKBB_dv2.T2D.1.txt"
18 | )
19 |
20 | exchip_gwas <- read_portal_gwas(
21 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/ExChip_ExTexT2D_dv1/T2D/DATA/ExChip_ExTexT2D_dv1.T2D.1.txt",
22 |   n_col="Neff"
23 | )
24 |
25 | diagram_gwas <- read_portal_gwas(
26 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAGRAM_eu_dv2/T2D/DATA/GWAS_DIAGRAM_eu_dv2.T2D.1.txt"
27 | )
28 |
29 | wgs_got2d_gwas <- read_portal_gwas(
30 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/WGS_GoT2Dimputed_dv1/T2D/DATA/WGS_GoT2Dimputed_dv1.T2D.1.txt"
31 | )
32 |
33 | mahajan_gwas <- read_tsv(
34 |   "/humgen/diabetes2/users/clairekim/Mahajan.NatGenet2018b.T2D.European.txt",
35 |   col_types=cols(SNP="c")
36 | ) %>%
37 |   select(varID=SNP, chr=Chr, pos=Pos, p=Pvalue, N=Neff, EA, NEA) %>%
38 |   mutate(varID=paste(chr, pos, EA, NEA, sep="_")) %>%
39 |   filter(p < 5e-8)
40 |
41 | t2d_eur_gwas <- read_tsv(
42 |   "/humgen/diabetes2/users/clairekim/T2D_European.BMIunadjusted.txt",
43 |   col_types=cols(SNP="c")
44 | ) %>%
45 |   select(varID=SNP, chr=CHR, pos=BP, p=Pvalue, N=Neff,
46 |          EA=EFFECT_ALLELE, NEA=OTHER_ALLELE) %>%
47 |   mutate(varID=paste(chr, pos, EA, NEA, sep="_")) %>%
48 |   filter(p < 5e-8)
49 |
50 |
51 | choose_gwas_variants <- function(ss_df_list, min_median_N=10000) {
52 |
53 |   # Given a list of summary statistic data frames, choose variants to take
54 |   # forward for clustering
55 |   # Summary statistic data frames should contain the following fields:
56 |   #   chr, pos, N, p, EA, NEA
57 |   # min_median_N: studies whose median N is below this value are excluded
58 |   # Returns one row per chr/pos (from the study with the largest N there),
59 |   # with the originating list name in the gwas column
60 |
61 |   # Filter out studies with N < 10k (by default); a study is also dropped
62 |   # when no N is available at all, but a few missing N values are tolerated
63 |   low_N <- vapply(ss_df_list, function(ss_df) {
64 |     median_N <- median(ss_df$N, na.rm=TRUE)
65 |     is.na(median_N) || median_N < min_median_N
66 |   }, logical(1))
67 |   ss_df_list <- ss_df_list[!low_N]
68 |
69 |   # Standardize column types for binding
70 |   ss_df_list <- lapply(ss_df_list, function(df) {
71 |     mutate(df,
72 |            chr=as.character(chr),
73 |            pos=as.integer(pos))
74 |   })
75 |
76 |   # Bind summary stats from each GWAS and select variants
77 |   bind_rows(ss_df_list, .id="gwas") %>%
78 |     group_by(chr, pos) %>%
79 |     dplyr::slice(which.max(N)) %>%
80 |     ungroup()
81 | }
82 |
83 | gw_variants_list <- list(
84 |   diamante=diamante_gwas,
85 |   exchip=exchip_gwas,
86 |   diagram=diagram_gwas,
87 |   wgs_got2d=wgs_got2d_gwas,
88 |   mahajan=mahajan_gwas,
89 |   t2d_eur=t2d_eur_gwas
90 | )
91 |
92 | initial_gwas_variants_df <- choose_gwas_variants(gw_variants_list)
93 |
94 | # input_file_list <- list(
95 | #   c(varID="VAR_ID", p="P_VALUE", N="N_PH"),
96 | #   c(varID="VAR_ID", p="P_VALUE", N="Neff"),
97 | #   c(varID="VAR_ID", p="P_VALUE", N="N_PH"),
98 | #   c(varID="VAR_ID", p="P_VALUE", N="N_PH"),
99 | #   c(varID="VAR_ID", p="Pvalue", n="Neff"),
100 | #   c(varID="VAR_ID", p=)
101 | # )
102 | #
103 | # names(input_file_list) <- c(
104 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAMANTE_eu_UKBB_dv2/T2D/DATA/GWAS_DIAMANTE_eu_UKBB_dv2.T2D.1.txt",
105 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/ExChip_ExTexT2D_dv1/T2D/DATA/ExChip_ExTexT2D_dv1.T2D.1.txt",
106 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAGRAM_eu_dv2/T2D/DATA/GWAS_DIAGRAM_eu_dv2.T2D.1.txt",
107 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/WGS_GoT2Dimputed_dv1/T2D/DATA/WGS_GoT2Dimputed_dv1.T2D.1.txt",
108 | #   "/humgen/diabetes2/users/clairekim/Mahajan.NatGenet2018b.T2D.European.txt",
109 | #   "/humgen/diabetes2/users/clairekim/T2D_European.BMIunadjusted.txt"
110 | # )
111 |
112 |
113 |
--------------------------------------------------------------------------------
/scripts/archive/main.BayesNMF.script_to_Jaeyoon_edit_claire_T2D.R:
--------------------------------------------------------------------------------
1 | ############################################################################################
2 | ############################################################################################
3 | #### Copyright (c) 2017, Broad Institute
4 | #### Redistribution and use in source and binary forms, with or without
5 | #### modification, are permitted provided that the following conditions are
6 | #### met:
7 | #### Redistributions of source code must retain the above copyright
8 | #### notice, this list of
conditions and the following disclaimer. 9 | #### Redistributions in binary form must reproduce the above copyright 10 | #### notice, this list of conditions and the following disclaimer in 11 | #### the documentation and/or other materials provided with the 12 | #### distribution. 13 | #### Neither the name of the Broad Institute nor the names of its 14 | #### contributors may be used to endorse or promote products derived 15 | #### from this software without specific prior written permission. 16 | #### THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | #### "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | #### LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | #### A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | #### HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | #### SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | #### LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | #### DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | #### THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | #### (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | #### OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | ############################################################################################ 28 | ############################################################################################ 29 | 30 | ###################################################################################################### 31 | ####### Bayesian NMF algorithms for clustering 32 | ###################################################################################################### 33 | ####### For implementation details see the paper 34 | ####### Udler MS, Kim J, von Grotthuss M, 35 | ####### Bonàs-Guarch S, Cole JB, Chiou J, et al.
####### (2018)
####### Type 2 diabetes genetic loci informed by multi-trait
####### associations point to disease mechanisms and
####### subtypes: A soft clustering analysis. PLoS Med 15
####### (9): e1002654.
#################################
####### For details on the original algorithms
####### see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence.
####### IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013).
######################################################################################################

###########################
###########################
##### Bayesian NMF with half-normal priors for W and H
#####
##### Arguments:
#####   V0     - numeric input matrix; all-zero columns are dropped and the
#####            minimum is subtracted so the factorized matrix V is non-negative
#####   n.iter - upper bound on the iteration counter (the counter starts at 2)
#####   a0     - hyper-parameter used in C and b0 below
#####   tol    - stop when the largest relative change in lambda falls below tol
#####   K      - number of columns of W / rows of H at initialization
#####   K0     - divisor used in the b0 hyper-parameter
#####   phi    - multiplier applied to var(V) to obtain the penalty weight
##### Returns an unnamed list, accessed by position by the calling script:
#####   [[1]] W (nrow(V) x K), [[2]] H (K x ncol(V)),
#####   [[3]] n.like, [[4]] n.evid, [[5]] n.lambda, [[6]] n.error
#####   (per-iteration traces; n.lambda[[1]] is the initial lambda, the other
#####   traces start at index 2).
##### Initialization uses runif(), so call set.seed() first for reproducibility.
BayesNMF.L2EU <- function(V0,n.iter,a0,tol,K,K0,phi) {
  eps <- 1.e-50
  del <- 1.0
  active_nodes <- colSums(V0) != 0
  # drop = FALSE keeps V0 a matrix even when a single column is active;
  # otherwise dim(V) below is NULL and the factorization cannot be set up
  V0 <- V0[,active_nodes,drop=FALSE]
  V <- V0-min(V0)
  Vmax <- max(V)
  N <- dim(V)[1]
  M <- dim(V)[2]

  # Random non-negative start, scaled to the range of V
  W <- matrix(runif(N * K)*Vmax,ncol=K)
  H <- matrix(runif(M * K)*Vmax,ncol=M)
  V.ap <- W%*%H+eps

  phi <- sd(V)^2*phi
  C <- (N+M)/2+a0+1
  # 3.14 is kept exactly as written (presumably an approximation of pi) so
  # that numerical results stay identical to earlier runs
  b0 <- 3.14*(a0-1)*mean(V)/(2*K0)
  lambda.bound <- b0/C
  lambda <- (0.5*colSums(W^2)+0.5*rowSums(H^2)+b0)/C
  lambda.cut <- lambda.bound*1.5  # only used for the progress report below

  n.like <- list()
  n.evid <- list()
  n.error <- list()
  n.lambda <- list()
  n.lambda[[1]] <- lambda
  iter <- 2
  # Both operands are scalars, so use the short-circuit && operator
  while (del >= tol && iter < n.iter) {
    # Multiplicative updates for H, then W, each penalized by 1/lambda
    H <- H*(t(W)%*%V)/(t(W)%*%V.ap+phi*H*matrix(rep(1/lambda,M),ncol=M)+eps)
    V.ap <- W %*% H + eps
    W <- W*(V%*%t(H))/(V.ap%*%t(H)+phi*W*t(matrix(rep(1/lambda,N),ncol=N))+eps)
    V.ap <- W %*% H + eps
    lambda <- (0.5*colSums(W^2)+0.5*rowSums(H^2)+b0)/C
    del <- max(abs(lambda-n.lambda[[iter-1]])/n.lambda[[iter-1]])
    like <- sum((V-V.ap)^2)/2
    n.like[[iter]] <- like
    n.evid[[iter]] <- like + phi*sum((0.5*colSums(W^2)+0.5*rowSums(H^2)+b0)/lambda+C*log(lambda))
    n.lambda[[iter]] <- lambda
    n.error[[iter]] <- sum((V-V.ap)^2)
    if (iter %% 100 == 0) {
      cat(iter,n.evid[[iter]],n.like[[iter]],n.error[[iter]],del,sum(colSums(W)!=0),sum(lambda>=lambda.cut),'\n')
    }
    iter <- iter+1
  }
  list(W,H,n.like,n.evid,n.lambda,n.error)
}

##### Tile plot of a feature x signature matrix (rows on the y axis, columns
##### on the x axis, fill = value). Entries below 1e-10 are set to 0 first.
##### Relies on ggplot2 and melt() being attached (see library() calls below).
##### Draws on the current graphics device.
plot.heatmap.ggplot.new <- function(mat) {
  scale0 <- 0.8
  scale <- 1
  color.axis <- "black"
  .theme_ss <- theme_bw(base_size=12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8*scale, family="mono",face='bold',color=color.axis),
          axis.text.y = element_text(hjust = 0.5,size=8*scale, family="mono",face='bold',color=color.axis),
          axis.text = element_text(size = 12*scale, family = "mono",color=color.axis),
          axis.title=element_text(face="bold",size=12*scale,color="black"),
          plot.title=element_text(face="bold",size=12*scale))
  mat[mat < 1.e-10] <- 0
  # Row ordering from hierarchical clustering; currently only needed by the
  # commented-out factor() line below
  hc <- hclust(dist(mat,method="euclidean"),method="ward.D")
  feature.ordering <- hc$labels[hc$order]
  df <- melt(mat)
  colnames(df) <- c("feature","signature","activity")
  #df$feature <- factor(df$feature,levels=feature.ordering)
  #df$signature <- factor(df$signature,levels=c("W4","W3","W2","W1"))
  p <- ggplot(df,aes(y=feature,x=signature,fill=activity))+geom_tile() #geom_tile(colour="yellow")
  p <- p + scale_fill_gradient2(low="white",high ="black",name=paste("Activity",sep=""))
  #p = p + scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0))
  p <- p + .theme_ss
  p <- p + ggtitle("Feature Assoication to Clusters")
  #p = p + ylab("Contributions") + xlab("Feature")
  p <- p + theme(axis.title.y = element_text(face="bold",colour="black",size=12*scale0))
  p <- p + theme(axis.title.x = element_text(face="bold",colour="black",size=12*scale0))
  p <- p + theme(legend.position="right")
  p <- p + theme(legend.key.size = unit(0.5, "cm"))
  #pdf(file=paste(OUTPUT,"feature.association_to_clusters.pdf",sep=""),width=10,height=4)
  plot(p)
  #dev.off()
}

##### Clustered heatmap via gplots::heatmap.2 using Euclidean distance and
##### Ward ("ward.D") linkage. rowTF / colTF are passed straight through as
##### heatmap.2's Rowv / Colv arguments.
plot.heatmap.2 <- function(x,rowTF,colTF) {
  s1 <- 0.75
  mydist <- function(c) {dist(c,method="euclidean")}
  myclust <- function(c) {hclust(c,method="ward.D")}
  heatmap.2(as.matrix(x), hclustfun=myclust, distfun=mydist, na.rm = TRUE, scale="none", dendrogram="both",margins=c(8,8),
            Rowv=rowTF, Colv=colTF, symbreaks=FALSE, key=TRUE, symkey=FALSE,
            density.info="none", trace="none",labCol=colnames(x),labRow=rownames(x),col=greenred(40),cex.lab=s1,cexRow=0.7,cexCol=0.7,keysize=s1)
}

library(gplots)
library(RColorBrewer)
library(ggplot2)
library(reshape)
library(reshape2)

#CURRENT <- paste(getwd(),"/",sep="")
#OUTPUT <- paste(CURRENT,"OUTPUT/DIST_test/",sep="")
#system(paste("mkdir",OUTPUT,sep=" "))

##### mat.all: the trait by genotype matrix; the positive and negative association of each trait to genes are separately handled with distinct features.
##### mat.all in this script is not exactly same as the one used in the paper and we included it as an example input.
#load("t2d_data.example.RData")
#mat_all <- mat.all
# load data
library(tidyverse)
# NOTE(review): machine-specific working directory; every output path below is
# derived from it, so this line must be edited before running elsewhere.
setwd("C:/Users/hk745/Dropbox (Partners HealthCare)")

# CURRENT is the working directory with a trailing slash; OUTPUT is the folder
# that receives all PDFs, .RData files and tables written by this script
# (it is not created here, so it presumably has to exist already).
CURRENT <- paste(getwd(),"/",sep="")
OUTPUT <- paste(CURRENT,"OUTPUT/DIST_test/T2D/trait_test/",sep="")

#inputtraits <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/traitlist_pval_0.05_snpcnt_filterGIANT.txt",stringsAsFactors = FALSE, header = F)
#inputtraits$V1 <- gsub("-", ".", inputtraits$V1)


#t2d_snps <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2D_98snps_23traits.txt",stringsAsFactors = FALSE)
#filter <- t2d_snps
#t2d_snps <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2D_98snps_noloop_minusCHOL.txt",stringsAsFactors = FALSE)
#t2d_snps <- t2d_snps[,names(t2d_snps) %in% names(filter)]
#gbd_snps <- t2d_snps


#filtered <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2D_filter.txt",stringsAsFactors = FALSE, header = F)
#filtered <- filtered[filtered$V2<0.05,]

# Input table: one row per locus; the first four columns are dropped below and
# the remaining columns are used as the values to cluster.
gbd_snps <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2Donly_eu_45traits_412snps.txt",stringsAsFactors = FALSE)

#gbd_snps <- gbd_snps[gbd_snps$VAR_ID_hg19 %in% filtered$V1,]

#gbd_snps$GLGC_dv2.CHOL.ZN <- NULL
# Transpose so that rows are the value columns (presumably traits) and columns
# are loci, labelled by the "locus" column. "." marks a missing value.
gbd_t <- gbd_snps
rownames(gbd_t) <- gbd_snps$locus
gbd_t <- gbd_t[,-c(1:4)]
gbd_t <- t(gbd_t)
gbd_t[gbd_t == "."] <- NA



#gbd_t <- gbd_t[rownames(gbd_t) %in% inputtraits$V1,]
#write.table(rownames(gbd_t),file=paste(CURRENT,paste("traitlist_0.05_snpcnt.txt",sep="."),sep=""), append = F, quote = F, sep = "\t",
#            eol = "\n", na = "NA", dec = ".", row.names = F,
#            col.names = F, qmethod = c("escape", "double"))

# Drop every column (locus) whose number of missing values exceeds 20% of the
# number of rows.
# NOTE(review): as.data.frame() may rewrite column names that are not
# syntactically valid, in which case the name match against colnames(test)
# would miss those columns - confirm locus names survive unchanged.
gbd_t <- as.data.frame(gbd_t,stringsAsFactors = FALSE)
na <- as.data.frame(as.data.frame(map(gbd_t, ~sum(is.na(.)))))
nadrop <- na[,na>dim(gbd_t)[1]*0.2, drop = FALSE] ####
test <- gbd_t
test <- test[,!(colnames(test) %in% colnames(nadrop))]
gbd_t <- test

#gbd <- as.data.frame(t(gbd_t))
#nat <- as.data.frame(as.data.frame(map(gbd, ~sum(is.na(.)))))
#nadropt <- nat[,nat>dim(gbd)[1]*0.2]

# Remaining missing values become 0, then the matrix is split into a
# non-negative "pos" half (values >= 0, negatives set to 0) and a non-negative
# "neg" half (absolute value of negatives, others set to 0).
# apply(..., 1, ...) returns its result transposed, hence the t() calls.
gbd_t[is.na(gbd_t)] <- 0
mat.neg <- gbd_t
mat.pos <- as.data.frame(apply(mat.neg, 1, function(x) ifelse(as.numeric(x)>=0,as.numeric(x),0)))
#mat.pos <- as.data.frame(apply(mat.neg, 1, function(x) ifelse(as.numeric(x)<0,as.numeric(x),0)))
mat.pos <- t(mat.pos)
colnames(mat.pos) <- colnames(mat.neg)
mat.neg <- as.data.frame(apply(mat.neg, 1, function(x) ifelse(as.numeric(x)>=0,0,as.numeric(x))))
#mat.neg <- as.data.frame(apply(mat.neg, 1, function(x) ifelse(as.numeric(x)<0,0,as.numeric(x))))
mat.neg <- t(mat.neg)
colnames(mat.neg) <- colnames(mat.pos)
mat.neg <- mat.neg*(-1)
#mat.pos <- mat.pos*(-1)
# Stack both halves: rows are "<name>_pos" / "<name>_neg", columns are loci
rownames(mat.pos) <- paste(rownames(mat.pos), "pos", sep = "_")
rownames(mat.neg) <- paste(rownames(mat.neg), "neg", sep = "_")
mat.all <- rbind(mat.pos,mat.neg)



##### simple hierarchical clustering ############## not working
# Orders rows and columns of mat.all by Ward clustering and writes a heatmap
# to a PDF in OUTPUT.
# NOTE(review): adding a ggplot2 scale to the value returned by pheatmap() is
# presumably what the "not working" remark above refers to - confirm.
library(pheatmap)
hc1 <- hclust(dist(mat.all,method="euclidean"),method="ward.D")
hc1.ordering <- hc1$labels[hc1$order]
hc2 <- hclust(dist(t(mat.all),method="euclidean"),method="ward.D")
hc2.ordering <- hc2$labels[hc2$order]
order1 <- match(hc1.ordering,rownames(mat.all),nomatch=0)
order2 <- match(hc2.ordering,colnames(mat.all),nomatch=0)
pdf(file=paste(OUTPUT,"hierarchicalc.mat.pdf",sep=""),width=40,height=10)
fontsize_row = 8
fontsize_col = 0.5
pheatmap(mat.all[order1,order2], fontsize_col = fontsize_col, fontsize_row = fontsize_row)+ scale_fill_gradient2(low="white",high ="black",name=paste("Activity",sep=""))
#plot.heatmap.2(mat.all[order1,order2],F,F)
dev.off()



##### Running Bayesian NMF with half-normal priors for W and H
# n.iter and the run toggle live outside the conditional block so that the
# summary below still works when the (slow) runs are skipped and the saved
# res.L2EU.Bayes.<i>.RData files from an earlier session are reused.
#n.iter <- 100 ### number of runs.. you can increase this number
n.iter <- 30 ### number of runs.. you can increase this number
run.bNMF <- TRUE ### set to FALSE to reuse previously saved runs

if (run.bNMF) {
  for (i in seq_len(n.iter)) {
    res <- BayesNMF.L2EU(as.matrix(mat.all),200000,10,1.e-07,10,10,1.0)
    save(res,file=paste0(OUTPUT,paste("res.L2EU.Bayes",i,"RData",sep=".")))
  }
}

# Summarize every run: K = number of lambda values above the smallest lambda
# of the final iteration, evid = last recorded value of the evidence trace.
tmpK <- rep(0,n.iter)
tmpE <- rep(0,n.iter)
tmpRUN <- rep(0,n.iter)
for (i in seq_len(n.iter)) {
  load(file=paste0(OUTPUT,paste("res.L2EU.Bayes",i,"RData",sep=".")))
  lambda <- res[[5]]
  lambda <- unlist(lambda[length(lambda)])
  lambda <- lambda-min(lambda)
  tmpK[i] <- sum(lambda > 0)
  evid <- res[[4]]
  # [[ ]] extracts the number itself, so tmpE stays a numeric vector instead
  # of being silently converted to a list
  tmpE[i] <- evid[[length(evid)]]
  tmpRUN[i] <- i
}
df.run <- data.frame(K=tmpK,evid=tmpE,run=tmpRUN)
df.run <- df.run[order(df.run$evid,decreasing=TRUE),]
write.table(df.run,file=paste0(OUTPUT,"summary.run.txt"), append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NaN", dec = ".", row.names = FALSE,
            col.names = TRUE, qmethod = c("escape", "double"))

#### df.run - the summary data-frame for bNMF runs; K = number of clusters, evid = -log(posterior), run = the index of bNMF run
#### How to choose K: (i) We usually prefer the most probable K. For example, here 57% K=5 and 43% K=4, so we will consider K=5.
####                  (ii) After selecting K then look at "evid" for all runs with the selected K (here K=5) and choose the run with the lowest "evid"
####                       corresponding to the maximum posterior solution
####                  (iii) Sometimes you may need a manual inspection for other solutions based on your prior knowledge or biological consideration.
####                       Specially, when your most probable solution corresponds to the lowest K, it is recommended to examine the solution with (K+1) and check which solution
####                       is more biologically plausible.

#### Below we will generate outputs of the maximum posterior solutions at different K
unique.K <- table(df.run$K)
n.K <- length(unique.K) ### number of distinct K
MAP.K <- rep(0,n.K) ### bNMF run index with the maximum posterior for given K
# For each distinct K, pick the run with the smallest "evid"
for (i in 1:n.K) {
  tmp <- df.run[df.run$K==as.numeric(names(unique.K)[i]),]
  MAP.K[i] <- tmp$run[which.min(tmp$evid)]
}

# For each distinct K: reload the selected run, write W and H as text tables
# and save two tile plots (features x clusters, clusters x genes) as PDFs.
for (m in 1:n.K) {

  index.m <- as.numeric(names(unique.K)[m])  # K value used in the output file names

  load(file=paste(OUTPUT,paste("res.L2EU.Bayes",MAP.K[m],"RData",sep="."),sep=""))
  W <- res[[1]]
  H <- res[[2]]
  # Keep only clusters with non-zero weight.
  # NOTE(review): if a single cluster remains, these subsets collapse to
  # vectors and ncol(W) below is NULL - confirm that case cannot occur.
  # NOTE(review): index.m comes from the lambda-based K in df.run, while the
  # matrices are trimmed by zero column/row sums; presumably the two agree -
  # verify.
  W <- W[,colSums(W)!=0]
  H <- H[rowSums(H)!=0,]
  colnames(W) <- paste("W",seq(1:ncol(W)),sep="")
  rownames(H) <- colnames(W)
  W[W < 1.e-10] <- 0 ### feature-cluster association matrix
  H[H < 1.e-10] <- 0 ### cluster-gene association matrix

  # Disabled normalization experiment.
  # NOTE(review): i runs over clusters (columns of W) but W.mid[i,] indexes
  # rows of W; W.mid[,i] looks intended - check before re-enabling.
  if (FALSE) {
    W.mid <- W
    H.mid <- H
    for (i in 1:ncol(W)) {
      H.mid[i,] <- H.mid[i,]*colSums(W)[i]
      W.mid[i,] <- W.mid[i,]*rowSums(H)[i]
    }
    W.norm <- apply(W.mid,2,function(x) x/sum(x))
    H.norm <- apply(H.mid,2,function(x) x/sum(x))
  }

  # Data-frame copies with the row labels as an explicit column for export
  W0 <- data.frame(W)
  W0[,"feature"] <- rownames(W)
  H0 <- data.frame(H)
  H0[,"cluster"] <- rownames(H)

  if (TRUE) {
    write.table(W0,file=paste(OUTPUT,paste("L2EU.W.mat",index.m,"txt",sep="."),sep=""), append = F, quote = F, sep = "\t",
                eol = "\n", na = "NaN", dec = ".", row.names = F,
                col.names = T, qmethod = c("escape", "double"))
    write.table(H0,file=paste(OUTPUT,paste("L2EU.H.mat",index.m,"txt",sep="."),sep=""), append = F, quote = F, sep = "\t",
                eol = "\n", na = "NaN", dec = ".", row.names = F,
                col.names = T, qmethod = c("escape", "double"))
  }

  mat.reconstructed <- W%*%H ### reconstructed matrix == approximation for the input matrix
  #pdf(file=paste(OUTPUT,paste("L2EU.hc.mat.WH.0",index.m,"pdf",sep="."),sep=""),width=8,height=8)
  #	plot.heatmap.2(mat.reconstructed[order1,order2],F,F)
  #dev.off()

  # Rank-one contribution of each cluster; only used by the commented-out
  # plotting calls, so this loop currently produces no output
  K <- ncol(W)
  for (i in 1:K) {
    mat1 <- W[,i]%*%t(as.matrix(H[i,]))
    rownames(mat1) <- rownames(mat.all)
    #pdf(file=paste(OUTPUT,paste("hc.mat.WH",i,index.m,"pdf",sep="."),sep=""),width=8,height=8)
    #	plot.heatmap.2(mat1[order1,order2],F,F)
    #dev.off()
  }

  # ---- Plot 1: feature-to-cluster weights (W), features ordered by Ward clustering
  scale0 <- 0.8
  scale <- 1
  g.ordering <- paste("G",seq(1:ncol(W)),sep="")
  color.axis <- "black"
  .theme_ss <- theme_bw(base_size=12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8*scale, family="mono",face='bold',color=color.axis),
          axis.text.y = element_text(hjust = 0.5,size=12*scale, family="mono",face='bold',color=color.axis),
          axis.text = element_text(size = 12*scale, family = "mono",color=color.axis),
          axis.title=element_text(face="bold",size=12*scale,color="black"),
          plot.title=element_text(face="bold",size=12*scale))
  mat <- W
  mat[mat < 1.e-10] <- 0
  hc <- hclust(dist(mat,method="euclidean"),method="ward.D")
  feature.ordering <- hc$labels[hc$order]
  df <- melt(mat)
  colnames(df) <- c("feature","signature","activity")
  df$feature <- factor(df$feature,levels=feature.ordering)
  df$signature <- factor(df$signature,levels=paste("W",seq(1:ncol(W)),sep=""))
  p = ggplot(df,aes(x=feature,y=signature,fill=activity))+geom_tile() #geom_tile(colour="yellow")
  p = p + scale_fill_gradient2(low="white",high ="black",name=paste("Activity",sep=""))
  #p = p + scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0))
  p = p + .theme_ss
  p = p + ggtitle("Feature Assoication to Clusters")
  p = p + ylab("Contributions") + xlab("Feature")
  p = p + theme(axis.title.x = element_text(face="bold",colour="black",size=12*scale0))
  p = p + theme(axis.title.y = element_text(face="bold",colour="black",size=12*scale0))
  p = p + theme(legend.position="right")
  p = p + theme(legend.key.size = unit(0.5, "cm"))
  pdf(file=paste(OUTPUT,paste("L2EU.feature.association_to_clusters",index.m,"pdf",sep="."),sep=""),width=10,height=4)
  plot(p)
  dev.off()

  # ---- Plot 2: cluster-to-gene weights (H), genes ordered by Ward clustering
  # size = 8*scale (original)
  scale0 <- 0.8
  scale <- 1
  g.ordering <- paste("G",seq(1:ncol(W)),sep="")
  color.axis <- "black"
  .theme_ss <- theme_bw(base_size=12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8*scale, family="mono",face='bold',color=color.axis),
          axis.text.y = element_text(hjust = 0.5,size=12*scale, family="mono",face='bold',color=color.axis),
          axis.text = element_text(size = 12*scale, family = "mono",color=color.axis),
          axis.title=element_text(face="bold",size=12*scale,color="black"),
          plot.title=element_text(face="bold",size=12*scale))
  mat <- H
  hc <- hclust(dist(t(mat),method="euclidean"),method="ward.D")
  gene.ordering <- hc$labels[hc$order]
  df <- melt(mat)
  colnames(df) <- c("signature","gene","activity")
  df$signature <- factor(df$signature,levels=paste("W",seq(1:ncol(W)),sep=""))
  df$gene <- factor(df$gene,levels=gene.ordering)
  p = ggplot(df,aes(x=gene,y=signature,fill=activity))+geom_tile() #geom_tile(colour="yellow")
  p = p + scale_fill_gradient2(low="white",high ="black",name=paste("Activity",sep=""))
  #p = p + scale_fill_gradientn(values=c(0,q1,q2,q3,q4,1),colours=c("yellow","green","black","red","magenta"),limit=c(0,1))
  p = p + .theme_ss
  p = p + ggtitle("Gene Assoication to Clusters")
  p = p + ylab("Contributions") + xlab("Genes")
  p = p + theme(axis.title.x = element_text(face="bold",colour="black",size=12*scale0))
  p = p + theme(axis.title.y = element_text(face="bold",colour="black",size=12*scale0))
  p = p + theme(legend.position="right")
  p = p + theme(legend.key.size = unit(0.5, "cm"))
  pdf(file=paste(OUTPUT,paste("L2EU.gene.association_to_clusters",index.m,"pdf",sep="."),sep=""),width=10,height=4)
  plot(p)
  dev.off()
}


--------------------------------------------------------------------------------
/scripts/archive/prep_bNMF_2021.R:
--------------------------------------------------------------------------------
library(tidyverse)
library(data.table)

fetch_summary_stats <- function(variant_vec, gwas_ss_file, trait_ss_files) {

  # Given a final (pruned & proxied) set of variants to be clustered,
  # fetch z-scores and sample size info from summary statistics for each of a
  # series of traits
  # INPUTS:
  # - variant_vec: vector of variants to be clustered
  # - gwas_ss_file: filepath of the original GWAS summary statistics
  #   (variant IDs and betas)
  # - trait_ss_files: named vector/list of trait summary statistic filepaths
  #   (the names are used as trait labels)
  # Final variant vector should be in VAR_ID format: [CHR]_[POS]_[REF]_[ALT] (using hg19)
  # GWAS summary statistic data frame must have at least the following
  # columns: SNP (CHR:POS), REF, ALT, BETA
  # Each trait summary statistic dataset must have the following
  # columns: VAR_ID, Effect_Allele_PH, BETA, SE, P_VALUE, N_PH

  # ISSUES:
  # - potential for strand-flip?
21 | 22 | read_single_trait <- function(trait, variant_df) { 23 | # Read/filter/process summary statistics for a single trait 24 | print(paste0("Processing ", trait, "...")) 25 | df <- fread(trait_ss_files[[trait]], data.table=F, stringsAsFactors=F) 26 | # if (grepl("\\/UKBB\\/", trait_ss_path)) { 27 | # df <- df %>% 28 | # mutate(VAR_ID=gsub(":", "_", variant)) %>% 29 | # filter(VAR_ID %in% final_variant_ss$VAR_ID) %>% 30 | # separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_", remove=F) %>% 31 | # select(VAR_ID, Effect_Allele_PH=ALT, BETA=beta, SE=se, P_VALUE=pval, N_PH=n_complete_samples) 32 | # } 33 | # if (!("N_PH" %in% names(df))) df$N_PH <- as.integer(NA) 34 | df %>% 35 | separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_") %>% 36 | mutate(SNP=paste(CHR, POS, sep=":")) %>% 37 | select(SNP, Effect_Allele_PH, N_PH, BETA, SE, P_VALUE) %>% 38 | right_join(variant_df, by="SNP", suffix=c(".gwas", ".trait")) %>% 39 | mutate(z=BETA / SE, # First, calculate z-score magnitude 40 | z=case_when( # Next, align z-score sign with GWAS phenotype-raising allele 41 | Effect_Allele_PH == Risk_Allele ~ z, 42 | Effect_Allele_PH == Nonrisk_Allele ~ -z, 43 | TRUE ~ as.numeric(NA) # For example, if trait effect allele matches neither REF nor ALT from GWAS 44 | )) %>% 45 | select(SNP, z, N_PH, P_VALUE) 46 | } 47 | 48 | print("Retrieving risk alleles from the original GWAS summary statistics...") 49 | gwas_ss <- fread(gwas_ss_file, data.table=F, stringsAsFactors=F) %>% 50 | mutate(Risk_Allele=ifelse(BETA > 0, ALT, REF), 51 | Nonrisk_Allele=ifelse(BETA > 0, REF, ALT)) %>% 52 | select(SNP, Risk_Allele, Nonrisk_Allele) 53 | variant_df <- tibble(VAR_ID=variant_vec) %>% 54 | separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_") %>% 55 | mutate(SNP=paste(CHR, POS, sep=":")) %>% 56 | select(SNP) %>% 57 | inner_join(gwas_ss, by="SNP") 58 | print(paste0(nrow(variant_df), " of ", length(variant_vec), 59 | " variants are available in the primary GWAS.")) 60 | 61 | 62 | 
print("Retrieving z-scores and sample sizes for each trait...") 63 | trait_df_long <- lapply(names(trait_ss_files), read_single_trait, variant_df) %>% 64 | setNames(names(trait_ss_files)) %>% 65 | bind_rows(.id="trait") # Bind all processed trait datasets into a single "long" data frame 66 | 67 | z_df_wide <- trait_df_long %>% 68 | select(trait, SNP, z) %>% 69 | pivot_wider(names_from="trait", values_from="z") 70 | z_mat <- as.matrix(z_df_wide[, -1]) 71 | rownames(z_mat) <- z_df_wide$SNP 72 | 73 | N_df_wide <- trait_df_long %>% 74 | select(trait, SNP, N_PH) %>% 75 | pivot_wider(names_from="trait", values_from="N_PH") 76 | N_mat <- as.matrix(N_df_wide[, -1]) 77 | rownames(N_mat) <- N_df_wide$SNP 78 | 79 | # P_df <- trait_df_long %>% 80 | # group_by(trait) %>% 81 | # summarise(minP=min(P_VALUE, na.rm=T)) 82 | 83 | list(z_mat=z_mat, N_mat=N_mat) 84 | # minP_vec=setNames(P_df$minP, P_df$trait)) 85 | } 86 | 87 | 88 | prep_z_matrix <- function(z_mat, N_mat) { 89 | 90 | # Given a matrix of z-scores (N_variants x M_traits) and vector of median 91 | # sample sizes per trait: 92 | # 1) perform final pre-processing steps before bNMF clustering: 93 | # trait filtering by p-value, trait pruning based on correlation, 94 | # and z-score scaling based on sample size 95 | # 2) expand N x M matrix into N x 2M non-negative matrix 96 | 97 | # Filter traits by p-value (min. 
p-value < 0.05/N_variants) 98 | minP_vec <- apply(z_mat, 2, function(x) min(2 * pnorm(abs(x), lower.tail=F), na.rm=T)) 99 | print(paste0("Removing traits with no variant having p < 0.05 / # variants: ", 100 | paste(colnames(z_mat)[minP_vec >= 0.05 / nrow(z_mat)], 101 | collapse=", "))) 102 | z_mat <- z_mat[, minP_vec < 0.05 / nrow(z_mat)] 103 | 104 | # Prune traits by correlation (remove traits with Pearson |r| > 0.85) 105 | trait_cor_mat <- cor(z_mat, use="pairwise.complete.obs") # Trait-trait correlation matrix 106 | trait_min_pvals <- minP_vec[names(minP_vec) %in% colnames(z_mat)] # Remove filtered traits 107 | remaining_traits <- names(sort(trait_min_pvals)) 108 | keep_traits <- c() 109 | while (length(remaining_traits) > 0) { 110 | # print(remaining_traits[1]) 111 | keep_traits <- c(keep_traits, remaining_traits[1]) 112 | remaining_traits <- setdiff( 113 | remaining_traits, 114 | rownames(trait_cor_mat)[abs(trait_cor_mat[, remaining_traits[1]]) >= 0.85] 115 | ) 116 | } 117 | pruned_traits <- setdiff(colnames(z_mat), keep_traits) 118 | print(paste("Traits removed in pruning process:", 119 | paste(pruned_traits, collapse=", "))) 120 | z_mat <- z_mat[, keep_traits] 121 | 122 | # Adjust z-scores by sample size for each variant-trait combo 123 | # i.e. 
(z = z / sqrt(medN) * mean(sqrt(medN_all_traits))) 124 | print("Performing sample size adjustment...") 125 | medN_vec <- apply(N_mat[, colnames(z_mat)], 2, median, na.rm=T) 126 | z_mat <- z_mat / sqrt(N_mat[, colnames(z_mat)]) * mean(sqrt(medN_vec)) 127 | 128 | 129 | # Replace missing values with zero 130 | print("Replacing remaining missing values with zero...") 131 | print(paste0(sum(is.na(z_mat)), " missing values were replaced.")) 132 | z_mat[is.na(z_mat)] <- 0 133 | 134 | # Expand into N x 2M non-negative matrix 135 | print("Expanding z-score matrix into non-negative matrix (N-variants x 2M-traits)...") 136 | z_mat_pos <- z_mat 137 | z_mat_pos[z_mat_pos < 0] <- 0 138 | colnames(z_mat_pos) <- paste0(colnames(z_mat), "_pos") 139 | z_mat_neg <- -z_mat 140 | z_mat_neg[z_mat_neg < 0] <- 0 141 | colnames(z_mat_neg) <- paste0(colnames(z_mat), "_neg") 142 | final_z_mat <- cbind(z_mat_pos, z_mat_neg) 143 | 144 | # Write N x M and N x 2M matrices 145 | saveRDS(z_mat, "z_score_mat.rds") 146 | saveRDS(final_z_mat, "z_score_mat_nonnegative.rds") 147 | 148 | final_z_mat 149 | } 150 | -------------------------------------------------------------------------------- /scripts/archive/process_traits.R: -------------------------------------------------------------------------------- 1 | library(readxl) 2 | library(tidyverse) 3 | 4 | 5 | fetch_variant_fracs <- function(trait_filepath_list, gwas_varIDs) { 6 | 7 | # Given a named list of filepaths to trait GWAS summary stats and a list of 8 | # primary GWAS variant IDs, return a named vector indicating the fraction of 9 | # those traits having each variant 10 | 11 | trait_variants <- lapply(trait_filepaths, function(f) { 12 | read_tsv(f, col_types=cols_only(VAR_ID="c"), n_max=100000) %>% 13 | filter(VAR_ID %in% gwas_varIDs) 14 | }) 15 | var_presence_df <- do.call(bind_rows, c(trait_variants, .id="trait")) %>% 16 | mutate(var_present=1) %>% 17 | pivot_wider(names_from="trait", values_from="var_present", values_fill=0) # 0/1 values: 
is variant (row) present in trait GWAS (column)? 18 | var_presence_mat <- as.matrix(var_presence_df[, 2:ncol(var_presence_df)]) 19 | rownames(var_presence_mat) <- var_presence_df$VAR_ID 20 | var_fracs <- rowSums(var_presence_mat) / length(trait_variants) # Fraction of traits having each variant 21 | var_fracs 22 | } 23 | 24 | 25 | traits_doc <- read_excel("../data/clustering_data_source.xlsx", sheet=3) 26 | trait_filepaths <- setNames(traits_doc$full_path, traits_doc$trait_name) 27 | 28 | gwas_varIDs <- sample(chosen_variants$varID, size=100) 29 | 30 | variant_fracs <- fetch_variant_fracs(trait_filepaths, gwas_varIDs) 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /scripts/archive/proximal_preprocessing.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | 4 | ##### PREPROCESSING ##### 5 | 6 | prep_z_matrix <- function(z_mat, minP_vec, medN_vec) { 7 | 8 | # Given a matrix of z-scores (N_variants x M_traits) and vector of median 9 | # sample sizes per trait: 10 | # 1) perform final pre-processing steps before bNMF clustering: 11 | # trait filtering by p-value, trait pruning based on correlation, 12 | # and z-score scaling based on sample size 13 | # 2) expand N x M matrix into 2N x M non-negative matrix 14 | 15 | # Filter traits by p-value (min. 
p-value < 0.05/N_variants) 16 | stopifnot(all(colnames(z_mat) == names(minP_vec))) 17 | print(paste0("Removing traits with no variant having p < 0.05 / # variants: ", 18 | paste(colnames(z_mat)[minP_vec >= 0.05 / nrow(z_mat)], 19 | collapse=", "))) 20 | z_mat <- z_mat[, minP_vec < 0.05 / nrow(z_mat)] 21 | 22 | # Prune traits by correlation (remove traits with Pearson |r| > 0.85) 23 | trait_cor_mat <- cor(z_mat, use="pairwise.complete.obs") # Trait-trait correlation matrix 24 | trait_min_pvals <- minP_vec[names(minP_vec) %in% colnames(z_mat)] # Remove filtered traits 25 | remaining_traits <- names(sort(trait_min_pvals)) 26 | keep_traits <- c() 27 | while (length(remaining_traits) > 0) { 28 | # print(remaining_traits[1]) 29 | keep_traits <- c(keep_traits, remaining_traits[1]) 30 | remaining_traits <- setdiff( 31 | remaining_traits, 32 | rownames(trait_cor_mat)[abs(trait_cor_mat[, remaining_traits[1]]) >= 0.85] 33 | ) 34 | } 35 | pruned_traits <- setdiff(colnames(z_mat), keep_traits) 36 | print(paste("Traits removed in pruning process:", 37 | paste(pruned_traits, collapse=", "))) 38 | z_mat <- z_mat[, keep_traits] 39 | 40 | # Adjust z-scores by sample size for each variant-trait combo (z = z / sqrt(N)) 41 | # z_mat <- t(t(z_mat) / sqrt(N_vec[match(pruned_traits, colnames(z_mat))])) 42 | # Multiply full matrix by mean(sqrt(median(N))) (a single number for the whole matrix) 43 | print(paste0("Multiplying sample size-adjusted z-score matrix by ", 44 | round(mean(sqrt(medN_vec[pruned_traits]))), " (i.e. 
mean(sqrt(median(N))))")) 45 | z_mat <- z_mat * mean(sqrt(medN_vec[pruned_traits])) 46 | 47 | # Replace missing values with zero 48 | z_mat[is.na(z_mat)] <- 0 49 | 50 | # Expand into 2N x M non-negative matrix 51 | z_mat_pos <- z_mat 52 | z_mat_pos[z_mat_pos < 0] <- 0 53 | colnames(z_mat_pos) <- paste0(colnames(z_mat), "_pos") 54 | z_mat_neg <- -z_mat 55 | z_mat_neg[z_mat_neg < 0] <- 0 56 | colnames(z_mat_neg) <- paste0(colnames(z_mat), "_neg") 57 | final_z_mat <- cbind(z_mat_pos, z_mat_neg) 58 | 59 | final_z_mat 60 | } -------------------------------------------------------------------------------- /scripts/archive/run_bNMF_2021.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | 4 | ########################################################################## 5 | # Copyright (c) 2017, Broad Institute 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are 8 | # met: 9 | # Redistributions of source code must retain the above copyright 10 | # notice, this list of conditions and the following disclaimer. 11 | # Redistributions in binary form must reproduce the above copyright 12 | # notice, this list of conditions and the following disclaimer in 13 | # the documentation and/or other materials provided with the 14 | # distribution. 15 | # Neither the name of the Broad Institute nor the names of its 16 | # contributors may be used to endorse or promote products derived 17 | # from this software without specific prior written permission. 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | ######################################################################### 30 | 31 | ###################################################################### 32 | # Bayesian NMF algorithms for clustering 33 | ###################################################################### 34 | # For implementation details see the paper 35 | # Udler MS, Kim J, von Grotthuss M, 36 | # Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018) 37 | # Type 2 diabetes genetic loci informed by multi-trait 38 | # associations point to disease mechanisms and 39 | # subtypes: A soft clustering analysis. PLoS Med 15 40 | # (9): e1002654. 41 | ########################### 42 | # For details on the original algorithms 43 | # see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence. 44 | # IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013). 
45 | ###################################################################### 46 | 47 | BayesNMF.L2EU <- function( 48 | V0, n.iter=10000, a0=10, tol=1e-7, K=15, K0=10, phi=1.0 49 | ) { 50 | 51 | # Bayesian NMF with half-normal priors for W and H 52 | # V0: input z-score matrix (variants x traits) 53 | # n.iter: Number of iterations for parameter optimization 54 | # a0: Hyper-parameter for inverse gamma prior on ARD relevance weights 55 | # tol: Tolerance for convergence of fitting procedure 56 | # K: Number of clusters to be initialized (algorithm may drive some to zero) 57 | # K0: Used for setting b0 (lambda prior hyper-parameter) -- should be equal to K 58 | # phi: Scaling parameter 59 | 60 | eps <- 1.e-50 61 | del <- 1.0 62 | active_nodes <- colSums(V0) != 0 63 | V0 <- V0[, active_nodes] 64 | V <- V0 - min(V0) 65 | Vmin <- min(V) 66 | Vmax <- max(V) 67 | N <- dim(V)[1] 68 | M <- dim(V)[2] 69 | 70 | W <- matrix(runif(N * K) * Vmax, ncol=K) 71 | H <- matrix(runif(M * K) * Vmax, ncol=M) 72 | I <- array(1, dim=c(N, M)) 73 | V.ap <- W %*% H + eps 74 | 75 | phi <- sd(V)^2 * phi 76 | C <- (N + M) / 2 + a0 + 1 77 | b0 <- 3.14 * (a0 - 1) * mean(V) / (2 * K0) 78 | lambda.bound <- b0 / C 79 | lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C 80 | lambda.cut <- lambda.bound * 1.5 81 | 82 | n.like <- list() 83 | n.evid <- list() 84 | n.error <- list() 85 | n.lambda <- list() 86 | n.lambda[[1]] <- lambda 87 | iter <- 2 88 | count <- 1 89 | while (del >= tol & iter < n.iter) { 90 | H <- H * (t(W) %*% V) / 91 | (t(W) %*% V.ap + phi * H * matrix(rep(1 / lambda, M), ncol=M) + eps) 92 | V.ap <- W %*% H + eps 93 | W <- W * (V %*% t(H)) / 94 | (V.ap %*% t(H) + phi * W * t(matrix(rep(1 / lambda, N), ncol=N)) + eps) 95 | V.ap <- W %*% H + eps 96 | lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C 97 | del <- max(abs(lambda - n.lambda[[iter - 1]]) / n.lambda[[iter - 1]]) 98 | like <- sum((V - V.ap)^2) / 2 99 | n.like[[iter]] <- like 100 | n.evid[[iter]] <- like + 
phi * sum((0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / 101 | lambda + C * log(lambda)) 102 | n.lambda[[iter]] <- lambda 103 | n.error[[iter]] <- sum((V - V.ap)^2) 104 | if (iter %% 100 == 0) { 105 | cat(iter, n.evid[[iter]], n.like[[iter]], n.error[[iter]], del, 106 | sum(colSums(W) != 0), sum(lambda >= lambda.cut), '\n') 107 | } 108 | iter <- iter + 1 109 | } 110 | return(list( 111 | W, # Variant weight matrix (N x K) 112 | H, # Trait weight matrix (K x M) 113 | n.like, # List of reconstruction errors (sum of squared errors / 2) per iteration 114 | n.evid, # List of negative log-likelihoods per iteration 115 | n.lambda, # List of lambda vectors (shared weights for each of K clusters, some ~0) per iteration 116 | n.error # List of reconstruction errors (sum of squared errors) per iteration 117 | )) 118 | } 119 | 120 | 121 | run_bNMF <- function(z_mat, n_reps=10, random_seed=1, ...) { 122 | 123 | # Given an input matrix as created by prep_z_matrix(), run the bNMF procedure 124 | # a series of times to generate results and evaluate cluster stability 125 | 126 | print(paste0("Running bNMF clustering procedure (", n_reps, " iterations)...")) 127 | set.seed(random_seed) 128 | bnmf_reps <- lapply(1:n_reps, function(r) { 129 | res <- BayesNMF.L2EU(z_mat, ...) 
130 | names(res) <- c("W", "H", "n.like", "n.evid", "n.lambda", "n.error") 131 | res 132 | }) 133 | bnmf_reps 134 | } 135 | 136 | 137 | summarize_bNMF <- function(bnmf_reps) { 138 | 139 | # Given output from bNMF (list of length N_iterations), 140 | # generate summary tables and plots 141 | 142 | make_run_summary <- function(reps) { 143 | 144 | # Given a list of bNMF iteration outputs, summarize the K choices and associated likelihoods across runs 145 | 146 | run_summary <- map_dfr(1:length(reps), function(i) { 147 | res <- reps[[i]] 148 | final_lambdas <- res$n.lambda[[length(res$n.lambda)]] 149 | tibble( 150 | run=i, 151 | K=sum(final_lambdas > min(final_lambdas)), # Assume that lambdas equal to the minimum lambda are ~ 0 152 | evid=res$n.evid[[length(res$n.evid)]] # Evidence = -log_likelihood 153 | ) 154 | }) %>% 155 | arrange(evid) 156 | 157 | unique.K <- table(run_summary$K) 158 | n.K <- length(unique.K) # Number of distinct K 159 | MAP.K.run <- sapply(names(unique.K), function(k) { # bNMF run index with the maximum posterior for given K 160 | tmp <- run_summary[run_summary$K == k, ] 161 | tmp$run[which.min(tmp$evid)] 162 | }) 163 | 164 | list(run_tbl=run_summary, unique.K=unique.K, MAP.K.run=MAP.K.run) 165 | } 166 | 167 | print("Summarizing bNMF results...") 168 | 169 | print("Writing table of chosen K across iterations...") 170 | run_summary <- make_run_summary(bnmf_reps) 171 | write_tsv(run_summary$run_tbl, "run_summary.txt") 172 | 173 | n.K <- length(run_summary$unique.K) # Number of distinct K 174 | 175 | get_W <- function(clustering) { 176 | W_raw <- clustering$W 177 | W_raw[, colSums(W_raw > 1e-10) > 0] 178 | } 179 | 180 | get_H <- function(clustering) { 181 | H_raw <- clustering$H 182 | H_raw[rowSums(H_raw > 1e-10) > 0, ] 183 | } 184 | 185 | print("Plotting variant and trait contributions...") 186 | silent <- sapply(names(run_summary$unique.K), function(k) { # Create heatmaps for MAP iteration for each K 187 | res <- 
bnmf_reps[[run_summary$MAP.K.run[as.character(k)]]] 188 | W <- res$W[, colSums(res$W) != 0] # feature-cluster association matrix 189 | H <- res$H[rowSums(res$H) != 0, ] # cluster-gene association matrix 190 | W[W < 1.e-10] <- 0 191 | H[H < 1.e-10] <- 0 192 | 193 | W0 <- data.frame(W) 194 | W0[, "variant"] <- rownames(W) 195 | H0 <- data.frame(H) 196 | H0[, "cluster"] <- rownames(H) 197 | 198 | write_tsv(paste0("L2EU.W.mat.K", k, ".txt")) 199 | write_tsv(paste0("L2EU.H.mat.K", k, ".txt")) 200 | 201 | mat.reconstructed <- W %*% H # reconstructed matrix == approximation for the input matrix 202 | 203 | # Setup for plotting 204 | scale0 <- 0.8 205 | scale <- 1 206 | g.ordering <- paste("G", seq(1:ncol(W)), sep="") 207 | color.axis <- "black" 208 | .theme_ss <- theme_bw(base_size=12) + 209 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8 * scale, 210 | family="mono", face='bold', color=color.axis), 211 | axis.text.y = element_text(hjust = 0.5,size=12 * scale, family="mono",face='bold',color=color.axis), 212 | axis.text = element_text(size = 12 * scale, family = "mono",color=color.axis), 213 | axis.title=element_text(face="bold", size=12 * scale,color="black"), 214 | plot.title=element_text(face="bold", size=12 * scale)) 215 | 216 | # Plot W matrix (feature activities) 217 | W_hc <- hclust(dist(W, method="euclidean"), method="ward.D") 218 | W_variant.ordering <- W_hc$labels[W_hc$order] 219 | W_plt_df <- W %>% 220 | as.data.frame() %>% 221 | rownames_to_column(var="variant") %>% 222 | gather(key="cluster", value="activity", -variant) %>% 223 | mutate(variant=factor(variant, levels=W_variant.ordering), 224 | cluster=factor(cluster, 225 | levels=paste0("V", 1:ncol(W)))) 226 | W_plt <- ggplot(W_plt_df, aes(x=variant, y=cluster, fill=activity)) + 227 | geom_tile() + 228 | scale_fill_gradient2(low="white", high ="black", name=paste("Activity", sep="")) + 229 | #p = p + 
scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0)) 230 | .theme_ss + 231 | ggtitle(paste0("Variant Association to Clusters (k=", k, ")")) + 232 | ylab("Cluster") + xlab("Variant") + 233 | theme(axis.title.x = element_text(face="bold",colour="black", size=12 * scale0)) + 234 | theme(axis.title.y = element_text(face="bold",colour="black", size=12 * scale0)) + 235 | theme(legend.position="right") + 236 | theme(legend.key.size = unit(0.5, "cm")) 237 | ggsave(paste0("W_plot_K", k, ".pdf"), plot=W_plt) 238 | 239 | H_hc <- hclust(dist(t(H), method="euclidean"), method="ward.D") 240 | H_trait.ordering <- H_hc$labels[H_hc$order] 241 | H_plt_df <- t(H) %>% 242 | as.data.frame() %>% 243 | rownames_to_column(var="trait") %>% 244 | gather(key="cluster", value="activity", -trait) %>% 245 | mutate(cluster=factor(cluster, levels=paste0("V", 1:nrow(H))), 246 | trait=factor(trait, levels=H_trait.ordering)) 247 | H_plt <- ggplot(H_plt_df, aes(x=trait, y=cluster, fill=activity)) + 248 | geom_tile() + 249 | scale_fill_gradient2(low="white", high ="black", name=paste("Activity", sep="")) + 250 | #p = p + scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0)) 251 | .theme_ss + 252 | ggtitle(paste0("Variant Association to Clusters (k=", k, ")")) + 253 | ylab("Cluster") + xlab("Trait") + 254 | theme(axis.title.x=element_text(face="bold", colour="black", size=12 * scale0)) + 255 | theme(axis.title.y=element_text(face="bold", colour="black", size=12 * scale0)) + 256 | theme(legend.position="right") + 257 | theme(legend.key.size = unit(0.5, "cm")) 258 | ggsave(paste0("H_plot_K", k, ".pdf"), plot=H_plt) 259 | }) 260 | } 261 | -------------------------------------------------------------------------------- /scripts/archive/test_pipeline_2021.R: -------------------------------------------------------------------------------- 1 | # This script is intended to run 
the full pipeline for bNMF clustering based 2 | # on summary statistics and test its agreement with scripts that are 3 | # currently in place. 4 | 5 | # NOTE: to run Tabix on UGER, must start with "ssh gsa4; use .zlib-1.2.6" 6 | 7 | source("choose_variants.R") # fld_pruning, count_traits_per_variant, fina_variants_needing_proxies, & choose_potential_proxies 8 | source("prep_bNMF.R") # fetch_summary_stats & prep_z_matrix 9 | source("run_bNMF.R") # run_bNMF & summarize_bNMF 10 | 11 | gwas_traits <- readxl::read_excel("../data/clustering_data_source.xlsx", sheet="gwas_traits") 12 | trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait_name) 13 | trait_ss_files <- trait_ss_files[!grepl("MAGIC", names(trait_ss_files))] # Some MAGIC GWAS files don't have N_PH field 14 | 15 | initial_t2d_snps <- read_tsv("../data/T2D_initial_vars_pval.txt") 16 | set.seed(1) 17 | initial_t2d_snps <- sample_n(initial_t2d_snps, size=1000) %>% 18 | select(VAR_ID=VAR_ID_hg19, PVALUE) 19 | 20 | rsID_map_file <- "/humgen/diabetes2/users/clairekim/list_VARID_rsID_updated.txt" # From dbSNP v1.38 -- maps positional IDs to rsIDs 21 | 22 | # Variant choice steps 23 | 24 | pruned_variants <- ld_pruning(initial_t2d_snps, rsID_map_file) 25 | 26 | var_nonmissingness <- count_traits_per_variant(pruned_variants$VAR_ID, trait_ss_files) 27 | 28 | proxies_needed_df <- find_variants_needing_proxies(pruned_variants, var_nonmissingness, 29 | rsID_map_file) 30 | 31 | tabix_path <- "/humgen/diabetes2/users/mvg/VariantClustering/tabix-0.2.6/tabix" 32 | ld_file <- "/humgen/diabetes2/users/mvg/VariantClustering/LD_EUR.tsv.bgz" 33 | final_variant_set <- choose_proxies( 34 | proxies_needed_df, 35 | tabix_path, 36 | ld_file, 37 | rsID_map_file, 38 | trait_ss_files, 39 | pruned_variants 40 | ) 41 | 42 | # Prep bNMF steps 43 | 44 | t2d_ss_filepath <- "/humgen/diabetes2/users/clairekim/Mahajan.NatGenet2018b.T2D.European_formatted.txt" 45 | 46 | initial_zscore_matrices <- fetch_summary_stats( 47 | 
final_variant_set, 48 | t2d_ss_filepath, 49 | trait_ss_files 50 | ) 51 | 52 | final_zscore_matrix <- prep_z_matrix(initial_zscore_matrices$z_mat, 53 | initial_zscore_matrices$N_mat) 54 | 55 | # Run bNMF steps 56 | 57 | bnmf_reps <- run_bNMF(final_zscore_matrix, n_reps=10) 58 | 59 | summarize_bNMF(bnmf_reps) 60 | -------------------------------------------------------------------------------- /scripts/bNMF_example_pipeline.R: -------------------------------------------------------------------------------- 1 | # This script shows how to run the bNMF clustering pipeline using toy datasets. 2 | 3 | #---- 4 | start=Sys.time() 5 | 6 | # load requires packages 7 | install.packages("pacman") 8 | pacman::p_load(tidyverse, data.table, readxl, magrittr, dplyr, strex, 9 | rstudioapi, DT, kableExtra, GenomicRanges) 10 | 11 | if (!require("BiocManager", quietly = TRUE)) 12 | install.packages("BiocManager") 13 | 14 | BiocManager::install("GenomicRanges") 15 | BiocManager::install("Homo.sapiens") 16 | 17 | # load project scripts containing bNMF functions 18 | source("../scripts/choose_variants.R") # ld_pruning, count_traits_per_variant, fina_variants_needing_proxies, & choose_potential_proxies 19 | source("../scripts/prep_bNMF.R") # fetch_summary_stats & prep_z_matrix 20 | source("../scripts/run_bNMF.R") # run_bNMF & summarize_bNMF 21 | 22 | setwd(dirname(getActiveDocumentContext()$path)) 23 | 24 | #---- 25 | 26 | # USER INPUTS!!! 
27 | project_dir = './test_results' # path to where you want results saved 28 | user_token = 'YOUR_LDLINK_API_TOKEN' # token for LDlinkR api 29 | 30 | # create project folder 31 | dir.create(project_dir) 32 | 33 | 34 | #---- 35 | 36 | # SECTION 1: PULL IN GWAS INFORMATION 37 | 38 | data_dir = "../example_data/" 39 | rsID_map_file <- file.path(data_dir, "rsID_map_example.txt") # From dbSNP v1.38 -- maps positional IDs to rsIDs 40 | 41 | # GWAS for main trait 42 | gwas <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"), 43 | sheet="main_gwas") %>% 44 | data.frame() 45 | 46 | # GWAS for clustering traits 47 | gwas_traits <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"), 48 | sheet="trait_gwas") 49 | 50 | # GWAS to be used for final allele alignment 51 | main_ss_filepath <- gwas %>% filter(largest=="Yes") %>% pull(full_path) 52 | 53 | gwas_ss_files <- setNames(gwas$full_path, gwas$study) 54 | trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait) 55 | trait_ss_size <- setNames(gwas_traits$sample_size, gwas_traits$trait) 56 | 57 | #---- 58 | 59 | # SECTION 2: PULL SIGNIFICANT VARIANTS FROM MAIN TRAIT GWAS 60 | 61 | # P-value threshold for variants in main trait 62 | PVCUTOFF = 5e-8 63 | 64 | n_gwas <- length(gwas_ss_files) 65 | 66 | vars_sig = data.frame(VAR_ID = as.character(), 67 | P_VALUE = as.numeric(), 68 | Risk_Allele=as.character(), 69 | GWAS=as.character()) 70 | 71 | print(sprintf("Pulling significant SNPs w/ pval<%.1e from %i T2D GWAS...", PVCUTOFF, n_gwas)) 72 | 73 | for(i in 1:n_gwas) { 74 | print(paste0("...Reading ", names(gwas_ss_files)[i], "...")) 75 | 76 | vars <- fread(gwas_ss_files[i], data.table = F, stringsAsFactors=F) 77 | 78 | if (!"BETA" %in% colnames(vars)){ 79 | print("Converting Odds Ratio to Log Odds Ratio...") 80 | vars <- vars %>% 81 | mutate(BETA = log(as.numeric(ODDS_RATIO))) 82 | } 83 | vars <- vars %>% 84 | filter(as.numeric(P_VALUE) <= PVCUTOFF) %>% 85 | 
subset(grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]", VAR_ID)) %>% 86 | separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_", remove = F) %>% 87 | mutate(Risk_Allele = ifelse(BETA>=0, ALT, REF)) %>% 88 | mutate(GWAS = gwas$study[i]) %>% 89 | select(VAR_ID, P_VALUE, Risk_Allele, GWAS) 90 | 91 | print(nrow(vars)) 92 | vars_sig = rbind(vars_sig, vars) 93 | } 94 | print(paste("No. total SNPs below pval cutoff:",nrow(vars_sig))) 95 | 96 | # remove duplicates 97 | vars_sig_uniq <- vars_sig %>% 98 | arrange(VAR_ID, P_VALUE) %>% 99 | filter(!duplicated(VAR_ID)) %>% # so we remove duplicates with the higher pvalue 100 | rename(PVALUE = P_VALUE) 101 | print(paste("No. unique SNPs:",nrow(vars_sig_uniq))) 102 | 103 | # remove indels 104 | vars_sig_noIndels <- vars_sig_uniq %>% 105 | separate(VAR_ID, into=c("CHR","POS","REF","ALT"),sep="_",remove = F) %>% 106 | mutate(alleles = paste0(REF,ALT)) %>% 107 | subset(nchar(alleles)==2 | (nchar(alleles)<=4 & grepl(",",alleles))) %>% 108 | select(VAR_ID, PVALUE, Risk_Allele, GWAS) 109 | print(paste("No. 
SNPs excluding indels:",nrow(vars_sig_noIndels))) 110 | 111 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 112 | 113 | # #---- 114 | 115 | # SECTION 3: VARIANT PRUNING (LD-BASED) 116 | 117 | # LD pruning 118 | print("LD-pruning using EUR panel in LDlinkR::SNPclip...") 119 | ld_prune(df_snps = vars_sig_noIndels, 120 | pop = "EUR", 121 | output_dir = project_dir, 122 | r2 = 0.05, 123 | maf=0.001, 124 | my_token = user_token, 125 | chr = c(1:22)) 126 | 127 | #---- 128 | 129 | # combine LD-pruning results 130 | print("Combining SNP.clip results...") 131 | ld_files <- list.files(path = project_dir, 132 | pattern = "^snpClip_results", 133 | full.names = T) 134 | 135 | df_clipped_res = data.frame("RS_Number"=as.character(), 136 | "Position"=as.character(), 137 | "Alleles"= as.character(), 138 | "Details"=as.character()) 139 | 140 | rename_cols_clipped <- c(RS_Number="RS Number", 141 | Position_grch37="Position") 142 | 143 | for (ld_file in ld_files){ 144 | df <- fread(ld_file, stringsAsFactors = F, data.table = F) %>% 145 | dplyr::rename(any_of(rename_cols_clipped)) 146 | df_clipped_res <- rbind(df_clipped_res, df) 147 | } 148 | 149 | df_clipped_kept <- df_clipped_res %>% 150 | filter(Details=="Variant kept.") 151 | 152 | pruned_vars <- vars_sig_noIndels %>% 153 | separate(VAR_ID, into=c("CHR","POS","REF","ALT"), sep = "_",remove = F) %>% 154 | mutate(ChrPos = paste0("chr", CHR, ":", POS)) %>% 155 | filter(ChrPos %in% df_clipped_kept$Position) 156 | print(sprintf("T2D SNPs pruned from %i to %i...", nrow(vars_sig_noIndels), nrow(pruned_vars))) 157 | 158 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 159 | 160 | #---- 161 | # SECTION 4: VARIANT MISSINGNESS 162 | 163 | print("Searching for variants in trait GWAS...") 164 | gwas_variants <- pruned_vars$VAR_ID 165 | df_Ns <- count_traits_per_variant(gwas_variants, 166 | trait_ss_files) 167 | 168 | # fix column names 169 | df_Ns_rev <- df_Ns %>% 170 | column_to_rownames("VAR_ID") %>% 171 
| set_colnames(names(trait_ss_files)) 172 | 173 | print("Calculating variant missingess in traits...") 174 | variant_counts_df <- data.frame(VAR_ID=rownames(df_Ns_rev), 175 | frac=rowSums(!is.na(df_Ns_rev[,names(trait_ss_files)]))/length(trait_ss_files)) 176 | var_nonmissingness <- ifelse( 177 | gwas_variants %in% variant_counts_df$VAR_ID, 178 | # if in counts data frame, take the non-missing fraction: 179 | variant_counts_df$frac[match(gwas_variants, variant_counts_df$VAR_ID)], 180 | # else not in data frame, so non-missing fraction is 0: 181 | 0 182 | ) 183 | var_nonmissingness <- setNames(var_nonmissingness, gwas_variants) 184 | 185 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 186 | 187 | #---- 188 | # SECTION 5: DETERMINE VARIANTS NEEDING PROXIES 189 | 190 | print("Identifying variants needing proxies...") 191 | proxies_needed_df <- find_variants_needing_proxies(pruned_vars, 192 | var_nonmissingness, 193 | rsID_map_file, 194 | missing_cutoff = 0.8) 195 | 196 | #---- 197 | # SECTION 6: PROXY SEARCH 198 | 199 | print("Searching for proxies with TopLD API...") 200 | proxy_search_results <- choose_proxies(need_proxies = proxies_needed_df, 201 | method="LDlink", 202 | LDlink_token = user_token, 203 | topLD_path = api_path, 204 | rsID_map_file = rsID_map_file, 205 | trait_ss_files = trait_ss_files, 206 | pruned_variants = pruned_vars, 207 | population="EUR" 208 | ) 209 | 210 | df_proxies <- proxy_search_results %>% 211 | dplyr::select(VAR_ID, proxy_VAR_ID) %>% 212 | dplyr::inner_join(pruned_vars[,c("VAR_ID","GWAS")], by="VAR_ID") %>% 213 | mutate(Risk_Allele=NA, PVALUE=NA) 214 | 215 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 216 | 217 | #---- 218 | 219 | # SECTION 7: Fetch summary statistics for SNPs in trait GWAS 220 | 221 | print("Prepping input for fetch_summary_stats...") 222 | 223 | # Remove SNPs from pruned_vars that needed proxies 224 | df_orig_snps <- pruned_vars %>% 225 | filter(!VAR_ID %in% 
proxies_needed_df$VAR_ID) 226 | 227 | # Join with pruned vars so we can get the original GWAS where the proxy came from 228 | # and add the necessary columns (if you don't care about the original GWAS, can 229 | # ski the inner_join step and just set GWAS=NA) 230 | df_proxies <- proxy_search_results %>% 231 | dplyr::select(VAR_ID, proxy_VAR_ID) %>% 232 | dplyr::inner_join(pruned_vars[,c("VAR_ID","GWAS")], by="VAR_ID") %>% 233 | mutate(Risk_Allele=NA, PVALUE=NA) 234 | 235 | # combine original SNPs with proxy SNP 236 | # MAKE SURE assign proxy_VAR_ID to VAR_ID in df_proxies!!! 237 | df_input_snps <- rbind(df_orig_snps %>% select(VAR_ID, PVALUE, Risk_Allele,GWAS), 238 | df_proxies %>% select(VAR_ID=proxy_VAR_ID, PVALUE, Risk_Allele,GWAS)) %>% 239 | arrange(PVALUE) %>% 240 | filter(!duplicated(VAR_ID)) 241 | 242 | cat(sprintf("\n%i original SNPs...\n", nrow(df_orig_snps))) 243 | cat(sprintf("\n%i proxy SNPs...\n", nrow(df_proxies))) 244 | cat(sprintf("\n%i total unique SNPs!\n", nrow(df_input_snps))) 245 | 246 | initial_zscore_matrices <- fetch_summary_stats( 247 | df_input_snps, 248 | main_ss_filepath, 249 | trait_ss_files, 250 | trait_ss_size, 251 | pval_cutoff=0.05 252 | ) 253 | 254 | 255 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 256 | system(sprintf("mv alignment_GWAS_summStats.csv %s", project_dir)) 257 | 258 | #---- 259 | 260 | # Section 8: get rsIDs for final variant set 261 | 262 | print("Getting rsIDs for final snps and saving to results...") 263 | z_mat <- initial_zscore_matrices$z_mat 264 | N_mat <- initial_zscore_matrices$N_mat 265 | 266 | df_var_ids <- df_input_snps %>% 267 | separate(VAR_ID, into=c("Chr","Pos","Ref","Alt"),sep="_",remove = F) %>% 268 | mutate(ChrPos=paste(Chr,Pos,sep = ":")) %>% 269 | subset(ChrPos %in% rownames(z_mat)) 270 | write(df_var_ids$VAR_ID,'my_snps.tmp') 271 | 272 | system(sprintf("grep -wFf my_snps.tmp %s > %s", 273 | rsID_map_file, file.path(project_dir, "rsID_map.txt"))) 274 | 275 | df_rsIDs <- 
fread(cmd=sprintf("grep -wFf my_snps.tmp %s",rsID_map_file), 276 | header = F, 277 | col.names = c("VAR_ID","rsID")) 278 | print(sprintf("rsIDs found for %i of %i SNPs...", nrow(df_rsIDs), nrow(df_var_ids))) 279 | 280 | df_rsIDs_final <- df_rsIDs %>% 281 | filter(VAR_ID %in% df_var_ids$VAR_ID) 282 | write_delim(x=df_rsIDs_final, 283 | file = file.path(project_dir, "rsID_map.txt"), 284 | col_names = T) 285 | 286 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 287 | 288 | #---- 289 | 290 | # Section 9: Fill missing data in z-score and N matrices 291 | 292 | df_snps <- df_input_snps %>% 293 | inner_join(df_rsIDs_final, by="VAR_ID") %>% 294 | data.frame() 295 | 296 | print("Searching for cover proxies for missing z-scores...") 297 | initial_zscore_matrices_final <- fill_missing_zscores(initial_zscore_matrices, 298 | df_snps, 299 | trait_ss_files, 300 | trait_ss_size, 301 | main_ss_filepath, 302 | rsID_map_file, 303 | method_fill="median", 304 | population="EUR") 305 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 306 | 307 | #---- 308 | 309 | # Section 10.) Generate non-negative z-score matrix 310 | 311 | prep_z_output <- prep_z_matrix(z_mat = initial_zscore_matrices_final$z_mat, 312 | N_mat = initial_zscore_matrices_final$N_mat, 313 | corr_cutoff = 0.8) 314 | 315 | # prep_z_output has two outputs: 316 | 317 | # 1.) The scaled, non-negative z-score matrix 318 | final_zscore_matrix <- prep_z_output$final_z_mat 319 | 320 | # 2.) 
Results from the trait filtering 321 | df_traits_filtered <- prep_z_output$df_traits 322 | write_csv(x = df_traits_filtered, 323 | file = file.path(project_dir,"df_traits.csv")) 324 | 325 | # prep_z_matrix also save trait correlation matrix to working dir, so move to project dir 326 | system(sprintf("mv trait_cor_mat.txt %s", project_dir)) 327 | 328 | print(sprintf("Final matrix: %i SNPs x %i traits", 329 | nrow(final_zscore_matrix), 330 | ncol(final_zscore_matrix)/2)) 331 | 332 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 333 | 334 | #---- 335 | 336 | # Section 11.) Run bNMF 337 | bnmf_reps <- run_bNMF(final_zscore_matrix, 338 | n_reps=25, 339 | tolerance = 1e-6) 340 | summarize_bNMF(bnmf_reps, dir_save=project_dir) 341 | 342 | save.image(file = file.path(project_dir, "pipeline_data.RData")) 343 | 344 | end=Sys.time() 345 | print("Total pipeline runtime:") 346 | print(end-start) 347 | 348 | #---- 349 | 350 | # format results 351 | k <- NULL 352 | if (is.null(k)){ 353 | html_filename <- "results_for_maxK.html" 354 | } else { 355 | html_filename <- sprintf("results_for_K_%i.html", k) 356 | } 357 | 358 | rmarkdown::render( 359 | './format_bNMF_results.Rmd', 360 | output_file = html_filename, 361 | params = list(main_dir = project_dir, 362 | k = k, 363 | loci_file="query", 364 | GTEx=F, 365 | my_traits=gwas_traits) 366 | ) 367 | 368 | 369 | #---- 370 | -------------------------------------------------------------------------------- /scripts/choose_variants.R: -------------------------------------------------------------------------------- 1 | packages = c('tidyverse', 'data.table', 'LDlinkR') 2 | invisible(lapply(packages, library, character.only = TRUE)) 3 | 4 | # CURRENT ASSUMPTIONS ABOUT FORMATTING: 5 | # - Genome build is hg19/GrCh37 6 | # - Summary statistic datasets are whitespace-delimited with columns: VAR_ID, BETA, SE, N_PH 7 | # - Variant IDs are all of the format: CHR_POS_REF_ALT 8 | 9 | 10 | 11 | snp_clump <- function(df_snps, 12 | 
id="VAR_ID", 13 | window=500000, 14 | chr=1:22, 15 | pos_range=c(1,Inf)) { 16 | 17 | clumped_snps <- c() 18 | uniq_chr <- sort(as.integer(unique(df_snps$CHR))) 19 | 20 | for (i in uniq_chr) { 21 | tmp <- df_snps %>% 22 | filter(CHR==i) %>% 23 | arrange(PVALUE) %>% 24 | mutate(POS = as.integer(POS)) %>% 25 | data.frame() 26 | 27 | if (i %in% chr) { 28 | print(sprintf("Clumping Chr. %i...",i)) 29 | 30 | do_clump <- tmp %>% 31 | filter(between(POS, pos_range[1], pos_range[2])) 32 | dont_clump <- tmp %>% 33 | filter(!between(POS, pos_range[1], pos_range[2])) 34 | clumped_snps <- c(clumped_snps, dont_clump[,id]) 35 | print(sprintf("Clumping %i variants, not clumping %i variants...", 36 | nrow(do_clump), nrow(dont_clump))) 37 | 38 | remaining_snps <- do_clump[,id] 39 | 40 | j=0 41 | while (length(remaining_snps)>0){ 42 | clumped_snps <- c(clumped_snps, remaining_snps[1]) 43 | 44 | cur_pos <- do_clump$POS[do_clump[id]==remaining_snps[1]] 45 | 46 | close_snps <- do_clump[abs(do_clump$POS-cur_pos)<=window, id] 47 | 48 | remaining_snps <- setdiff(remaining_snps, close_snps) 49 | 50 | j=j+1 51 | } 52 | num_clumped <- j + nrow(dont_clump) 53 | } else { 54 | print(sprintf("No clumping for Chr %i!",i)) 55 | num_clumped=nrow(tmp) 56 | clumped_snps <- c(clumped_snps, tmp[,id]) 57 | } 58 | cat(sprintf("Chr %i clumped from %i to %i SNPs\n\n",i, nrow(tmp), num_clumped)) 59 | } 60 | print(sprintf("No. 
SNPs after clumping: %i",length(clumped_snps))) 61 | return(clumped_snps) 62 | } 63 |

# ld_prune: LD-prune variants one chromosome at a time with LDlinkR.
#
# Arguments:
#   df_snps     data frame with a VAR_ID column (CHR_POS_REF_ALT) and a PVALUE
#               column; variants are submitted in ascending PVALUE order
#   pop         population code forwarded to LDlinkR
#   my_token    LDlink API token
#   r2, maf     r2 / MAF thresholds forwarded to LDlinkR::SNPclip
#   chr         chromosomes to process
#   output_dir  directory receiving one result table per non-empty chromosome
#               (snpClip_results_<pop>_chr<i>.txt); created when missing
#
# Side effects: writes the full LDlinkR result table for each chromosome and
# prints progress. The function's value is that of the final print("Done!").
ld_prune <- function(df_snps,
                     pop,
                     my_token,
                     r2=0.1,
                     maf=0.01,
                     chr=1:22,
                     output_dir="./") {

  # Make sure results can be written before spending any API calls
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive=TRUE, showWarnings=FALSE)
  }

  # Single definition of the SNPclip settings shared by the per-subset and
  # the final per-chromosome pruning passes
  run_snpclip <- function(snps) {
    LDlinkR::SNPclip(
      snps,
      pop = pop,
      r2_threshold = r2,
      maf_threshold = maf,
      token = my_token,
      file = FALSE,
      genome_build = "grch37")
  }

  snp_clip_input <- df_snps %>%
    separate(VAR_ID, into=c("CHR","POS","REF","ALT"), sep="_", remove=FALSE) %>%
    mutate(ChrPos = paste0("chr",CHR,":",POS)) %>%
    arrange(PVALUE)

  for (i in chr) {
    start <- Sys.time()
    cur_chr <- snp_clip_input %>%
      filter(CHR==i)
    print(sprintf("Chr %i (%i SNPs)",i, nrow(cur_chr)))

    if (nrow(cur_chr) == 0) {
      next
    }

    if (nrow(cur_chr) == 1) {
      # if only one SNP, use LDhap to get the variant info; the result is
      # reshaped so it carries the same Alleles/Details columns used below
      clipped_res <- LDlinkR::LDhap(snps = cur_chr %>% pull(ChrPos),
                                    pop = pop,
                                    token = my_token,
                                    genome_build = "grch37",
                                    table_type = "variant"
                                    ) %>%
        rename(Alleles=Allele_Frequency) %>%
        mutate(Details="Variant kept.")
    } else { # >1 SNP
      if (nrow(cur_chr) <= 5000) {
        cur_snps <- cur_chr %>%
          pull(ChrPos)
      } else {
        print("Chromosome has >5000 SNPs; breaking into sections...")
        var_df_list <- split(cur_chr, (seq_len(nrow(cur_chr))-1) %/% 5000)

        # Prune each subset on its own, then pool the survivors (as rsIDs)
        # for the final chromosome-wide pass
        subset_survivors <- vector("list", length(var_df_list))
        for (j in seq_along(var_df_list)) {
          print(sprintf("Pruning subset %i for chromosome %i...", j, i))
          clipped_snps <- run_snpclip(var_df_list[[j]] %>% pull(ChrPos)) %>%
            filter(Details=="Variant kept.") %>%
            pull(RS_Number)
          print(sprintf("Subset %i pruned to %i SNPs...", j, length(clipped_snps)))
          subset_survivors[[j]] <- clipped_snps
        }
        cur_snps <- unlist(subset_survivors)
      }

      print(sprintf("Performing final chromosomal pruning for %i SNPs...", length(cur_snps)))
      clipped_res <- run_snpclip(cur_snps)
    }

    fwrite(x=clipped_res,
           file = file.path(output_dir, sprintf("snpClip_results_%s_chr%i.txt", pop, i)),
           quote = FALSE,
           sep = "\t")

    df_clipped_final <- clipped_res %>%
      filter(Details=="Variant kept.")
    print(sprintf("Chr%i pruned from %i to %i SNPs...",i, nrow(cur_chr), nrow(df_clipped_final)))

    end <- Sys.time()
    print(end-start)
  }
  print("Done!")
}

163 | 164 | count_traits_per_variant <- function(gwas_variants, ss_files) { 165 | 166 | # Given a vector of variants and a named vector of summary statistics files 167 | # for traits to be clustered, output a vector of non-missing trait fractions 168 | # per variant 169 | 170 | print("Assessing variant missingness across traits...") 171 | write(gwas_variants, "all_snps_varids.tmp") 172 | 173 | rename_cols <- c(N_PH="N") 174 | 175 | variant_df_list <- lapply(1:length(ss_files), function(i) { 176 | print(sprintf("...Reading %s...", names(ss_files)[i])) 177 | 178 | headers <- as.character(fread(ss_files[i], nrows=1, 179 | data.table=F, stringsAsFactors=F, header=F)) 180 | 181 | if (endsWith(ss_files[i],".gz")) { 182 | df <- fread(cmd=sprintf("gzip -cd %s | fgrep -wf all_snps_varids.tmp ",ss_files[i]), 183 | header=F, 184 | col.names=headers, 185 | data.table=F, 186 | stringsAsFactors=F) %>% 187 | rename(any_of(rename_cols)) 188 | 189 | } else { 190 | df <- fread(cmd=sprintf("fgrep -wf all_snps_varids.tmp %s ",ss_files[i]), 191 | header=F, 192 | col.names=headers, 193 | data.table=F, 194 | stringsAsFactors=F) %>% 195 | rename(any_of(rename_cols)) 196 | 197 | } 198 | 
print(nrow(df)) 199 | return(df) 200 | }) 201 | 202 | # make dataframe of Ns 203 | df_N <- variant_df_list %>% 204 | setNames(names(ss_files)) %>% 205 | bind_rows(.id="trait") %>% 206 | select(trait, VAR_ID, N_PH) %>% 207 | pivot_wider(names_from="trait", values_from="N_PH") %>% 208 | data.frame() 209 | } 210 | 211 |

find_variants_needing_proxies <- function(gwas_variant_df, var_nonmissingness,
                                          rsID_map_file, missing_cutoff=0.8) {

  # Given a data frame containing GWAS variants (VAR_ID = CHR_POS_REF_ALT, plus
  # a PVALUE column) and a named vector of non-missing trait fractions per
  # variant, output a tibble (VAR_ID, rsID, PVALUE) of variants needing proxies
  # Criteria (any of the following):
  #   Strand-ambiguous (AT or GC)
  #   Multi-allelic
  #   Low-count (absent from var_nonmissingness, or fraction < missing_cutoff)
  # rsID_map_file should point to a whitespace-delimited file with columns
  # corresponding to VAR_ID and rsID
  # rsID is NA for variants that are not found in the map file

  print("Choosing variants in need of proxies...")

  gwas_variant_df <- gwas_variant_df %>%
    separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
             sep="_", remove=FALSE)

  need_proxies_varid <- with(gwas_variant_df, {
    strand_ambig <- VAR_ID[paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")]
    print(paste0("...", length(strand_ambig), " strand-ambiguous variants"))

    multi_allelic <- grep("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+,[ACGT]+$", VAR_ID, value=TRUE)  # i.e. ALT allele has a comma
    print(paste0("...", length(multi_allelic), " multi-allelic variants"))

    low_cnt <- VAR_ID[!(VAR_ID %in% names(var_nonmissingness)) |
                        var_nonmissingness[VAR_ID] < missing_cutoff]
    print(paste0("...", length(low_cnt), " variants with excessive missingness"))

    unique(c(strand_ambig, multi_allelic, low_cnt))
  })
  print(paste0("...", length(need_proxies_varid), " unique variants in total"))

  if (length(need_proxies_varid) == 0) {
    # Typed zero-row result with the same columns as the non-empty case, so
    # callers can still reference $rsID / $VAR_ID / $PVALUE
    return(tibble(VAR_ID=character(), rsID=character(), PVALUE=numeric()))
  }

  # grep the map file for the selected variants via a private temp file that
  # is removed however the function exits
  varid_file <- tempfile(fileext=".tmp")
  on.exit(unlink(varid_file), add=TRUE)
  write(need_proxies_varid, varid_file)
  varid_rsid_map <- fread(cmd=paste("grep -wFf", shQuote(varid_file),
                                    shQuote(path.expand(rsID_map_file))),
                          header=FALSE, col.names=c("VAR_ID", "rsID"),
                          data.table=FALSE, stringsAsFactors=FALSE)
  print(paste0("...", length(unique(varid_rsid_map$rsID)),
               " of these are mapped to rsIDs"))

  tibble(VAR_ID=need_proxies_varid) %>%
    left_join(varid_rsid_map, by="VAR_ID") %>%
    left_join(gwas_variant_df[,c("VAR_ID","PVALUE")], by="VAR_ID")
}


choose_proxies <- function(need_proxies,
                           rsID_map_file,
                           trait_ss_files,
                           pruned_variants,
                           method="TopLD",
                           LDlink_token=NULL,
                           topLD_path=NULL,
                           population="EUR",
                           frac_nonmissing_num=0.8,
                           r2_num=0.8) {

  # Given the variants needing proxies (data frame with VAR_ID, rsID, PVALUE
  # from find_variants_needing_proxies), query an LD source for candidate
  # proxies and output a data frame with the single best proxy per variant
  #   method: "TopLD" (runs the executable at topLD_path) or one of
  #           "LDlink"/"LDlinkR"/"LDproxy" (LDlinkR::LDproxy_batch, needs
  #           LDlink_token)
  #   rsID_map_file: whitespace-delimited VAR_ID / rsID map, used to turn
  #                  candidate proxy rsIDs back into VAR_IDs
  #   trait_ss_files: named vector of trait summary statistics files, passed
  #                   to count_traits_per_variant
  #   pruned_variants: only used to report how many variants needed no proxy
  # Criteria for eligibility:
  #   Not strand-ambiguous, not multi-allelic
  #   Trait fraction >= frac_nonmissing_num
  #   r^2 >= r2_num with the index variant
  # Choose based on first trait count, then r^2
  # Returns NULL when no candidate proxies were found at all
  # Writes intermediate files into the working directory
  # (need_proxies_rsIDs.tmp, outputLD*.txt, outputInfo.txt,
  # potential_proxies_rsid.tmp)

  print(paste("Num rows need_proxies:",nrow(need_proxies)))
  if (method %in% c("LDlink","LDlinkR","LDproxy")) {

    print("Using LDlinkR:LDproxy_batch to find proxies...")
    need_proxies <- need_proxies %>%
      separate(VAR_ID, into=c("CHR","POS","REF","ALT"), sep="_", remove=FALSE) %>%
      mutate(query_snp = paste0("chr", CHR, ":", POS)) %>%
      select(-c(CHR, POS, REF, ALT))
    need_proxies_snps <- need_proxies$query_snp

    LDlinkR::LDproxy_batch(need_proxies_snps,
                           pop = population,
                           r2d = "r2",
                           token = LDlink_token,
                           append = TRUE,
                           genome_build = "grch37")
    proxy_df <- read.table("./combined_query_snp_list_grch37.txt",sep = "\t",row.names = NULL) %>%
      filter(R2>=r2_num) %>%  # same >= rule as the eligibility filter below
      filter(!Coord %in% need_proxies_snps) %>%
      inner_join(need_proxies, by = "query_snp") %>%
      arrange(PVALUE) %>%
      filter(!duplicated(RS_Number)) %>%
      dplyr::select(rsID, proxy_rsID=RS_Number, r2=R2)
    need_proxies <- need_proxies %>%
      select(-c(query_snp))

  } else if (method=="TopLD") { # use TopLD
    print(sprintf("Using TopLD to find proxies for %s!", population))

    # Run TopLD on one batch of rsIDs. The previous LD output file is removed
    # first so that a failed run cannot be mistaken for fresh results.
    run_topld <- function(rsids, ld_out) {
      write(rsids, "need_proxies_rsIDs.tmp")
      unlink(ld_out)
      system(sprintf("%s -thres %.1f -pop %s -maf 0.01 -inFile need_proxies_rsIDs.tmp -outputLD %s -outputInfo outputInfo.txt", topLD_path, r2_num, population, ld_out))
    }

    if (nrow(need_proxies)<100) {
      run_topld(need_proxies$rsID, "outputLD.txt")
    } else { # need to split up
      print("Splitting proxy df into subsets (more than 100 SNPs)...")
      proxy_df_list <- split(need_proxies, (seq_len(nrow(need_proxies))-1) %/% 100)

      # Start from a clean combined file: results left over from an earlier
      # call must not be mixed into this one
      unlink("outputLD.txt")
      header_written <- FALSE
      print("Running TopLD for proxy df segments...")
      for (j in seq_along(proxy_df_list)) {
        print(sprintf("Querying LD subset %i/%i",j, length(proxy_df_list)))
        run_topld(proxy_df_list[[j]]$rsID, "outputLD_temp.txt")
        if (!file.exists("outputLD_temp.txt")) next
        chunk_lines <- readLines("outputLD_temp.txt")
        if (length(chunk_lines) == 0) next
        # Each chunk output starts with its own header row; keep only the
        # first so the combined file parses with proper column types
        if (header_written) chunk_lines <- chunk_lines[-1]
        write(chunk_lines, "outputLD.txt", append=header_written)
        header_written <- TRUE
      }
    }

    if (file.exists("outputLD.txt") && file.size("outputLD.txt") > 0) {
      proxy_df <- fread("outputLD.txt", stringsAsFactors = FALSE, data.table = FALSE) %>%
        select(rsID=rsID1, proxy_rsID=rsID2, r2=R2) %>%
        subset(proxy_rsID %like% "rs")
    } else {
      # TopLD produced nothing: continue with an empty candidate table
      proxy_df <- data.frame(rsID=character(), proxy_rsID=character(), r2=numeric())
    }
  }
  else {
    stop("Enter appropriate proxy search method!")
  }
  print(paste("No. possible proxies found:",nrow(proxy_df))) # proxy_df should have columns (rsID, proxy_rsID, r2)
  write(proxy_df$proxy_rsID, "potential_proxies_rsid.tmp")

  if (nrow(proxy_df)>0) {
    print("Creating proxy rsID map...")
    potential_proxies_map <- fread(cmd=paste("grep -wFf potential_proxies_rsid.tmp",
                                             shQuote(path.expand(rsID_map_file))),
                                   header=FALSE, col.names=c("proxy_VAR_ID", "proxy_rsID"),
                                   data.table=FALSE, stringsAsFactors=FALSE)
    print(head(potential_proxies_map))

    proxy_variants <- potential_proxies_map$proxy_VAR_ID

    # Per-trait sample sizes for every candidate proxy (one row per variant)
    df_Ns_rev <- count_traits_per_variant(
      proxy_variants,
      trait_ss_files
    ) %>%
      column_to_rownames("VAR_ID")
    df_Ns_rev[df_Ns_rev == 'NULL'] <- NA

    # Fraction of traits in which each candidate is present
    variant_counts_df <- data.frame(VAR_ID=rownames(df_Ns_rev),
                                    frac=rowSums(!is.na(df_Ns_rev))/length(trait_ss_files))

    proxy_missingness <- ifelse(
      proxy_variants %in% variant_counts_df$VAR_ID,
      variant_counts_df$frac[match(proxy_variants, variant_counts_df$VAR_ID)],  # If in counts data frame, take the non-missing fraction
      0  # If not in data frame, then the non-missing fraction is 0
    )

    proxy_missingness_df <- tibble(
      proxy_VAR_ID=proxy_variants,
      frac_nonmissing=proxy_missingness
    )

    final_proxy_df <- proxy_df %>%
      inner_join(potential_proxies_map, by="proxy_rsID") %>%
      separate(proxy_VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
               sep="_", remove=FALSE) %>%
      inner_join(proxy_missingness_df, by="proxy_VAR_ID") %>%
      filter(
        !(paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")),  # Not strand-ambiguous
        !grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+,[ACGT]+$", proxy_VAR_ID),  # Not multi-allelic
        frac_nonmissing >= frac_nonmissing_num,  # Sufficient fraction of traits non-missing
        r2 >= r2_num  # Sufficient LD with the proxied variant
      ) %>%
      group_by(rsID) %>%
      arrange(desc(frac_nonmissing),
              desc(r2),
              CHR) %>%  # Arbitrary sort for reproducibility in case of missingness + r2 ties
      dplyr::slice(1) %>%
      ungroup() %>%
      inner_join(need_proxies, by="rsID") %>%  # added to include orig VAR_ID in output
      data.frame()
  } else {
    final_proxy_df <- NULL
  }
  proxies_found <- final_proxy_df$rsID

  no_proxies_found <- setdiff(need_proxies$rsID, proxies_found)
  print(paste0("No proxies needed for ",
               length(setdiff(pruned_variants$VAR_ID, need_proxies$VAR_ID)),
               " variants."))
  print(paste0("Proxies found for ", length(proxies_found), " variants."))
  print(paste0("No adequate proxies found for ", length(no_proxies_found),
               " variants."))

  return(final_proxy_df)

}

414 | 415 | -------------------------------------------------------------------------------- /scripts/generate_varid_to_rsid_map_file.R: -------------------------------------------------------------------------------- 1 | 2 | CURRENT <- paste(getwd(),"/",sep="") 3 | 4 | # download all variant information for homo sapiens GRCh37 from Ensembl 5 | url <- "http://ftp.ensembl.org/pub/grch37/current/variation/vcf/homo_sapiens/1000GENOMES-phase_3.vcf.gz" 6 | destfile <- paste(CURRENT,"1000GENOMES-phase_3.vcf.gz",sep="") 7 | download.file(url, destfile) 8 | 9
| library(vcfR) 10 | vcf <- read.vcfR(destfile) 11 | vars = vcf@fix 12 | 13 | # format output file 14 | vars = as.data.frame(vars[,c("CHROM","POS","ID","REF","ALT")]) 15 | vars$VAR_ID = paste(vars$CHROM,vars$POS,vars$REF,vars$ALT,sep="_") 16 | vars = vars[,c("VAR_ID","ID")] 17 | names(vars) = c("VAR_ID","rsID") 18 | 19 | write.table(vars,file=paste(CURRENT,"VARID_rsID_map_file.txt",sep=""), append = F, quote = F, sep = "\t", 20 | eol = "\n", na = "NA", dec = ".", row.names = F, 21 | col.names = F, qmethod = c("escape", "double")) 22 | -------------------------------------------------------------------------------- /scripts/run_bNMF.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | 4 | ########################################################################## 5 | # Copyright (c) 2017, Broad Institute 6 | # Redistribution and use in source and binary forms, with or without 7 | # modification, are permitted provided that the following conditions are 8 | # met: 9 | # Redistributions of source code must retain the above copyright 10 | # notice, this list of conditions and the following disclaimer. 11 | # Redistributions in binary form must reproduce the above copyright 12 | # notice, this list of conditions and the following disclaimer in 13 | # the documentation and/or other materials provided with the 14 | # distribution. 15 | # Neither the name of the Broad Institute nor the names of its 16 | # contributors may be used to endorse or promote products derived 17 | # from this software without specific prior written permission. 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 22 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | ######################################################################### 30 | 31 | ###################################################################### 32 | # Bayesian NMF algorithms for clustering 33 | ###################################################################### 34 | # For implementation details see the paper 35 | # Udler MS, Kim J, von Grotthuss M, 36 | # Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018) 37 | # Type 2 diabetes genetic loci informed by multi-trait 38 | # associations point to disease mechanisms and 39 | # subtypes: A soft clustering analysis. PLoS Med 15 40 | # (9): e1002654. 41 | ########################### 42 | # For details on the original algorithms 43 | # see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence. 44 | # IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013). 
######################################################################

BayesNMF.L2EU <- function(
  V0, n.iter=10000, a0=10, tol=1e-7, K=15, K0=15, phi=1.0  #20, 10
) {

  # Bayesian NMF with half-normal priors for W and H
  # V0: input z-score matrix (variants x traits); columns summing to zero
  #     are dropped before fitting
  # n.iter: Number of iterations for parameter optimization
  # a0: Hyper-parameter for inverse gamma prior on ARD relevance weights
  # tol: Tolerance for convergence of fitting procedure
  # K: Number of clusters to be initialized (algorithm may drive some to zero)
  # K0: Used for setting b0 (lambda prior hyper-parameter) -- should be equal to K
  # phi: Scaling parameter
  # Returns an unnamed list: W, H, n.like, n.evid, n.lambda, n.error
  # W and H are initialized with runif(), so results depend on the RNG state

  eps <- 1.e-50
  del <- 1.0
  active_nodes <- colSums(V0) != 0
  # drop=FALSE keeps V0 a matrix even when a single column remains
  V0 <- V0[, active_nodes, drop=FALSE]
  V <- V0 - min(V0)  # shift so that all entries are non-negative
  Vmax <- max(V)
  N <- dim(V)[1]
  M <- dim(V)[2]

  W <- matrix(runif(N * K) * Vmax, ncol=K)
  H <- matrix(runif(M * K) * Vmax, ncol=M)

  V.ap <- W %*% H + eps

  phi <- sd(V)^2 * phi
  C <- (N + M) / 2 + a0 + 1
  b0 <- 3.14 * (a0 - 1) * mean(V) / (2 * K0)
  lambda.bound <- b0 / C
  lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
  lambda.cut <- lambda.bound * 1.5

  n.like <- list()
  n.evid <- list()
  n.error <- list()
  n.lambda <- list()
  n.lambda[[1]] <- lambda
  iter <- 2
  # Both operands are scalars, so the short-circuit && is the right operator
  while (del >= tol && iter < n.iter) {
    H <- H * (t(W) %*% V) /
      (t(W) %*% V.ap + phi * H * matrix(rep(1 / lambda, M), ncol=M) + eps)
    V.ap <- W %*% H + eps
    W <- W * (V %*% t(H)) /
      (V.ap %*% t(H) + phi * W * t(matrix(rep(1 / lambda, N), ncol=N)) + eps)
    V.ap <- W %*% H + eps
    lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
    # Convergence measure: largest relative change in any lambda
    del <- max(abs(lambda - n.lambda[[iter - 1]]) / n.lambda[[iter - 1]])
    like <- sum((V - V.ap)^2) / 2
    n.like[[iter]] <- like
    n.evid[[iter]] <- like + phi * sum((0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) /
                                         lambda + C * log(lambda))
    n.lambda[[iter]] <- lambda
    n.error[[iter]] <- sum((V - V.ap)^2)
    if (iter %% 100 == 0) {
      cat(iter, n.evid[[iter]], n.like[[iter]], n.error[[iter]], del,
          sum(colSums(W) != 0), sum(lambda >= lambda.cut), '\n')
    }
    iter <- iter + 1
  }
  return(list(
    W,  # Variant weight matrix (N x K)
    H,  # Trait weight matrix (K x M)
    n.like,  # List of reconstruction errors (sum of squared errors / 2) per iteration
    n.evid,  # List of negative log-likelihoods per iteration
    n.lambda,  # List of lambda vectors (shared weights for each of K clusters, some ~0) per iteration
    n.error  # List of reconstruction errors (sum of squared errors) per iteration
  ))
}


run_bNMF <- function(z_mat, n_reps=10, random_seed=1, K=20, K0=10, tolerance=1e-7) {

  # Given an input matrix as created by prep_z_matrix(), run the bNMF procedure
  # a series of times to generate results and evaluate cluster stability
  # Returns a list of length n_reps; each element is the BayesNMF.L2EU output
  # with names W, H, n.like, n.evid, n.lambda, n.error

  print(paste0("Running bNMF clustering procedure (", n_reps, " iterations)..."))
  print(sprintf("Using tolerance of %.2e!",tolerance))

  set.seed(random_seed)

  # seq_len() so that n_reps = 0 runs nothing (1:0 would run twice)
  lapply(seq_len(n_reps), function(r) {
    print(paste("ITERATION",r))
    res <- BayesNMF.L2EU(V0 = z_mat, K=K, K0=K0, tol=tolerance)
    names(res) <- c("W", "H", "n.like", "n.evid", "n.lambda", "n.error")
    res
  })
}


summarize_bNMF <- function(bnmf_reps, dir_save=NULL) {

  # Given output from bNMF (list of length N_iterations),
  # generate summary tables and plots
  # bnmf_reps: list of runs, each with elements W, H, n.lambda and n.evid
  # dir_save: output directory (created if needed); NULL writes to "./"
  # Writes run_summary.txt and, for each distinct K, the files
  # L2EU.W.mat.<K>.txt, L2EU.H.mat.<K>.txt, W_plot_K<K>.pdf, H_plot_K<K>.pdf
  # taken from the run with the lowest evidence value for that K

  make_run_summary <- function(reps) {

    # Given a list of bNMF iteration outputs, summarize the K choices and associated likelihoods across runs

    run_summary <- map_dfr(seq_along(reps), function(i) {
      res <- reps[[i]]
      final_lambdas <- res$n.lambda[[length(res$n.lambda)]]
      tibble(
        run=i,
        K=sum(final_lambdas > min(final_lambdas)),  # Assume that lambdas equal to the minimum lambda are ~ 0
        evid=res$n.evid[[length(res$n.evid)]]  # Evidence = -log_likelihood
      )
    }) %>%
      arrange(evid)

    unique.K <- table(run_summary$K)
    MAP.K.run <- vapply(names(unique.K), function(k) {  # bNMF run index with the maximum posterior for given K
      tmp <- run_summary[run_summary$K == k, ]
      tmp$run[which.min(tmp$evid)]
    }, numeric(1))

    list(run_tbl=run_summary, unique.K=unique.K, MAP.K.run=MAP.K.run)
  }

  if (!is.null(dir_save)) {
    # showWarnings=FALSE: re-running into an existing directory is expected
    dir.create(file.path(dir_save), showWarnings=FALSE, recursive=TRUE)
    dir_save <- paste0(dir_save,"/")
  } else {
    dir_save <- "./"
  }

  print("Summarizing bNMF results...")

  print("Writing table of chosen K across iterations...")
  run_summary <- make_run_summary(bnmf_reps)
  write_tsv(run_summary$run_tbl, paste0(dir_save,"run_summary.txt"))

  print("Plotting variant and trait contributions...")
  silent <- sapply(names(run_summary$unique.K), function(k) {  # Create heatmaps for MAP iteration for each K
    res <- bnmf_reps[[run_summary$MAP.K.run[as.character(k)]]]
    # drop=FALSE keeps W and H as matrices when a single cluster survives
    W <- res$W[, colSums(res$W) != 0, drop=FALSE]  # feature-cluster association matrix
    H <- res$H[rowSums(res$H) != 0, , drop=FALSE]  # cluster-gene association matrix
    W[W < 1.e-10] <- 0
    H[H < 1.e-10] <- 0

    W0 <- data.frame(W)
    W0[, "variant"] <- rownames(W)
    H0 <- data.frame(H)
    H0[, "cluster"] <- rownames(H)

    write_tsv(W0, file=paste0(dir_save,"L2EU.W.mat.", k, ".txt"))
    write_tsv(H0, file=paste0(dir_save,"L2EU.H.mat.", k, ".txt"))

    # Setup for plotting
    scale0 <- 0.8
    scale <- 1
    color.axis <- "black"
    .theme_ss <- theme_bw(base_size=12) +
      theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8 * scale,
                                       family="mono", face='bold', color=color.axis),
            axis.text.y = element_text(hjust = 0.5,size=12 * scale, family="mono",face='bold',color=color.axis),
            axis.text = element_text(size = 12 * scale, family = "mono",color=color.axis),
            axis.title=element_text(face="bold", size=12 * scale,color="black"),
            plot.title=element_text(face="bold", size=12 * scale))

    # Plot W matrix (feature activities); variants ordered by hierarchical clustering
    W_hc <- hclust(dist(W, method="euclidean"), method="ward.D")
    W_variant.ordering <- W_hc$labels[W_hc$order]
    W_plt_df <- W %>%
      as.data.frame() %>%
      rownames_to_column(var="variant") %>%
      pivot_longer(-variant, names_to="cluster", values_to="activity") %>%
      mutate(variant=factor(variant, levels=W_variant.ordering),
             cluster=factor(cluster,
                            levels=paste0("V", seq_len(ncol(W)))))
    W_plt <- ggplot(W_plt_df, aes(x=variant, y=cluster, fill=activity)) +
      geom_tile() +
      scale_fill_gradient2(low="white", high ="black", name="Activity") +
      .theme_ss +
      ggtitle(paste0("Variant Association to Clusters (k=", k, ")")) +
      ylab("Cluster") + xlab("Variant") +
      theme(axis.title.x = element_text(face="bold",colour="black", size=12 * scale0)) +
      theme(axis.title.y = element_text(face="bold",colour="black", size=12 * scale0)) +
      theme(legend.position="right") +
      theme(legend.key.size = unit(0.5, "cm"))
    ggsave(paste0(dir_save,"W_plot_K", k, ".pdf"), plot=W_plt)

    # Plot H matrix (trait activities); traits ordered by hierarchical clustering
    H_hc <- hclust(dist(t(H), method="euclidean"), method="ward.D")
    H_trait.ordering <- H_hc$labels[H_hc$order]
    H_plt_df <- t(H) %>%
      as.data.frame() %>%
      rownames_to_column(var="trait") %>%
      pivot_longer(-trait, names_to="cluster", values_to="activity") %>%
      mutate(cluster=factor(cluster, levels=paste0("V", seq_len(nrow(H)))),
             trait=factor(trait, levels=H_trait.ordering))
    H_plt <- ggplot(H_plt_df, aes(x=trait, y=cluster, fill=activity)) +
      geom_tile() +
      scale_fill_gradient2(low="white", high ="black", name="Activity") +
      .theme_ss +
      ggtitle(paste0("Trait Association to Clusters (k=", k, ")")) +  # this is the trait plot
      ylab("Cluster") + xlab("Trait") +
      theme(axis.title.x=element_text(face="bold", colour="black", size=12 * scale0)) +
      theme(axis.title.y=element_text(face="bold", colour="black", size=12 * scale0)) +
      theme(legend.position="right") +
      theme(legend.key.size = unit(0.5, "cm"))
    ggsave(paste0(dir_save,"H_plot_K", k, ".pdf"), plot=H_plt)
  })
}

270 | -------------------------------------------------------------------------------- /scripts/run_bNMF_2025.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(furrr) 3 | library(progressr) 4 | library(rtracklayer) 5 | library(vroom) 6 | ########################################################################## 7 | # Copyright (c) 2017, Broad Institute 8 | # Redistribution and use in source and binary forms, with or without 9 | # modification, are permitted provided that the following conditions are 10 | # met: 11 | # Redistributions of source code must retain the above copyright 12 | # notice, this list of conditions 
and the following disclaimer. 13 | # Redistributions in binary form must reproduce the above copyright 14 | # notice, this list of conditions and the following disclaimer in 15 | # the documentation and/or other materials provided with the 16 | # distribution. 17 | # Neither the name of the Broad Institute nor the names of its 18 | # contributors may be used to endorse or promote products derived 19 | # from this software without specific prior written permission. 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | ######################################################################### 32 | 33 | ###################################################################### 34 | # Bayesian NMF algorithms for clustering 35 | ###################################################################### 36 | # For implementation details see the paper 37 | # Udler MS, Kim J, von Grotthuss M, 38 | # Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018) 39 | # Type 2 diabetes genetic loci informed by multi-trait 40 | # associations point to disease mechanisms and 41 | # subtypes: A soft clustering analysis. PLoS Med 15 42 | # (9): e1002654. 
###########################
# For details on the original algorithms
# see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence.
# IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013).
######################################################################


BayesNMF.L2EU <- function(
  V0, n.iter=10000, a0=10, tol=1e-7, K=15, K0=15, phi=1.0,
  window_size=25, min_iter=100  # Parameters for error-stability early stopping
) {

  # Bayesian NMF with half-normal priors for W and H
  # V0: input matrix; columns summing to zero are dropped before fitting
  # n.iter: Upper bound on the iteration counter
  # a0: Hyper-parameter for inverse gamma prior on ARD relevance weights
  # tol: Tolerance for convergence (relative lambda change, and relative
  #      error drift in the early-stopping check)
  # K: Number of clusters to be initialized (algorithm may drive some to zero)
  # K0: Used for setting b0 (lambda prior hyper-parameter)
  # phi: Scaling parameter
  # window_size: Number of most recent errors inspected by the early-stop check
  # min_iter: Early stopping is only considered after this many iterations
  # Returns a named list: W, H, n.like, n.evid, n.lambda, n.error, n.active
  # (per-iteration lists), iterations (index of the last iteration executed)
  # and converged (TRUE if either stopping criterion was met)
  # W and H are initialized with runif(), so results depend on the RNG state

  eps <- 1.e-50
  del <- 1.0
  active_nodes <- colSums(V0) != 0
  # drop=FALSE keeps V0 a matrix even when a single column remains
  V0 <- V0[, active_nodes, drop=FALSE]
  V <- V0 - min(V0)  # shift so that all entries are non-negative
  Vmax <- max(V)
  N <- dim(V)[1]
  M <- dim(V)[2]

  W <- matrix(runif(N * K) * Vmax, ncol=K)
  H <- matrix(runif(M * K) * Vmax, ncol=M)
  # Buffers refilled with 1/lambda on every iteration
  lambda_matrix_M <- matrix(0, nrow=K, ncol=M)
  lambda_matrix_N <- matrix(0, nrow=K, ncol=N)

  V.ap <- W %*% H + eps

  phi <- sd(V)^2 * phi
  C <- (N + M) / 2 + a0 + 1
  b0 <- 3.14 * (a0 - 1) * mean(V) / (2 * K0)
  lambda.bound <- b0 / C
  lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
  lambda.cut <- lambda.bound * 1.5

  n.like <- list()
  n.evid <- list()
  n.error <- list()
  n.lambda <- list()
  n.active <- list()
  n.lambda[[1]] <- lambda

  # TRUE when the mean step-to-step change of the last `window_size` errors,
  # relative to their mean, is below tol
  check_convergence <- function(errors, window_size) {
    if (length(errors) < window_size) return(FALSE)
    recent <- tail(unlist(errors), window_size)
    rel_change <- abs(mean(diff(recent)) / mean(recent))
    return(rel_change < tol)
  }

  iter <- 2
  last_iter <- 1  # index of the last iteration actually executed
  error_converged <- FALSE

  # Both operands are scalars, so the short-circuit && is the right operator
  while (del >= tol && iter < n.iter) {
    lambda_matrix_M[] <- rep(1/lambda, M)
    lambda_matrix_N[] <- rep(1/lambda, N)

    # Update H
    H <- H * (t(W) %*% V) / (t(W) %*% V.ap + phi * H * lambda_matrix_M + eps)
    V.ap <- W %*% H + eps

    # Update W
    W <- W * (V %*% t(H)) / (V.ap %*% t(H) + phi * W * t(lambda_matrix_N) + eps)
    V.ap <- W %*% H + eps

    # Update lambda; non-finite entries are reset to a small constant
    lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
    lambda[is.na(lambda)] <- 1e-6
    lambda[is.nan(lambda)] <- 1e-6
    lambda[lambda == Inf] <- 1e-6

    # Largest relative lambda change, guarded against non-finite history
    del <- if (!is.null(n.lambda[[iter - 1]]) && all(is.finite(n.lambda[[iter - 1]]))) {
      max(abs(lambda - n.lambda[[iter - 1]]) / (n.lambda[[iter - 1]] + 1e-10))
    } else {
      Inf
    }

    like <- sum((V - V.ap)^2) / 2
    n.like[[iter]] <- like
    n.evid[[iter]] <- like + phi * sum((0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) /
                                         lambda + C * log(lambda))
    n.error[[iter]] <- sum((V - V.ap)^2)
    n.lambda[[iter]] <- lambda
    n.active[[iter]] <- sum(lambda >= lambda.cut)
    last_iter <- iter

    # Progress monitoring
    if (iter %% 100 == 0) {
      cat(sprintf("Iter %d: Error=%g, Delta=%g, Active=%d, Evidence=%g\n",
                  iter, n.error[[iter]], del, n.active[[iter]], n.evid[[iter]]))
    }

    # Early stopping check (but only after minimum iterations)
    if (iter > min_iter && iter %% 20 == 0) {
      if (check_convergence(n.error, window_size)) {
        cat("Converged based on error stability\n")
        error_converged <- TRUE
        break
      }
    }

    iter <- iter + 1
  }

  # Return results with convergence metrics
  return(list(
    W = W,
    H = H,
    n.like = n.like,
    n.evid = n.evid,
    n.lambda = n.lambda,
    n.error = n.error,
    n.active = n.active,
    iterations = last_iter,
    converged = (del < tol) || error_converged
  ))
}

# Function to test multiple phi values
# For each phi, BayesNMF.L2EU is run n_reps times and the run with the lowest
# final reconstruction error is kept. Extra arguments in ... are forwarded to
# BayesNMF.L2EU. Returns a list of best runs named by as.character(phi).
test_phi_values <- function(V0, phi_values = c(1.0, 2.0, 5.0, 10.0), n_reps = 10, ...) {
  results <- list()

  for (phi in phi_values) {
    cat(sprintf("\nTesting phi = %g\n", phi))

    best_solution <- NULL
    best_error <- Inf

    for (i in seq_len(n_reps)) {
      result <- BayesNMF.L2EU(V0, phi = phi, ...)

      # n.error is a per-iteration list, so compare on its last entry only;
      # a run without a usable error never beats a run that has one
      errs <- unlist(result[["n.error"]])
      final_error <- if (length(errs) > 0) errs[length(errs)] else Inf
      if (!is.finite(final_error)) final_error <- Inf

      if (is.null(best_solution) || final_error < best_error) {
        best_solution <- result
        best_error <- final_error
      }
    }

    results[[as.character(phi)]] <- best_solution
  }

  return(results)
}
# Example usage:
# phi_test_results <- test_phi_values(V0, phi_values=c(1.0, 2.0, 5.0, 10.0))
# print(phi_test_results[["1"]]$converged)



run_bNMF <- function(z_mat, n_reps=10, random_seed=1, K=20, K0=10, tolerance=1e-7, phi=1) {

  # Given an input matrix as created by prep_z_matrix(), run the bNMF procedure
  # a series of times to generate results and evaluate cluster stability
  # Returns a list of length n_reps holding the named BayesNMF.L2EU outputs

  print(paste0("Running bNMF clustering procedure (", n_reps, " iterations)..."))
  print(sprintf("Using tolerance of %.2e!",tolerance))

  set.seed(random_seed)

  # seq_len() so that n_reps = 0 runs nothing (1:0 would run twice)
  lapply(seq_len(n_reps), function(r) {
    print(paste("ITERATION",r))
    BayesNMF.L2EU(V0 = z_mat, K=K, K0=K0, tol=tolerance, phi=phi)
  })
}

217 | 218 | 219 | # Activate a global handler for progress bars 220 | 221 | run_bNMF_parallel <- function(z_mat, n_reps = 10, random_seed = 1, K = 20, K0 = 10, tolerance = 1e-7, phi=1) { 222 | 223 | print(paste0("Running bNMF clustering procedure in 
parallel! (", n_reps, " iterations)...")) 224 | print(sprintf("Using tolerance of %.2e!", tolerance)) 225 | 226 | # Run each repetition in parallel 227 | bnmf_reps <- future_map(1:n_reps, function(rep) { 228 | 229 | # Logging progress 230 | log_message <- paste("ITERATION", rep) 231 | cat(log_message, "\n") # Print to console to verify 232 | 233 | # Run the Bayesian NMF function and store the result 234 | res <- BayesNMF.L2EU(V0 = z_mat, K = K, K0 = K0, tol = tolerance, phi=phi) 235 | # names(res) <- c("W", "H", "n.like", "n.evid", "n.lambda", "n.error") 236 | 237 | return(res) # Explicitly return `res` from the function 238 | }, 239 | .options = furrr_options(seed = random_seed) # Set the seed to a specific number 240 | ) 241 | 242 | return(bnmf_reps) 243 | } 244 | 245 | summarize_bNMF <- function(bnmf_reps, dir_save=NULL) { 246 | 247 | # Given output from bNMF (list of length N_iterations), 248 | # generate summary tables and plots 249 | 250 | make_run_summary <- function(reps) { 251 | 252 | # Given a list of bNMF iteration outputs, summarize the K choices and associated likelihoods across runs 253 | 254 | run_summary <- map_dfr(1:length(reps), function(i) { 255 | res <- reps[[i]] 256 | final_lambdas <- res$n.lambda[[length(res$n.lambda)]] 257 | tibble( 258 | run=i, 259 | K=sum(final_lambdas > min(final_lambdas)), # Assume that lambdas equal to the minimum lambda are ~ 0 260 | evid=res$n.evid[[length(res$n.evid)]] # Evidence = -log_likelihood 261 | ) 262 | }) %>% 263 | arrange(evid) 264 | 265 | unique.K <- table(run_summary$K) 266 | n.K <- length(unique.K) # Number of distinct K 267 | MAP.K.run <- sapply(names(unique.K), function(k) { # bNMF run index with the maximum posterior for given K 268 | tmp <- run_summary[run_summary$K == k, ] 269 | tmp$run[which.min(tmp$evid)] 270 | }) 271 | 272 | list(run_tbl=run_summary, unique.K=unique.K, MAP.K.run=MAP.K.run) 273 | } 274 | if (!is.null(dir_save)) { 275 | dir.create(file.path(dir_save)) 276 | 
dir_save=paste0(dir_save,"/") 277 | } else {dir_save="./"} 278 | 279 | print("Summarizing bNMF results...") 280 | 281 | print("Writing table of chosen K across iterations...") 282 | run_summary <- make_run_summary(bnmf_reps) 283 | write_tsv(run_summary$run_tbl, paste0(dir_save,"run_summary.txt")) 284 | 285 | n.K <- length(run_summary$unique.K) # Number of distinct K 286 | 287 | get_W <- function(clustering) { 288 | W_raw <- clustering$W 289 | W_raw[, colSums(W_raw > 1e-10) > 0] 290 | } 291 | 292 | get_H <- function(clustering) { 293 | H_raw <- clustering$H 294 | H_raw[rowSums(H_raw > 1e-10) > 0, ] 295 | } 296 | 297 | print("Plotting variant and trait contributions...") 298 | silent <- sapply(names(run_summary$unique.K), function(k) { # Create heatmaps for MAP iteration for each K 299 | res <- bnmf_reps[[run_summary$MAP.K.run[as.character(k)]]] 300 | W <- res$W[, colSums(res$W) != 0] # feature-cluster association matrix 301 | H <- res$H[rowSums(res$H) != 0, ] # cluster-gene association matrix 302 | W[W < 1.e-10] <- 0 303 | H[H < 1.e-10] <- 0 304 | 305 | W0 <- data.frame(W) 306 | W0[, "variant"] <- rownames(W) 307 | H0 <- data.frame(H) 308 | H0[, "cluster"] <- rownames(H) 309 | 310 | write_tsv(W0, file=paste0(dir_save,"L2EU.W.mat.", k, ".txt")) 311 | write_tsv(H0, file=paste0(dir_save,"L2EU.H.mat.", k, ".txt")) 312 | 313 | mat.reconstructed <- W %*% H # reconstructed matrix == approximation for the input matrix 314 | 315 | # Setup for plotting 316 | scale0 <- 0.8 317 | scale <- 1 318 | g.ordering <- paste("G", seq(1:ncol(W)), sep="") 319 | color.axis <- "black" 320 | .theme_ss <- theme_bw(base_size=12) + 321 | theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8 * scale, 322 | family="mono", face='bold', color=color.axis), 323 | axis.text.y = element_text(hjust = 0.5,size=12 * scale, family="mono",face='bold',color=color.axis), 324 | axis.text = element_text(size = 12 * scale, family = "mono",color=color.axis), 325 | axis.title=element_text(face="bold", 
size=12 * scale,color="black"), 326 | plot.title=element_text(face="bold", size=12 * scale)) 327 | 328 | # Plot W matrix (feature activities) 329 | W_hc <- hclust(dist(W, method="euclidean"), method="ward.D") 330 | W_variant.ordering <- W_hc$labels[W_hc$order] 331 | W_plt_df <- W %>% 332 | as.data.frame() %>% 333 | rownames_to_column(var="variant") %>% 334 | gather(key="cluster", value="activity", -variant) %>% 335 | mutate(variant=factor(variant, levels=W_variant.ordering), 336 | cluster=factor(cluster, 337 | levels=paste0("V", 1:ncol(W)))) 338 | W_plt <- ggplot(W_plt_df, aes(x=variant, y=cluster, fill=activity)) + 339 | geom_tile() + 340 | scale_fill_gradient2(low="white", high ="black", name=paste("Activity", sep="")) + 341 | #p = p + scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0)) 342 | .theme_ss + 343 | ggtitle(paste0("Variant Association to Clusters (k=", k, ")")) + 344 | ylab("Cluster") + xlab("Variant") + 345 | theme(axis.title.x = element_text(face="bold",colour="black", size=12 * scale0)) + 346 | theme(axis.title.y = element_text(face="bold",colour="black", size=12 * scale0)) + 347 | theme(legend.position="right") + 348 | theme(legend.key.size = unit(0.5, "cm")) 349 | ggsave(paste0(dir_save,"W_plot_K", k, ".pdf"), plot=W_plt) 350 | 351 | H_hc <- hclust(dist(t(H), method="euclidean"), method="ward.D") 352 | H_trait.ordering <- H_hc$labels[H_hc$order] 353 | H_plt_df <- t(H) %>% 354 | as.data.frame() %>% 355 | rownames_to_column(var="trait") %>% 356 | gather(key="cluster", value="activity", -trait) %>% 357 | mutate(cluster=factor(cluster, levels=paste0("V", 1:nrow(H))), 358 | trait=factor(trait, levels=H_trait.ordering)) 359 | H_plt <- ggplot(H_plt_df, aes(x=trait, y=cluster, fill=activity)) + 360 | geom_tile() + 361 | scale_fill_gradient2(low="white", high ="black", name=paste("Activity", sep="")) + 362 | #p = p + 
scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0)) 363 | .theme_ss + 364 | ggtitle(paste0("Variant Association to Clusters (k=", k, ")")) + 365 | ylab("Cluster") + xlab("Trait") + 366 | theme(axis.title.x=element_text(face="bold", colour="black", size=12 * scale0)) + 367 | theme(axis.title.y=element_text(face="bold", colour="black", size=12 * scale0)) + 368 | theme(legend.position="right") + 369 | theme(legend.key.size = unit(0.5, "cm")) 370 | ggsave(paste0(dir_save,"H_plot_K", k, ".pdf"), plot=H_plt) 371 | }) 372 | } 373 | 374 | --------------------------------------------------------------------------------