├── .gitattributes
├── .gitignore
├── README.md
├── Smith_Deutsch_NatureMedicine_2024
    ├── AFR
    │   ├── hg38_score_info
    │   │   ├── Beta Cell 1.csv
    │   │   ├── Beta Cell 2.csv
    │   │   ├── Blood Markers.csv
    │   │   ├── Hyper Insulin.csv
    │   │   ├── Lipodystrophy.csv
    │   │   ├── Obesity.csv
    │   │   ├── Proinsulin.csv
    │   │   └── Total_GRS.csv
    │   └── liftover_map.txt
    ├── AMR
    │   ├── hg38_score_info
    │   │   ├── ASAT Pos.csv
    │   │   ├── Beta Cell 2.csv
    │   │   ├── Lipodystrophy.csv
    │   │   ├── Obesity.csv
    │   │   ├── Total_GRS.csv
    │   │   └── Unknown.csv
    │   └── liftover_map.txt
    ├── EAS
    │   ├── hg38_score_info
    │   │   ├── ALP Neg.csv
    │   │   ├── Beta Cell 1.csv
    │   │   ├── Beta Cell 2.csv
    │   │   ├── Lipodystrophy 1.csv
    │   │   ├── Lipodystrophy 2.csv
    │   │   ├── Liver Lipid.csv
    │   │   ├── Obesity.csv
    │   │   ├── Proinsulin.csv
    │   │   ├── Total_GRS.csv
    │   │   └── Unknown.csv
    │   └── liftover_map.txt
    ├── EUR
    │   ├── hg38_score_info
    │   │   ├── ALP Neg.csv
    │   │   ├── ASAT Pos.csv
    │   │   ├── Beta Cell 1.csv
    │   │   ├── Beta Cell 2.csv
    │   │   ├── Bilirubin.csv
    │   │   ├── Cholesterol.csv
    │   │   ├── Hyper Insulin.csv
    │   │   ├── Lipodystrophy 1.csv
    │   │   ├── Lipodystrophy 2.csv
    │   │   ├── Liver-Lipid.csv
    │   │   ├── Obesity.csv
    │   │   ├── Proinsulin.csv
    │   │   ├── SHBG-LpA.csv
    │   │   ├── Total_GRS.csv
    │   │   └── VAT Neg.csv
    │   └── liftover_map.txt
    └── MultiAncestry
    │   ├── hg38_score_info
    │       ├── ALP Neg.csv
    │       ├── Beta Cell 1.csv
    │       ├── Beta Cell 2.csv
    │       ├── Bilirubin.csv
    │       ├── Cholesterol.csv
    │       ├── Hyper Insulin.csv
    │       ├── Lipodystrophy 1.csv
    │       ├── Lipodystrophy 2.csv
    │       ├── Liver-Lipid.csv
    │       ├── Obesity.csv
    │       ├── Proinsulin.csv
    │       ├── SHBG-LpA.csv
    │       └── Total_GRS.csv
    │   └── liftover_map.txt
├── doc
    └── Variant clustering preprocessing pipeline_plan_KW.docx
├── example_data
    ├── clustering_data_sources_example.xlsx
    ├── my_GWAS
    │   ├── ALP_sample.txt
    │   ├── ALT_sample.txt
    │   ├── AST_sample.txt
    │   ├── Adiponectin_sample.txt
    │   ├── Albumin_sample.txt
    │   ├── BFP_sample.txt
    │   ├── BMI_sample.txt
    │   ├── C_reactive_protein_sample.txt
    │   ├── Cholesterol_sample.txt
    │   ├── FGadjBMI_sample.txt
    │   ├── FIadjBMI_sample.txt
    │   ├── GGT_sample.txt
    │   ├── Glucose_sample.txt
    │   ├── HDL_sample.txt
    │   ├── HOMAB_sample.txt
    │   ├── HOMAIR_sample.txt
    │   ├── Haemoglobin_concentration_sample.txt
    │   ├── HbA1c.adjBMI_sample.txt
    │   ├── HbA1c_sample.txt
    │   ├── LipoproteinA_sample.txt
    │   ├── Lymphocyte_count_sample.txt
    │   ├── Monocyte_count_sample.txt
    │   ├── Neutrophil_count_sample.txt
    │   ├── Proins_sample.txt
    │   ├── RBC_sample.txt
    │   ├── SHBG_sample.txt
    │   ├── T2D_GWAS_1_sample.txt
    │   ├── T2D_GWAS_2_sample.txt
    │   ├── T2D_GWAS_3_sample.txt
    │   ├── TG_sample.txt
    │   ├── Urate_sample.txt
    │   ├── Urea_sample.txt
    │   ├── VitaminD_sample.txt
    │   ├── WBC_sample.txt
    │   ├── WHR_female_sample.txt
    │   └── WHR_male_sample.txt
    └── rsID_map_example.txt
└── scripts
    ├── .Rhistory
    ├── archive
        ├── choose_variants_2021.R
        ├── compiled_code.txt
        ├── get_data_for_ALL_variants_from_ALL_datasets_findproxy_readall_new_BMI_edit.pl
        ├── gwas_variant_selection.R
        ├── main.BayesNMF.script_to_Jaeyoon_edit_claire_T2D.R
        ├── prep_bNMF_2021.R
        ├── process_traits.R
        ├── proximal_preprocessing.R
        ├── run_bNMF_2021.R
        └── test_pipeline_2021.R
    ├── bNMF_example_pipeline.R
    ├── choose_variants.R
    ├── choose_variants_2025.R
    ├── format_bNMF_results.Rmd
    ├── generate_varid_to_rsid_map_file.R
    ├── post_bNMF_2025.R
    ├── prep_bNMF.R
    ├── prep_bNMF_2025.R
    ├── run_bNMF.R
    └── run_bNMF_2025.R


/.gitattributes:
--------------------------------------------------------------------------------
1 | list_VARID_rsID_updated.txt.gz filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # High-level directories
2 | data/*
3 | opt/*
4 | cache/*
5 | 
6 | *.DS_store
7 | pipeline_walkthrough/*


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## T2D Multi-ancestry Partitioned Polygenic Scores
 2 | Cluster weights are available from the [Smith, Deutsch et al Nature Medicine 2024](https://www.nature.com/articles/s41591-024-02865-3) paper in the "Smith_Deutsch_NatureMedicine_2024" folder. Partitioned polygenic scores (pPS) can be generated using the enclosed variant cluster weights. We have included weights for ancestry-specific and multi-ancestry clusters:
 3 | 
 4 | * In the weights files, the "Risk_Allele" column denotes the T2D risk-increasing allele.
 5 | * When generating the pPS, all genotypes should be aligned to this allele! 
 6 | * Weights are provided in hg38 coordinates; a liftover map (hg19 to hg38) is also included in each subfolder.
 7 | 
 8 | ## Pipeline for GWAS clustering using Bayesian non-negative matrix factorization (bNMF)
 9 | 
10 | The bNMF procedure, as applied here, is used to detect clusters of GWAS variants for some outcome of interest based on the associations of those variants with a set of additional traits. This pipeline includes pre-processing steps (such as quality control of variants and traits and the choice of proxy variants), preparation of the z-score matrix, clustering, and summarization of results.
11 | 
12 | **Important:** The current pipeline makes certain assumptions and uses some hard-coded filenames, including:
13 | 
14 | * "VAR_ID"s for GWAS and trait-specific summary statistics are in a specific format (CHR_POS_REF_ALT), with alleles aligned in a consistent way across traits (i.e. variant matching is performed using a simple string match).
15 | * The variant reference file linking VAR_IDs to rsIDs is based on the VAR_ID format above, and points to a file available on the Broad Institute compute cluster. The VAR_ID to rsID pairs can be generated using the script **"generate_varid_to_rsid_map_file.R"**
16 | 
17 | ### Requirements
18 | The pipeline relies on APIs for performing several linkage disequilibrium (LD)-based operations, including LD-pruning and proxy variant searches. The user must either acquire a personal token for LDlinkR or download the topLD API, depending on how they wish to perform these steps:
19 | 
20 | * **LDlinkR**: request token from https://ldlink.nih.gov/?tab=apiaccess
21 |   * LDlinkR is used within the ld_prune and choose_proxies functions. Alternatives include performing position-based clumping using the snp_clump function, and using the topLD API option for choose_proxies
22 | * **topLD**: download API to your project folder (instructions here: https://github.com/linnabrown/topld_api)
23 |   * Note that topLD is currently not compatible with macOS. 
24 | 
25 | ### 1. Choose the set of variants to be clustered (choose_variants.R)
26 | **ld_prune**: LD-based pruning of the input variant set  
27 | **snp_clip**: Alternative to ld_prune; uses chromosomal position to prune variant set  
28 | **count_traits_per_variant**: Assess the fraction of traits missing each variant of interest  
29 | **find_variants_needing_proxies**: Determine which variants need proxies (allele considerations, missingness, etc.)  
30 | **choose_proxies**: Select proxies for the necessary variants and output the final variant set for clustering
31 | 
32 | ### 2. Prepare the final z-score matrix (prep_bNMF.R)
33 | **fetch_summary_stats**: Retrieve z-scores and sample sizes across all traits for the final variant set  
34 | **fill_missing_zscores**: Fill missing values in the variant-trait association matrix  
35 | **prep_z_matrix**: Final trait filters, sample size adjustment, and creation of non-negative input matrix (N (variants) x 2M (traits), with separate columns for positive trait associations (zero otherwise) and negative trait associations)
36 | 
37 | ### 3. Run bNMF and summarize the results (run_bNMF.R)
38 | **run_bNMF**: Run the bNMF procedure (over multiple iterations)  
39 | **summarize_bNMF**: Summarize the results and create heatmaps for visualization
40 | 
41 | ### 4. Further summarize and visualize results
42 | **format_bNMF_results.Rmd**: Generates an HTML file that includes several plots and tables summarizing the cluster results
43 | 
44 |   * also included is a calculation for finding an optimal weight threshold for defining cluster-defining variants and traits
45 | 
46 | ### Outputs
47 | Most steps of the pipeline will print messages with details of the procedure. In addition, the following outputs will be written to the working directory.  
48 | 
49 | * no_proxies_found.txt: A list of variants that were excluded and for which no acceptable proxies were found.  
50 | * run_summary.txt: A table listing the chosen K (# of clusters) and negative log-likelihood for each bNMF iteration.
51 | * z_score_mat.rds: A binary R object containing the N x M z-score matrix after all preprocessing steps.
52 | * L2EU.W.mat.K[]: The matrix of feature contributions to clusters for the K in question (one per K chosen in at least one iteration).
53 | * L2EU.H.mat.K[]: The matrix of variant contributions to clusters for the K in question.
54 | * W_plot_K[].pdf: A heatmap displaying feature contributions to clusters for the K in question.
55 | * H_plot_K[].pdf: A heatmap displaying variant contributions to clusters for the K in question.
56 | 
57 | ### Contributors
58 | * Claire Kim (design and code)
59 | * Kenny Westerman (design and code)
60 | * Kirk Smith (code)
61 | * Jaegil Kim (code)
62 | * Marcin von Grotthuss (code)
63 | * Miriam Udler (design and supervision)
64 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Beta Cell 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 3,197541732,C,T,rs9325395,T,1.14049703135893
 3 | 5,31436933,C,T,rs493760,C,1.28649231683287
 4 | 6,20686342,C,A,rs7766070,A,3.41321775882255
 5 | 8,41651740,G,A,rs12549902,A,4.15965995628624
 6 | 10,112992744,T,C,rs17747324,C,1.10483442041379
 7 | 10,112998590,C,T,rs7903146,T,1.79073378748542
 8 | 11,2175119,A,G,rs4930044,A,1.54513936560311
 9 | 12,41574480,G,A,rs10880103,G,1.34707283985582
10 | 12,75039749,A,G,rs10879874,A,1.23803020813872
11 | 12,80592093,G,T,rs1528287,T,1.30853148600033
12 | 13,31135669,A,C,rs9538367,C,1.22973026426221
13 | 17,77902588,A,G,rs9899517,A,1.70309057170068
14 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Beta Cell 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 3,123346931,A,G,rs11708067,A,2.85678418025812
 3 | 3,185816694,G,A,rs9859406,A,1.09532761532444
 4 | 6,20686342,C,A,rs7766070,A,1.65025246141402
 5 | 7,15019701,T,C,rs12540947,T,1.08718444457625
 6 | 7,15025271,T,G,rs4721401,G,1.51636423495046
 7 | 7,157233358,A,G,rs1182443,G,0.982566657994938
 8 | 7,28140937,T,C,rs864745,T,1.03509730159163
 9 | 7,44145489,G,A,rs2908274,A,1.08604250980022
10 | 7,44184122,G,A,rs730497,A,3.28882703813248
11 | 10,112992744,T,C,rs17747324,C,3.34452390445164
12 | 10,112998590,C,T,rs7903146,T,3.88464226247631
13 | 10,113014674,A,C,rs36090025,C,3.00494927966238
14 | 11,2670270,G,A,rs231361,A,1.28057459942797
15 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Blood Markers.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 1,158670244,A,G,rs857682,A,5.12028565509816
3 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Hyper Insulin.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 2,159022630,C,T,rs60436412,T,0.937208121482674
 3 | 3,185816694,G,A,rs9859406,A,1.4547676152557
 4 | 3,190741327,A,G,rs6792341,G,1.70532156926971
 5 | 6,39048860,C,T,rs10305420,C,1.18031610117673
 6 | 7,28140937,T,C,rs864745,T,1.38053889749524
 7 | 7,50819477,T,C,rs7781440,T,1.5370579485721
 8 | 7,53139155,A,C,rs10259582,C,0.975441660585956
 9 | 7,6694510,T,G,rs3801034,G,0.984780208023824
10 | 8,13987532,T,G,rs6991416,G,1.51623374674583
11 | 11,2190447,G,A,rs7396447,G,1.59235929925333
12 | 11,4671034,G,T,rs10836447,G,1.90220698103679
13 | 12,55216782,G,T,rs12314766,G,1.00404060576134
14 | 12,55662031,C,T,rs11614818,C,1.04278700039453
15 | 12,55786980,G,A,rs1681087,A,1.06354540778224
16 | 12,65788675,G,A,rs971779,A,1.15455466224296
17 | 12,80922459,A,G,rs7306751,G,0.951919927045835
18 | 13,68824414,T,C,rs60108948,T,1.39201366039072
19 | 17,30816009,C,T,rs112944916,T,1.41391360410219
20 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Lipodystrophy.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 5,56511543,C,T,rs464605,T,5.61538480583823
3 | 7,6694510,T,G,rs3801034,G,1.35073259004649
4 | 12,65788675,G,A,rs971779,A,1.01288557808651
5 | 12,85437387,G,A,rs1533223,A,0.988679452869238
6 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Obesity.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 6,42910968,C,T,rs112504542,T,1.30936561514991
3 | 7,28140937,T,C,rs864745,T,0.975796362368512
4 | 7,6694510,T,G,rs3801034,G,1.16998873599611
5 | 12,57574955,G,A,rs11172254,G,1.09360276744307
6 | 16,53777876,A,G,rs62033400,G,5.98919318612742
7 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Proinsulin.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 6,43850987,T,C,rs9472141,C,4.12812379000962
3 | 7,15019701,T,C,rs12540947,T,1.24419174028044
4 | 7,15025271,T,G,rs4721401,G,1.26922197503576
5 | 7,56156535,G,T,rs34224159,G,1.00495716026767
6 | 11,120779344,T,C,rs2846098,C,0.946849140526209
7 | 12,85437387,G,A,rs1533223,A,0.923071432638415
8 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/hg38_score_info/Total_GRS.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,158670244,A,G,rs857682,A,0.0626
 3 | 2,159022630,C,T,rs60436412,T,0.089
 4 | 3,123346931,A,G,rs11708067,A,0.1178
 5 | 3,185816694,G,A,rs9859406,A,0.1152
 6 | 3,190741327,A,G,rs6792341,G,0.0877
 7 | 3,197541732,C,T,rs9325395,T,0.0678
 8 | 5,31436933,C,T,rs493760,C,0.33
 9 | 5,56511543,C,T,rs464605,T,0.0774
10 | 6,20686342,C,A,rs7766070,A,0.0794
11 | 6,39048860,C,T,rs10305420,C,0.1423
12 | 6,42910968,C,T,rs112504542,T,0.0665
13 | 6,43850987,T,C,rs9472141,C,0.0629
14 | 7,15019701,T,C,rs12540947,T,0.1256
15 | 7,15025271,T,G,rs4721401,G,0.1012
16 | 7,157233358,A,G,rs1182443,G,0.0645
17 | 7,28140937,T,C,rs864745,T,0.0825
18 | 7,44145489,G,A,rs2908274,A,0.089
19 | 7,44184122,G,A,rs730497,A,0.0918
20 | 7,50819477,T,C,rs7781440,T,0.0861
21 | 7,53139155,A,C,rs10259582,C,0.0756
22 | 7,56156535,G,T,rs34224159,G,0.1109
23 | 7,6694510,T,G,rs3801034,G,0.078
24 | 8,13987532,T,G,rs6991416,G,0.0697
25 | 8,41651740,G,A,rs12549902,A,0.1138
26 | 10,112992744,T,C,rs17747324,C,0.2021
27 | 10,112998590,C,T,rs7903146,T,0.226
28 | 10,113014674,A,C,rs36090025,C,0.109
29 | 11,120779344,T,C,rs2846098,C,0.3094
30 | 11,2175119,A,G,rs4930044,A,0.0709
31 | 11,2190447,G,A,rs7396447,G,0.097
32 | 11,2670270,G,A,rs231361,A,0.0798
33 | 11,4671034,G,T,rs10836447,G,0.0782
34 | 12,41574480,G,A,rs10880103,G,0.0772
35 | 12,55216782,G,T,rs12314766,G,0.1298
36 | 12,55662031,C,T,rs11614818,C,0.087
37 | 12,55786980,G,A,rs1681087,A,0.0657
38 | 12,57574955,G,A,rs11172254,G,0.0974
39 | 12,65788675,G,A,rs971779,A,0.1251
40 | 12,75039749,A,G,rs10879874,A,0.1001
41 | 12,80592093,G,T,rs1528287,T,0.4939
42 | 12,80922459,A,G,rs7306751,G,0.0647
43 | 12,85437387,G,A,rs1533223,A,0.066
44 | 13,31135669,A,C,rs9538367,C,0.3072
45 | 13,68824414,T,C,rs60108948,T,0.0867
46 | 16,53777876,A,G,rs62033400,G,0.1506
47 | 17,30816009,C,T,rs112944916,T,0.0999
48 | 17,77902588,A,G,rs9899517,A,0.0652
49 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AFR/liftover_map.txt:
--------------------------------------------------------------------------------
 1 | VAR_ID_hg19,rsID,VAR_ID_hg38,nearest_gene
 2 | 7_53206848_A_C,rs10259582,7_53139155_A_C,POM121L12
 3 | 6_39016636_C_T,rs10305420,6_39048860_C_T,GLP1R
 4 | 12_75455390_C_T,rs10785171,12_75061610_C_T,KCNC2
 5 | 1_229024237_G_A,rs10799493,1_228888490_G_A,RHOU
 6 | 11_4692264_G_T,rs10836447,11_4671034_G_T,OR51E2
 7 | 12_75433529_A_G,rs10879874,12_75039749_A_G,KCNC2
 8 | 12_41968282_G_A,rs10880103,12_41574480_G_A,PDZRN4
 9 | 11_12352989_C_T,rs11022305,11_12331442_C_T,11:12352989
10 | 12_57968738_G_A,rs11172254,12_57574955_G_A,KIF5A
11 | 12_64151880_T_C,rs111784736,12_63758100_T_C,RXYLT1
12 | 6_42878706_C_T,rs112504542,6_42910968_C_T,PTCRA
13 | 17_29143027_C_T,rs112944916,17_30816009_C_T,CRLF3
14 | 21_38438951_C_T,rs116043344,21_37066651_C_T,PIGP
15 | 12_56055815_C_T,rs11614818,12_55662031_C_T,METTL7B
16 | 3_123065778_A_G,rs11708067,3_123346931_A_G,ADCY5
17 | 7_157026052_A_G,rs1182443,7_157233358_A_G,UBE3C
18 | 12_55610566_G_T,rs12314766,12_55216782_G_T,OR10A7
19 | 12_61844049_T_C,rs12370484,12_61450268_T_C,TAFA2
20 | 3_76570616_G_A,rs12497354,3_76521465_G_A,ROBO2
21 | 7_15059326_T_C,rs12540947,7_15019701_T_C,DGKB
22 | 8_41509259_G_A,rs12549902,8_41651740_G_A,NKX6-3
23 | 6_22207900_T_C,rs13193644,6_22207671_T_C,CASC15
24 | 12_80985872_G_T,rs1528287,12_80592093_G_T,PTPRQ
25 | 12_85831165_G_A,rs1533223,12_85437387_G_A,ALX1
26 | 12_56180764_G_A,rs1681087,12_55786980_G_A,DNAJC14
27 | 9_135474131_G_A,rs17149330,9_132598744_G_A,DDX31
28 | 10_114752503_T_C,rs17747324,10_112992744_T_C,TCF7L2
29 | 5_60930584_C_T,rs1910026,5_61634757_C_T,C5orf64
30 | 12_128224722_A_G,rs191362921,12_127740177_A_G,LINC02393
31 | 12_82080177_A_G,rs1922403,12_81686398_A_G,PPFIA2
32 | 16_52901915_G_A,rs2052267,16_52868003_G_A,CHD9
33 | 13_97758894_C_T,rs2055423,13_97106640_C_T,MBNL2
34 | 13_106074896_T_C,rs2147479,13_105422547_T_C,DAOA-AS1
35 | 6_9065677_C_T,rs2224791,6_9065444_C_T,LOC100506207
36 | 11_2691500_G_A,rs231361,11_2670270_G_A,KCNQ1
37 | 21_40295064_T_C,rs2410066,21_38923140_T_C,ETS2
38 | 11_2641129_A_G,rs2412058,11_2619899_A_G,KCNQ1
39 | 11_120650053_T_C,rs2846098,11_120779344_T_C,GRIK4
40 | 7_44185088_G_A,rs2908274,7_44145489_G_A,GCK
41 | 7_56224228_G_T,rs34224159,7_56156535_G_T,PSPH
42 | 3_193938796_G_A,rs35402322,3_194221007_G_A,LINC00887
43 | 10_114774433_A_C,rs36090025,10_113014674_A_C,TCF7L2
44 | 7_6734141_T_G,rs3801034,7_6694510_T_G,ZNF12
45 | 12_70850622_T_C,rs380835,12_70456842_T_C,KCNMB4
46 | 12_88335960_G_A,rs4146751,12_87942183_G_A,C12orf50
47 | 5_55807370_C_T,rs464605,5_56511543_C_T,ANKRD55
48 | 7_15064896_T_G,rs4721401,7_15025271_T_G,DGKB
49 | 12_54339052_C_A,rs4759058,12_53945268_C_A,HOXC13
50 | 12_52220968_C_T,rs4762019,12_51827184_C_T,FIGNL2
51 | 3_127710138_C_T,rs4857925,3_127991295_C_T,KBTBD12
52 | 11_2196349_A_G,rs4930044,11_2175119_A_G,MIR4686
53 | 5_31437040_C_T,rs493760,5_31436933_C_T,DROSHA
54 | 2_16663741_C_T,rs5023163,2_16482473_C_T,CYRIA
55 | 7_67444857_T_C,rs538285390,7_67979870_T_C,STAG3L4
56 | 12_77581589_C_A,rs55695739,12_77187809_C_A,E2F7
57 | 12_54527168_C_T,rs55730794,12_54133384_C_T,LINC02381
58 | 8_11505266_A_C,rs56965246,8_11647757_A_C,GATA4
59 | 13_69398546_T_C,rs60108948,13_68824414_T_C,LINC00550
60 | 2_159879142_C_T,rs60436412,2_159022630_C_T,TANC1
61 | 15_57042759_T_G,rs62022933,15_56750561_T_G,ZNF280D
62 | 16_53811788_A_G,rs62033400,16_53777876_A_G,FTO
63 | 3_190459116_A_G,rs6792341,3_190741327_A_G,IL1RAP
64 | 13_91908725_C_T,rs68126334,13_91256471_C_T,LINC00379
65 | 6_15275305_A_G,rs6921502,6_15275074_A_G,JARID2
66 | 7_40618995_A_G,rs6948900,7_40579396_A_G,SUGCT
67 | 8_13845041_T_G,rs6991416,8_13987532_T_G,SGCZ
68 | 7_44223721_G_A,rs730497,7_44184122_G_A,GCK
69 | 12_81316238_A_G,rs7306751,12_80922459_A_G,LIN7A
70 | 12_38710523_G_A,rs7315028,12_38316721_G_A,ALG10B
71 | 11_2211677_G_A,rs7396447,11_2190447_G_A,MIR4686
72 | 12_66242327_A_G,rs74234985,12_65848547_A_G,HMGA2
73 | 3_187644695_C_T,rs74802573,3_187926907_C_T,BCL6
74 | 11_2187477_C_T,rs7483805,11_2166247_C_T,TH
75 | 5_55780388_C_T,rs7730776,5_56484561_C_T,ANKRD55
76 | 6_20686573_C_A,rs7766070,6_20686342_C_A,CDKAL1
77 | 7_50887174_T_C,rs7781440,7_50819477_T_C,GRB10
78 | 7_113340067_A_G,rs7802710,7_113700012_A_G,PPP1R3A
79 | 10_114758349_C_T,rs7903146,10_112998590_C_T,TCF7L2
80 | 1_51209148_T_C,rs79090772,1_50743476_T_C,FAF1
81 | 11_121266882_A_G,rs7938784,11_121396173_A_G,SORL1
82 | 12_66431690_G_A,rs7965495,12_66037910_G_A,HMGA2
83 | 12_33653477_T_C,rs7972688,12_33500542_T_C,SYT10
84 | 1_158640034_A_G,rs857682,1_158670244_A_G,SPTA1
85 | 7_28180556_T_C,rs864745,7_28140937_T_C,JAZF1
86 | 3_197268603_C_T,rs9325395,3_197541732_C_T,BDH1
87 | 6_165014556_C_A,rs9347878,6_164593523_C_A,C6orf118
88 | 6_8874553_T_C,rs9393070,6_8874320_T_C,LOC100506207
89 | 6_43818724_T_C,rs9472141,6_43850987_T_C,LINC01512
90 | 13_31709806_A_C,rs9538367,13_31135669_A_C,HSPH1
91 | 7_43320431_A_G,rs9648079,7_43280832_A_G,HECW1
92 | 12_66182455_G_A,rs971779,12_65788675_G_A,RPSAP52
93 | 3_193811168_A_G,rs9823161,3_194093379_A_G,HES1
94 | 3_185534482_G_A,rs9859406,3_185816694_G_A,IGF2BP2
95 | 17_75898670_A_G,rs9899517,17_77902588_A_G,LINC01973
96 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/ASAT Pos.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 3,185796605,T,C,rs6444081,C,1.85709728620563
3 | 4,156902837,A,G,rs36029656,G,1.6925373099693
4 | 5,56558326,C,T,rs6867983,T,5.02054454459969
5 | 7,28156603,A,G,rs849134,A,1.36149359686958
6 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Beta Cell 2.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 3,185796605,T,C,rs6444081,C,1.49929836175987
3 | 7,28156603,A,G,rs849134,A,1.71723423601776
4 | 10,112990398,C,T,rs11196182,C,1.81731405933532
5 | 10,112998590,C,T,rs7903146,T,5.4918602156638
6 | 10,12211598,G,A,rs12221133,A,1.32097143637614
7 | 11,2828300,A,C,rs2283228,A,1.12295327353213
8 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Lipodystrophy.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 7,130765628,G,A,rs172306,A,4.53455920944434
3 | 10,121164303,C,T,rs7087606,C,1.77462714072911
4 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Obesity.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 10,113315384,C,T,rs4918811,C,0.962879223117557
3 | 10,79182863,C,T,rs810517,C,0.95425477256916
4 | 13,97061795,T,C,rs7333342,C,0.974049344424921
5 | 16,53794154,C,T,rs17817964,T,6.34437002752861
6 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Total_GRS.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 3,185796605,T,C,rs6444081,C,0.1504
 3 | 4,130073468,G,A,rs55653883,G,0.1658
 4 | 4,152146894,A,C,rs35752631,A,0.1404
 5 | 4,156902837,A,G,rs36029656,G,0.1277
 6 | 5,56558326,C,T,rs6867983,T,0.1457
 7 | 7,130765628,G,A,rs172306,A,0.1241
 8 | 7,28156603,A,G,rs849134,A,0.0894
 9 | 7,94409622,C,T,rs42518,C,0.0817
10 | 9,28773206,A,C,rs824250,C,0.0709
11 | 10,112990398,C,T,rs11196182,C,0.252
12 | 10,112998590,C,T,rs7903146,T,0.325
13 | 10,113315384,C,T,rs4918811,C,0.1009
14 | 10,121164303,C,T,rs7087606,C,0.1708
15 | 10,12211598,G,A,rs12221133,A,0.1247
16 | 10,79182863,C,T,rs810517,C,0.1058
17 | 11,2821317,G,A,rs12294675,A,0.1879
18 | 11,2828300,A,C,rs2283228,A,0.2936
19 | 12,4226777,T,C,rs67904513,T,0.1979
20 | 13,111640797,G,T,rs113507970,T,1.3373
21 | 13,97061795,T,C,rs7333342,C,0.0878
22 | 16,53794154,C,T,rs17817964,T,0.0945
23 | 17,7042968,T,C,rs13342692,C,0.1737
24 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/hg38_score_info/Unknown.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 4,130073468,G,A,rs55653883,G,1.22345648743075
 3 | 4,152146894,A,C,rs35752631,A,1.55505592177207
 4 | 7,94409622,C,T,rs42518,C,1.13529799606247
 5 | 9,28773206,A,C,rs824250,C,1.07511172465863
 6 | 11,2821317,G,A,rs12294675,A,1.53046203465743
 7 | 11,2828300,A,C,rs2283228,A,1.12914340155893
 8 | 12,4226777,T,C,rs67904513,T,1.17350682402633
 9 | 13,111640797,G,T,rs113507970,T,1.13530179286828
10 | 17,7042968,T,C,rs13342692,C,1.05499505510544
11 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/AMR/liftover_map.txt:
--------------------------------------------------------------------------------
 1 | VAR_ID_hg19,rsID,VAR_ID_hg38,nearest_gene
 2 | 4_105844273_T_G,rs10005603,4_104923116_T_G,TET2
 3 | 9_11606348_T_C,rs10511567,9_11606348_T_C,PTPRD
 4 | 10_114750157_C_T,rs11196182,10_112990398_C_T,TCF7L2
 5 | 10_119713515_T_C,rs11198201,10_117954004_T_C,RAB11FIP2
 6 | 13_112293144_G_T,rs113507970,13_111640797_G_T,TEX29
 7 | 3_103220188_G_A,rs113829802,3_103501344_G_A,MIR548A3
 8 | 14_68010986_C_T,rs118148639,14_67544269_C_T,PLEKHH1
 9 | 4_1291058_C_T,rs11931251,4_1297270_C_T,MAEA
10 | 10_12253597_G_A,rs12221133,10_12211598_G_A,CDC123
11 | 11_2842547_G_A,rs12294675,11_2821317_G_A,KCNQ1
12 | 8_106035307_A_C,rs12548220,8_105023079_A_C,ZFPM2
13 | 16_78332174_T_G,rs12596093,16_78298277_T_G,WWOX
14 | 11_2800669_C_T,rs12794000,11_2779439_C_T,KCNQ1
15 | 9_13972415_G_A,rs13283809,9_13972416_G_A,LINC00583
16 | 17_6946287_T_C,rs13342692,17_7042968_T_C,SLC16A11
17 | 2_59182401_G_A,rs17049846,2_58955266_G_A,LINC01122
18 | 7_130450387_G_A,rs172306,7_130765628_G_A,KLF14
19 | 16_53828066_C_T,rs17817964,16_53794154_C_T,FTO
20 | 10_119418104_G_A,rs2184898,10_117658593_G_A,EMX2
21 | 11_2849530_A_C,rs2283228,11_2828300_A_C,KCNQ1
22 | 10_122347276_T_C,rs2291314,10_120587764_T_C,PLPP4
23 | 11_2901222_C_T,rs2411873,11_2879992_C_T,CDKN1C
24 | 5_79087302_G_A,rs259097,5_79791479_G_A,CMYA5
25 | 6_88418077_C_T,rs2787932,6_87708359_C_T,AKIRIN2
26 | 9_94521525_G_A,rs34134725,9_91759243_G_A,ROR2
27 | 4_153068046_A_C,rs35752631,4_152146894_A_C,FBXW7
28 | 4_157823989_A_G,rs36029656,4_156902837_A_G,PDGFC
29 | 7_94038934_C_T,rs42518,7_94409622_C_T,COL1A2
30 | 13_80628580_G_A,rs4885691,13_80054445_G_A,SPRY2
31 | 10_115075143_C_T,rs4918811,10_113315384_C_T,TCF7L2
32 | 11_2197286_A_G,rs4929965,11_2176056_A_G,MIR4686
33 | 11_2878935_C_T,rs4930016,11_2857705_C_T,KCNQ1
34 | 11_71318195_G_A,rs4945066,11_71607149_G_A,KRTAP5-11
35 | 4_130994623_G_A,rs55653883,4_130073468_G_A,C4orf33
36 | 10_71830776_C_T,rs55919533,10_70071020_C_T,MACROH2A2
37 | 22_32205632_A_C,rs5998135,22_31809646_A_C,DEPDC5
38 | 4_55048912_T_C,rs61650959,4_54182745_T_C,PDGFRA
39 | 1_3459867_A_G,rs61762173,1_3543303_A_G,MEGF6
40 | 1_82903595_G_A,rs61765084,1_82437912_G_A,ADGRL2
41 | 22_44998325_G_A,rs62228440,22_44602445_G_A,LINC00229
42 | 6_97177101_C_T,rs62412972,6_96729225_C_T,GPR63
43 | 3_185514393_T_C,rs6444081,3_185796605_T_C,IGF2BP2
44 | 1_82027976_G_A,rs6674509,1_81562291_G_A,ADGRL2
45 | 12_4335943_T_C,rs67904513,12_4226777_T_C,CCND2
46 | 5_55854153_C_T,rs6867983,5_56558326_C_T,MAP3K1
47 | 10_122923817_C_T,rs7087606,10_121164303_C_T,WDR11
48 | 18_24153466_C_A,rs7243826,18_26573502_C_A,KCTD1
49 | 4_707600_G_A,rs73221106,4_713811_G_A,PCGF3
50 | 13_97714049_T_C,rs7333342,13_97061795_T_C,OXGR1
51 | 11_2063846_C_T,rs73398031,11_2042616_C_T,H19
52 | 10_116080773_G_A,rs758212,10_114321014_G_A,AFAP1L2
53 | 4_117300599_A_G,rs7675851,4_116379443_A_G,MIR1973
54 | 4_154911022_A_G,rs7688570,4_153989870_A_G,SFRP2
55 | 7_843157_G_A,rs7801525,7_803520_G_A,SUN1
56 | 10_123932642_T_G,rs7897826,10_122173127_T_G,TACC2
57 | 10_114758349_C_T,rs7903146,10_112998590_C_T,TCF7L2
58 | 11_5751135_C_T,rs7935270,11_5729905_C_T,TRIM5
59 | 15_45072082_G_T,rs8023560,15_44779884_G_T,TRIM69
60 | 11_107253239_C_A,rs80288792,11_107382513_C_A,CWF19L2
61 | 10_80942620_C_T,rs810517,10_79182863_C_T,ZMIZ1
62 | 9_28773204_A_C,rs824250,9_28773206_A_C,LINGO2
63 | 7_28196222_A_G,rs849134,7_28156603_A_G,JAZF1
64 | 6_42128717_G_A,rs9471790,6_42160979_G_A,GUCA1A
65 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/ALP Neg.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 9,133274084,T,C,rs529565,C,8.36178385315219
3 | 12,120994499,T,G,rs1169302,T,1.4493184052045
4 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Beta Cell 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,45796261,A,G,rs7551025,A,1.05897449154299
 3 | 2,148810827,C,T,rs4499362,C,1.20214580073032
 4 | 2,233388510,T,C,rs838719,C,1.65890007058726
 5 | 2,58694642,G,A,rs13417036,A,1.1884555300111601
 6 | 3,152664563,A,C,rs1850421,A,1.0407632959667
 7 | 3,186943483,G,T,rs12494144,G,2.21150069431789
 8 | 6,127095785,A,G,rs2800733,A,1.04440549728214
 9 | 6,20682933,G,A,rs9350271,A,2.2011593661603
10 | 6,20743721,T,C,rs9358363,C,1.46659086596583
11 | 6,20926938,T,G,rs9465936,T,1.21580166598046
12 | 7,127580864,T,C,rs989100,C,0.947532312659824
13 | 7,157231816,A,G,rs1182444,G,1.46035575247526
14 | 7,44226585,C,T,rs35452727,T,1.25959114181733
15 | 8,117172544,C,T,rs13266634,C,1.24175705038725
16 | 8,36974792,G,A,rs56687477,A,1.11760420250299
17 | 8,38485494,T,C,rs328301,T,1.21832381259362
18 | 8,41648861,A,G,rs12549294,G,2.92003539595794
19 | 8,41661944,A,G,rs515071,G,1.13571219509328
20 | 9,136350937,G,A,rs78270318,A,1.13082004056711
21 | 9,22132879,T,C,rs10965248,T,1.4445767083959
22 | 9,81359621,A,G,rs10125947,A,1.15780696586224
23 | 10,112994329,T,C,rs7901695,C,1.14634451292131
24 | 10,12265895,C,T,rs11257655,T,1.49618088251067
25 | 10,92782853,G,A,rs7896332,G,1.23305976294232
26 | 10,97259387,C,T,rs3740522,C,1.18676481411811
27 | 11,17391413,A,G,rs7124355,A,1.13791609454109
28 | 11,35417556,G,T,rs1923293,G,1.1133954528047
29 | 11,92964815,C,T,rs10466351,T,2.69364990189508
30 | 12,108236003,G,A,rs1426371,G,1.60776816602254
31 | 13,32988367,T,C,rs7997912,C,1.67924628477302
32 | 13,91295793,G,A,rs9523295,G,0.920252193799057
33 | 15,62104190,A,G,rs7172432,A,1.90805714438906
34 | 15,89885662,A,C,rs10852123,A,1.33017201375341
35 | 15,90979023,G,A,rs8026714,A,1.02450165378977
36 | 17,67645535,C,T,rs2706710,T,0.977179999261815
37 | 19,45654658,G,A,rs7507912,G,1.01121814162339
38 | 19,45856295,A,G,rs12609371,G,0.953844507475111
39 | 20,44366172,T,C,rs12625671,C,0.963386102400816
40 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Beta Cell 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 2,27508073,T,C,rs1260326,C,0.938834406752605
 3 | 2,44961214,C,T,rs895636,T,1.98631732420755
 4 | 2,60358671,T,C,rs243019,C,0.954279346393862
 5 | 6,20682933,G,A,rs9350271,A,1.70867475612877
 6 | 7,14858657,C,T,rs17168486,T,1.7427604052071
 7 | 7,15024298,T,G,rs10950550,T,1.99450729695613
 8 | 8,117172544,C,T,rs13266634,C,3.64925590051388
 9 | 9,22132879,T,C,rs10965248,T,1.32250264059456
10 | 9,22137686,T,G,rs7018475,G,0.934217933381001
11 | 9,4292083,G,A,rs10758593,A,1.32408451085596
12 | 10,112994329,T,C,rs7901695,C,2.66742274538238
13 | 10,12265895,C,T,rs11257655,T,0.923058952262586
14 | 11,2836003,G,A,rs60808706,G,0.938558688818934
15 | 11,92964815,C,T,rs10466351,T,3.41810346728567
16 | 13,32988367,T,C,rs7997912,C,1.02831632190351
17 | 15,62104190,A,G,rs7172432,A,1.93563411356787
18 | 15,77454848,A,G,rs7178572,G,1.00567163395728
19 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Lipodystrophy 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,39478577,T,C,rs61779313,C,1.8505544202947
 3 | 1,50728312,A,G,rs12090545,A,1.70980451666358
 4 | 2,164745671,A,G,rs3769891,A,2.82807773456512
 5 | 3,185777532,G,A,rs13092876,A,1.55038216609814
 6 | 6,126643364,A,G,rs4273712,G,1.34061525226534
 7 | 6,34246893,A,G,rs4711389,A,1.43776952084951
 8 | 7,93477781,A,C,rs2074120,A,0.957966020800177
 9 | 10,121207656,G,A,rs10788149,G,1.26026948350704
10 | 10,92782853,G,A,rs7896332,G,0.964758195142507
11 | 12,71055741,G,T,rs7313668,T,1.02078514149782
12 | 16,81501185,T,C,rs2925979,T,3.63678616571073
13 | 18,63178651,T,C,rs12454712,T,1.92434210253456
14 | 19,33396499,T,C,rs7250869,T,1.12505423482159
15 | 19,45654658,G,A,rs7507912,G,1.16217682759086
16 | 19,7293108,T,C,rs8101064,T,1.39614440545665
17 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Lipodystrophy 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 2,164745671,A,G,rs3769891,A,0.972242711885497
 3 | 3,187980545,T,C,rs13086331,T,1.13875980113942
 4 | 3,23055584,C,T,rs861983,T,1.2694221182209
 5 | 5,134528909,G,A,rs329122,A,1.37438593786571
 6 | 5,56510924,A,G,rs459193,G,4.20300059160372
 7 | 6,34246893,A,G,rs4711389,A,0.923917693997165
 8 | 8,131867530,T,C,rs10505581,C,0.932698668823286
 9 | 9,1032567,G,A,rs1016565,A,1.63852517573449
10 | 10,121169979,T,C,rs10886863,C,1.48873464298509
11 | 12,117963051,G,A,rs111246699,A,1.02260481232994
12 | 12,120994499,T,G,rs1169302,T,1.93277518295661
13 | 17,31315412,T,C,rs7502556,T,1.49419289027242
14 | 18,63178651,T,C,rs12454712,T,1.42356374304542
15 | 19,33396499,T,C,rs7250869,T,2.82777959216272
16 | 19,7293108,T,C,rs8101064,T,0.983107797973924
17 | 20,58890516,G,A,rs6123837,A,0.929247394504364
18 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Liver Lipid.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 2,27508073,T,C,rs1260326,C,8.70714920205364
3 | 6,7231610,G,A,rs9379084,G,0.954621381002466
4 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Obesity.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,177909798,G,A,rs532504,A,2.38687601806275
 3 | 1,39478577,T,C,rs61779313,C,1.20426863670431
 4 | 2,630662,A,C,rs6731688,C,2.41359285170887
 5 | 4,45184122,G,A,rs10938398,A,2.3625225272987
 6 | 5,75279159,A,G,rs2126736,A,1.26239274968625
 7 | 5,96508059,C,T,rs261982,T,1.07415276743519
 8 | 6,50819746,G,T,rs62405419,T,1.71330188212786
 9 | 7,70231919,A,G,rs12698877,G,0.97916886732597
10 | 11,27678578,C,T,rs7103411,T,2.30465226379423
11 | 16,20243775,C,T,rs4238585,T,1.19815302452613
12 | 16,3597097,G,A,rs2240885,A,0.917582164874537
13 | 16,53767042,T,C,rs1421085,C,5.36461255586884
14 | 18,60185354,T,C,rs476828,C,3.02194014380283
15 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Proinsulin.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 8,41661944,A,G,rs515071,G,1.10184318280305
3 | 11,72752390,G,A,rs7109575,G,6.12083008155497
4 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Total_GRS.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,177909798,G,A,rs532504,A,0.0547999999999974
 3 | 1,39478577,T,C,rs61779313,C,0.0603000000000033
 4 | 1,45796261,A,G,rs7551025,A,0.0490999999999999
 5 | 1,50728312,A,G,rs12090545,A,0.0883000000000005
 6 | 2,148810827,C,T,rs4499362,C,0.0435
 7 | 2,164745671,A,G,rs3769891,A,0.0549999999999998
 8 | 2,233388510,T,C,rs838719,C,0.0452999999999993
 9 | 2,27508073,T,C,rs1260326,C,0.0632000000000013
10 | 2,44961214,C,T,rs895636,T,0.0523000000000021
11 | 2,58694642,G,A,rs13417036,A,0.0496999999999998
12 | 2,60358671,T,C,rs243019,C,0.0553000000000006
13 | 2,630662,A,C,rs6731688,C,0.0982000000000037
14 | 3,122246352,T,G,rs9859381,G,0.0391999999999962
15 | 3,152664563,A,C,rs1850421,A,0.0442000000000003
16 | 3,170953520,T,G,rs7642480,T,0.0584
17 | 3,185777532,G,A,rs13092876,A,0.125699999999997
18 | 3,186943483,G,T,rs12494144,G,0.0442999999999999
19 | 3,187980545,T,C,rs13086331,T,0.0508999999999999
20 | 3,23055584,C,T,rs861983,T,0.136900000000004
21 | 4,1260747,C,T,rs7656416,C,0.1002
22 | 4,45184122,G,A,rs10938398,A,0.0457000000000029
23 | 5,134528909,G,A,rs329122,A,0.0386000000000032
24 | 5,56510924,A,G,rs459193,G,0.0736999999999982
25 | 5,75279159,A,G,rs2126736,A,0.0376999999999998
26 | 5,96508059,C,T,rs261982,T,0.039699999999998
27 | 6,126643364,A,G,rs4273712,G,0.0468999999999971
28 | 6,127095785,A,G,rs2800733,A,0.0640000000000005
29 | 6,20682933,G,A,rs9350271,A,0.193300000000001
30 | 6,20743721,T,C,rs9358363,C,0.0600000000000004
31 | 6,20926938,T,G,rs9465936,T,0.0540999999999997
32 | 6,34246893,A,G,rs4711389,A,0.0926000000000004
33 | 6,39078868,A,C,rs742762,A,0.0750999999999999
34 | 6,50819746,G,T,rs62405419,T,0.0441999999999981
35 | 6,7231610,G,A,rs9379084,G,0.0717999999999996
36 | 7,127580864,T,C,rs989100,C,0.0662999999999975
37 | 7,14858657,C,T,rs17168486,T,0.0643000000000005
38 | 7,15024298,T,G,rs10950550,T,0.0650000000000005
39 | 7,157231816,A,G,rs1182444,G,0.047300000000003
40 | 7,44226585,C,T,rs35452727,T,0.0559999999999987
41 | 7,70231919,A,G,rs12698877,G,0.0672999999999987
42 | 7,93477781,A,C,rs2074120,A,0.0409000000000003
43 | 8,117172544,C,T,rs13266634,C,0.116
44 | 8,131867530,T,C,rs10505581,C,0.0416000000000018
45 | 8,36974792,G,A,rs56687477,A,0.0461000000000004
46 | 8,38485494,T,C,rs328301,T,0.0395000000000002
47 | 8,41648861,A,G,rs12549294,G,0.0706000000000022
48 | 8,41661944,A,G,rs515071,G,0.0653999999999977
49 | 8,74290985,A,G,rs185063984,G,0.1324
50 | 9,1032567,G,A,rs1016565,A,0.0375000000000047
51 | 9,133274084,T,C,rs529565,C,0.0428000000000014
52 | 9,136350937,G,A,rs78270318,A,0.067499999999998
53 | 9,22132879,T,C,rs10965248,T,0.1829
54 | 9,22137686,T,G,rs7018475,G,0.0593999999999996
55 | 9,4292083,G,A,rs10758593,A,0.0633000000000042
56 | 9,81359621,A,G,rs10125947,A,0.0432999999999998
57 | 10,112994329,T,C,rs7901695,C,0.275400000000002
58 | 10,121169979,T,C,rs10886863,C,0.0595000000000034
59 | 10,121207656,G,A,rs10788149,G,0.0521999999999996
60 | 10,12265895,C,T,rs11257655,T,0.113999999999997
61 | 10,92782853,G,A,rs7896332,G,0.0570000000000003
62 | 10,97259387,C,T,rs3740522,C,0.0459999999999996
63 | 11,17391413,A,G,rs7124355,A,0.0741999999999996
64 | 11,27678578,C,T,rs7103411,T,0.0413000000000032
65 | 11,2836003,G,A,rs60808706,G,0.2394
66 | 11,35417556,G,T,rs1923293,G,0.0548999999999997
67 | 11,72752390,G,A,rs7109575,G,0.141
68 | 11,92964815,C,T,rs10466351,T,0.037100000000001
69 | 12,108236003,G,A,rs1426371,G,0.0479000000000003
70 | 12,117963051,G,A,rs111246699,A,0.0613999999999981
71 | 12,120994499,T,G,rs1169302,T,0.0500999999999995
72 | 12,71055741,G,T,rs7313668,T,0.0446999999999987
73 | 13,22015744,A,G,rs9316706,A,0.0414
74 | 13,32988367,T,C,rs7997912,C,0.078799999999996
75 | 13,91295793,G,A,rs9523295,G,0.0774999999999998
76 | 15,62104190,A,G,rs7172432,A,0.0792000000000003
77 | 15,75444946,T,C,rs7171507,T,0.0457000000000001
78 | 15,77454848,A,G,rs7178572,G,0.0717000000000007
79 | 15,89885662,A,C,rs10852123,A,0.0601
80 | 15,90979023,G,A,rs8026714,A,0.0657000000000009
81 | 16,20243775,C,T,rs4238585,T,0.0470000000000041
82 | 16,3597097,G,A,rs2240885,A,0.0421999999999966
83 | 16,53767042,T,C,rs1421085,C,0.130299999999999
84 | 16,81501185,T,C,rs2925979,T,0.0423000000000003
85 | 17,31315412,T,C,rs7502556,T,0.0442999999999999
86 | 17,37741595,A,C,rs8064454,A,0.1202
87 | 17,67645535,C,T,rs2706710,T,0.0711000000000007
88 | 18,60185354,T,C,rs476828,C,0.0840000000000036
89 | 18,63178651,T,C,rs12454712,T,0.0543000000000005
90 | 19,33396499,T,C,rs7250869,T,0.0557999999999999
91 | 19,45654658,G,A,rs7507912,G,0.0932000000000006
92 | 19,45856295,A,G,rs12609371,G,0.0394000000000014
93 | 19,7293108,T,C,rs8101064,T,0.0634
94 | 20,44366172,T,C,rs12625671,C,0.0655000000000018
95 | 20,51538847,T,C,rs6021276,T,0.0426000000000005
96 | 20,58890516,G,A,rs6123837,A,0.0387000000000039
97 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/hg38_score_info/Unknown.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 2,233388510,T,C,rs838719,C,1.36292318377145
 3 | 2,44961214,C,T,rs895636,T,1.36471787570867
 4 | 3,122246352,T,G,rs9859381,G,0.925934188534673
 5 | 3,170953520,T,G,rs7642480,T,1.00385231931068
 6 | 4,1260747,C,T,rs7656416,C,1.33469344718539
 7 | 6,20682933,G,A,rs9350271,A,1.20245846308046
 8 | 6,39078868,A,C,rs742762,A,1.09343062879404
 9 | 7,15024298,T,G,rs10950550,T,1.10075181627983
10 | 8,74290985,A,G,rs185063984,G,1.03203544993695
11 | 9,22132879,T,C,rs10965248,T,1.09197796768426
12 | 11,17391413,A,G,rs7124355,A,0.911107861026035
13 | 11,2836003,G,A,rs60808706,G,2.00966819176367
14 | 13,22015744,A,G,rs9316706,A,0.903453718276932
15 | 15,75444946,T,C,rs7171507,T,1.3029635126487
16 | 15,77454848,A,G,rs7178572,G,0.91899776059859
17 | 17,37741595,A,C,rs8064454,A,1.12401219294317
18 | 20,51538847,T,C,rs6021276,T,0.976659512991192
19 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EAS/liftover_map.txt:
--------------------------------------------------------------------------------
  1 | VAR_ID_hg19,rsID,VAR_ID_hg38,nearest_gene
  2 | 4_153520279_G_A,rs10011838,4_152599127_G_A,TMEM154
  3 | 9_83974536_A_G,rs10125947,9_81359621_A_G,TLE1
  4 | 9_1032567_G_A,rs1016565,9_1032567_G_A,DMRT2
  5 | 11_92697981_C_T,rs10466351,11_92964815_C_T,MTNR1B
  6 | 8_132879777_T_C,rs10505581,8_131867530_T_C,EFR3A
  7 | 13_26781528_G_A,rs10507349,13_26207391_G_A,RNF6
  8 | 9_4292083_G_A,rs10758593,9_4292083_G_A,GLIS3
  9 | 10_122967170_G_A,rs10788149,10_121207656_G_A,FGFR2
 10 | 10_124149917_T_C,rs10788284,10_122390401_T_C,PLEKHA1
 11 | 15_90428894_A_C,rs10852123,15_89885662_A_C,AP3S2
 12 | 12_97850215_C_A,rs10860209,12_97456437_C_A,RMST
 13 | 10_114711755_C_T,rs10885396,10_112951996_C_T,TCF7L2
 14 | 10_122929493_T_C,rs10886863,10_121169979_T_C,WDR11
 15 | 4_45186139_G_A,rs10938398,4_45184122_G_A,GNPDA2
 16 | 5_51787940_A_G,rs10940253,5_52492106_A_G,ITGA1
 17 | 7_15063923_T_G,rs10950550,7_15024298_T_G,DGKB
 18 | 9_22132878_T_C,rs10965248,9_22132879_T_C,CDKN2B-AS1
 19 | 11_2203154_T_C,rs11043003,11_2181924_T_C,MIR4686
 20 | 12_118400856_G_A,rs111246699,12_117963051_G_A,KSR2
 21 | 10_94479070_C_T,rs111268405,10_92719313_C_T,HHEX
 22 | 11_2292242_C_T,rs111826047,11_2271012_C_T,ASCL2
 23 | 10_122834572_G_T,rs11199753,10_121075059_G_T,WDR11
 24 | 10_12307894_C_T,rs11257655,10_12265895_C_T,CDC123
 25 | 9_98278413_C_T,rs113154802,9_95516131_C_T,PTCH1
 26 | 17_4024824_G_A,rs113547729,17_4121530_G_A,ZZEF1
 27 | 14_77320585_C_T,rs11620646,14_76854242_C_T,LRRC74A
 28 | 12_121432302_T_G,rs1169302,12_120994499_T_G,HNF1A
 29 | 16_20323168_G_A,rs117267808,16_20311846_G_A,GP2
 30 | 6_39283083_G_A,rs11753141,6_39315307_G_A,KCNK16
 31 | 11_128389391_C_T,rs11819995,11_128519496_C_T,ETS1
 32 | 7_157024510_A_G,rs1182444,7_157231816_A_G,UBE3C
 33 | 3_23258614_G_A,rs11926494,3_23217123_G_A,UBE2E2
 34 | 1_51193984_A_G,rs12090545,1_50728312_A_G,FAF1
 35 | 13_80707429_A_G,rs1215468,13_80133294_A_G,SPRY2
 36 | 6_20586039_A_G,rs12213132,6_20585808_A_G,CDKAL1
 37 | 13_51088809_G_A,rs123378,13_50514673_G_A,DLEU1
 38 | 10_89684214_C_A,rs1236816,10_87924457_C_A,PTEN
 39 | 1_214155398_C_A,rs12403994,1_213982055_C_A,PROX1
 40 | 14_24878370_C_T,rs12437434,14_24409164_C_T,NYNRIN
 41 | 17_36056192_T_G,rs12452659,17_37696186_T_G,HNF1B
 42 | 18_60845884_T_C,rs12454712,18_63178651_T_C,BCL2
 43 | 3_186661271_G_T,rs12494144,3_186943483_G_T,ST6GAL1
 44 | 3_124921920_G_A,rs12497133,3_125203076_G_A,SLC12A8
 45 | 8_41506380_A_G,rs12549294,8_41648861_A_G,NKX6-3
 46 | 16_72022534_C_T,rs12600132,16_71988635_C_T,PKD1L3
 47 | 2_27730940_T_C,rs1260326,2_27508073_T_C,GCKR
 48 | 19_46359553_A_G,rs12609371,19_45856295_A_G,SYMPK
 49 | 20_42994812_T_C,rs12625671,20_44366172_T_C,HNF4A
 50 | 7_69696905_A_G,rs12698877,7_70231919_A_G,AUTS2
 51 | 12_114124239_G_A,rs12816687,12_113686434_G_A,RBM19
 52 | 3_187698333_T_C,rs13086331,3_187980545_T_C,LPP-AS2
 53 | 3_185495320_G_A,rs13092876,3_185777532_G_A,IGF2BP2
 54 | 8_118184783_C_T,rs13266634,8_117172544_C_T,SLC30A8
 55 | 9_81917127_T_C,rs1328412,9_79302212_T_C,TLE4
 56 | 2_58921777_G_A,rs13417036,2_58694642_G_A,LINC01122
 57 | 22_50439430_A_G,rs137845,22_50001001_A_G,IL17REL
 58 | 16_53800954_T_C,rs1421085,16_53767042_T_C,FTO
 59 | 12_108629780_G_A,rs1426371,12_108236003_G_A,WSCD2
 60 | 3_114960798_A_C,rs1459513,3_115241951_A_C,ZBTB20
 61 | 10_63717113_C_T,rs146716733,10_61957354_C_T,ARID5B
 62 | 11_2767262_G_A,rs149658560,11_2746032_G_A,KCNQ1
 63 | 6_20455564_C_T,rs16883824,6_20455333_C_T,E2F3
 64 | 10_64903204_G_A,rs16913026,10_63143444_G_A,NRBF2
 65 | 15_28506833_G_A,rs16950949,15_28261687_G_A,HERC2
 66 | 7_14898282_C_T,rs17168486,7_14858657_C_T,DGKB
 67 | 10_71321279_A_G,rs177045,10_69561523_A_G,NEUROG3
 68 | 8_118028794_T_G,rs17744945,8_117016555_T_G,SLC30A8
 69 | 1_22068326_A_G,rs1825307,1_21741833_A_G,USP48
 70 | 3_152382352_A_C,rs1850421,3_152664563_A_C,P2RY1
 71 | 8_75203220_A_G,rs185063984,8_74290985_A_G,JPH1
 72 | 17_36051372_C_T,rs1859211,17_37691367_C_T,HNF1B
 73 | 17_6953781_C_T,rs186568031,17_7050462_C_T,SLC16A11
 74 | 8_37380139_T_C,rs1892609,8_37522621_T_C,ZNF703
 75 | 11_35439103_G_T,rs1923293,11_35417556_G_T,SLC1A2
 76 | 10_71273357_G_A,rs1955163,10_69513601_G_A,TSPAN15
 77 | 5_52097183_A_G,rs2059202,5_52801349_A_G,PELO
 78 | 7_93107093_A_C,rs2074120,7_93477781_A_C,CALCR
 79 | 6_33550025_A_G,rs210148,6_33582248_A_G,GGNBP1
 80 | 5_74574984_A_G,rs2126736,5_75279159_A_G,ANKRD31
 81 | 11_2863820_A_C,rs2237898,11_2842590_A_C,KCNQ1
 82 | 16_3647098_G_A,rs2240885,16_3597097_G_A,SLX4
 83 | 1_64107893_G_A,rs2269245,1_63642222_G_A,PGM1
 84 | 1_184020945_G_A,rs2274432,1_184051811_G_A,TSEN15
 85 | 11_2569903_C_T,rs2283159,11_2548673_C_T,KCNQ1
 86 | 11_2745107_C_T,rs231917,11_2723877_C_T,KCNQ1
 87 | 6_137293227_T_C,rs2327777,6_136972090_T_C,NHEG1
 88 | 2_60585806_T_C,rs243019,2_60358671_T_C,MIR4432
 89 | 12_66232810_G_T,rs2583934,12_65839030_G_T,HMGA2
 90 | 5_95843763_C_T,rs261982,5_96508059_C_T,PCSK1
 91 | 17_65641651_C_T,rs2706710,17_67645535_C_T,PITPNC1
 92 | 9_84308948_G_A,rs2796441,9_81694033_G_A,TLE1
 93 | 6_127416930_A_G,rs2800733,6_127095785_A_G,RSPO3
 94 | 10_12331586_A_G,rs2801473,10_12289587_A_G,CDC123
 95 | 4_71844118_G_A,rs28599782,4_70978401_G_A,MOB1B
 96 | 22_46313618_G_T,rs28637892,22_45917738_G_T,WNT7B
 97 | 7_44174857_T_G,rs2908279,7_44135258_T_G,MYL7
 98 | 16_81534790_T_C,rs2925979,16_81501185_T_C,CMIP
 99 | 5_176513896_C_A,rs3135911,5_177086895_C_A,FGFR4
100 | 7_127793861_G_A,rs322728,7_128153809_G_A,MIR129-1
101 | 8_38343012_T_C,rs328301,8_38485494_T_C,FGFR1
102 | 5_133864599_G_A,rs329122,5_134528909_G_A,JADE2
103 | 8_17927609_C_T,rs34642578,8_18070100_C_T,ASAH1
104 | 1_229672955_G_A,rs348330,1_229537208_G_A,ABCB10
105 | 8_73503743_A_C,rs349359,8_72591508_A_C,KCNB2
106 | 7_44266184_C_T,rs35452727,7_44226585_C_T,CAMK2B
107 | 10_94435673_G_A,rs35906730,10_92675916_G_A,HHEX
108 | 10_99019144_C_T,rs3740522,10_97259387_C_T,ARHGAP19
109 | 15_40616742_G_A,rs3743140,15_40324541_G_A,CCDC9B
110 | 12_27963402_G_A,rs3751236,12_27810469_G_A,KLHL42
111 | 2_165602181_A_G,rs3769891,2_164745671_A_G,COBLL1
112 | 10_12401811_T_C,rs4073527,10_12359812_T_C,CAMK1D
113 | 16_20255097_C_T,rs4238585,16_20243775_C_T,GP2
114 | 6_126964510_A_G,rs4273712,6_126643364_A_G,CENPW
115 | 4_6289986_T_G,rs4458523,4_6288259_T_G,WFS1
116 | 2_149568396_C_T,rs4499362,2_148810827_C_T,EPC2
117 | 7_126864679_T_G,rs4532535,7_127224625_T_G,GRM8
118 | 9_4309006_C_T,rs4567095,9_4309006_C_T,GLIS3
119 | 5_55806751_A_G,rs459193,5_56510924_A_G,ANKRD55
120 | 6_34214670_A_G,rs4711389,6_34246893_A_G,SMIM29
121 | 18_57852587_T_C,rs476828,18_60185354_T_C,MC4R
122 | 8_41519462_A_G,rs515071,8_41661944_A_G,ANK1
123 | 9_136149500_T_C,rs529565,9_133274084_T_C,ABO
124 | 1_177878933_G_A,rs532504,1_177909798_G_A,SEC16B
125 | 14_103237952_G_A,rs55700915,14_102771615_G_A,TRAF3
126 | 8_36832310_G_A,rs56687477,8_36974792_G_A,KCNU1
127 | 14_77382503_A_G,rs58524310,14_76916160_A_G,LRRC74A
128 | 11_3116024_A_G,rs59772385,11_3094794_A_G,OSBPL5
129 | 15_93825358_C_A,rs59876980,15_93282129_C_A,RGMA
130 | 3_123174832_C_T,rs60054445,3_123455985_C_T,ADCY5
131 | 8_126471274_G_A,rs60089934,8_125459032_G_A,TRIB1
132 | 20_48830265_T_C,rs6012876,20_50213728_T_C,CEBPB
133 | 20_50155386_T_C,rs6021276,20_51538847_T_C,NFATC2
134 | 3_23551971_G_A,rs60410861,3_23510480_G_A,UBE2E2
135 | 1_20688352_C_T,rs60573766,1_20361859_C_T,LINC01141
136 | 11_2857233_G_A,rs60808706,11_2836003_G_A,KCNQ1
137 | 20_57465571_G_A,rs6123837,20_58890516_G_A,GNAS
138 | 1_39944249_T_C,rs61779313,1_39478577_T_C,MACF1
139 | 12_4289091_C_A,rs61910828,12_4179925_C_A,CCND2
140 | 14_38809661_A_G,rs61975988,14_38340457_A_G,CLEC14A
141 | 6_50787459_G_T,rs62405419,6_50819746_G_T,TFAP2B
142 | 17_36124082_A_G,rs6607292,17_37764098_A_G,HNF1B
143 | 2_630662_A_C,rs6731688,2_630662_A_C,TMEM18
144 | 9_22137685_T_G,rs7018475,9_22137686_T_G,CDKN2B-AS1
145 | 10_80943841_G_A,rs703980,10_79184084_G_A,ZMIZ1
146 | 9_22288404_G_A,rs7045760,9_22288405_G_A,DMRTA1
147 | 11_27700125_C_T,rs7103411,11_27678578_C_T,BDNF
148 | 11_72463435_G_A,rs7109575,11_72752390_G_A,ARAP1
149 | 11_17412960_A_G,rs7124355,11_17391413_A_G,ABCC8
150 | 15_75737287_T_C,rs7171507,15_75444946_T_C,SIN3A
151 | 15_62396389_A_G,rs7172432,15_62104190_A_G,C2CD4A
152 | 15_77747190_A_G,rs7178572,15_77454848_A_G,HMG20A
153 | 19_33887405_T_C,rs7250869,19_33396499_T_C,PEPD
154 | 11_2853163_T_C,rs72844296,11_2831933_T_C,KCNQ1
155 | 22_46484465_A_G,rs7289813,22_46088585_A_G,MIRLET7BHG
156 | 20_22430241_G_A,rs73085586,20_22449603_G_A,LOC284788
157 | 12_71449521_G_T,rs7313668,12_71055741_G_T,TSPAN8
158 | 6_39046644_A_C,rs742762,6_39078868_A_C,GLP1R
159 | 17_29642430_T_C,rs7502556,17_31315412_T_C,NF1
160 | 19_46157916_G_A,rs7507912,19_45654658_G_A,EML2
161 | 6_39289871_G_A,rs75343229,6_39322095_G_A,KCNK16
162 | 1_46261933_A_G,rs7551025,1_45796261_A_G,MAST2
163 | 3_170671309_T_G,rs7642480,3_170953520_T_G,SLC2A2
164 | 4_1254535_C_T,rs7656416,4_1260747_C_T,CTBP1-DT
165 | 6_131954797_T_G,rs7739842,6_131633657_T_G,ENPP3
166 | 7_13886654_C_T,rs7787720,7_13847029_C_T,ETV1
167 | 9_139245389_G_A,rs78270318,9_136350937_G_A,GPSM1
168 | 10_112678657_G_T,rs7895872,10_110918899_G_T,BBIP1
169 | 10_94542610_G_A,rs7896332,10_92782853_G_A,EXOC6
170 | 10_114754088_T_C,rs7901695,10_112994329_T_C,TCF7L2
171 | 10_77303697_T_C,rs7906280,10_75543939_T_C,ZNF503-AS2
172 | 10_94316828_C_A,rs7917163,10_92557071_C_A,IDE
173 | 11_2626334_A_G,rs7947981,11_2605104_A_G,KCNQ1
174 | 13_33562505_T_C,rs7997912,13_32988367_T_C,KL
175 | 7_140651523_G_A,rs801089,7_140951723_G_A,BRAF
176 | 6_117996631_T_C,rs80196932,6_117675468_T_C,NUS1
177 | 15_91522253_G_A,rs8026714,15_90979023_G_A,PRC1
178 | 15_38828140_G_T,rs8043085,15_38535939_G_T,RASGRP1
179 | 17_36101586_A_C,rs8064454,17_37741595_A_C,HNF1B
180 | 19_7293119_T_C,rs8101064,19_7293108_T_C,INSR
181 | 2_234297156_T_C,rs838719,2_233388510_T_C,DGKD
182 | 3_23097075_C_T,rs861983,3_23055584_C_T,UBE2E2
183 | 2_45188353_C_T,rs895636,2_44961214_C_T,SIX3
184 | 8_95960886_G_T,rs896852,8_94948658_G_T,TP53INP1
185 | 9_4243162_G_A,rs911490,9_4243162_G_A,GLIS3
186 | 5_14777799_A_G,rs9312873,5_14777690_A_G,ANKH
187 | 13_22589883_A_G,rs9316706,13_22015744_A_G,LINC00424
188 | 6_20683164_G_A,rs9350271,6_20682933_G_A,CDKAL1
189 | 6_20743952_T_C,rs9358363,6_20743721_T_C,CDKAL1
190 | 6_139205386_T_C,rs9376382,6_138884249_T_C,ECT2L
191 | 6_7231843_G_A,rs9379084,6_7231610_G_A,RREB1
192 | 6_143056556_T_C,rs9390022,6_142735419_T_C,HIVEP2
193 | 6_20509339_A_G,rs942041,6_20509108_A_G,E2F3
194 | 6_20927169_T_G,rs9465936,6_20926938_T_G,CDKAL1
195 | 13_91948047_G_A,rs9523295,13_91295793_G_A,MIR17HG
196 | 3_121965199_T_G,rs9859381,3_122246352_T_G,CASR
197 | 3_63891105_T_C,rs9870576,3_63905429_T_C,ATXN7
198 | 7_127220918_T_C,rs989100,7_127580864_T_C,GCC1
199 | 17_40913366_C_T,rs9892728,17_42761348_C_T,RAMP2
200 | 18_7076836_C_T,rs9948462,18_7076837_C_T,LAMA1
201 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/ALP Neg.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 3,196083316,T,C,rs7619708,T,0.811268374976863
 3 | 6,139515991,T,C,rs11155073,T,1.70122184458857
 4 | 6,43834141,C,T,rs2894536,C,0.848664475361421
 5 | 6,43843237,G,A,rs9369425,G,1.155447956349
 6 | 7,100715101,A,G,rs534043,G,0.764667990679758
 7 | 9,133269828,C,T,rs495203,T,5.324443416831
 8 | 10,69340132,G,A,rs10159477,G,3.69108483702209
 9 | 11,61798436,T,C,rs174541,T,2.17170144914074
10 | 12,48342520,C,A,rs2732480,C,1.41878190679511
11 | 16,29946895,G,A,rs8054556,A,0.74715798473066
12 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/ASAT Pos.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,11239454,C,T,rs7544489,T,0.840788000437575
 3 | 1,155299985,A,G,rs3020781,G,0.823621158224411
 4 | 1,50972693,C,T,rs3176466,C,1.03655309336363
 5 | 2,18540396,G,A,rs7558413,A,0.954927244263011
 6 | 3,46948414,G,A,rs12491473,G,0.856540760181833
 7 | 4,156731601,C,A,rs28819812,C,1.17684262294889
 8 | 4,18023861,G,A,rs2011603,A,1.55586100743138
 9 | 5,158599736,C,T,rs890940,T,0.838594967456831
10 | 5,53478680,G,A,rs62370480,A,1.74052293502224
11 | 5,53976834,G,A,rs4865796,A,1.08742124424837
12 | 5,56566067,G,A,rs9687846,A,2.09045528273711
13 | 6,43790159,C,A,rs998584,A,0.97430556694122
14 | 7,15886603,T,C,rs38221,T,0.735247066717914
15 | 7,40777054,C,T,rs17439448,T,0.83480231990919
16 | 7,4643627,G,A,rs62450857,A,0.911446672696138
17 | 8,26015118,A,G,rs17818197,G,0.736947611830721
18 | 9,94180777,G,A,rs10821311,A,0.80088704874938
19 | 10,122433665,T,G,rs2280141,T,0.940558954640188
20 | 10,68583018,C,T,rs10998304,C,0.835291215123657
21 | 11,17388025,T,C,rs5219,T,0.891800699436919
22 | 12,12718165,T,G,rs2066827,G,0.738867415588662
23 | 12,4275678,T,G,rs76895963,T,1.12801582156112
24 | 12,65965972,C,A,rs8756,A,1.53597273874399
25 | 15,40106553,T,C,rs484943,T,1.16059228847848
26 | 15,75522047,C,T,rs6495182,C,1.1139220422407
27 | 16,28906323,T,C,rs7188071,T,0.781186930213114
28 | 17,67853811,T,G,rs2046323,G,0.82708525898556
29 | 18,55383415,A,C,rs72926932,C,0.766823157775034
30 | 18,63178651,T,C,rs12454712,T,1.25235887701051
31 | 20,33720469,C,T,rs67611724,T,0.880527157857956
32 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Beta Cell 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,20404624,T,C,rs10916785,T,0.741404206287625
 3 | 1,26156065,T,C,rs10794522,C,0.805212778329622
 4 | 1,6612669,A,C,rs11583755,C,1.25357097726457
 5 | 2,120568332,G,A,rs9784137,G,0.820403466645448
 6 | 2,212971253,A,G,rs17354348,A,1.01652074032952
 7 | 2,233414615,A,G,rs838733,G,0.96775916799516
 8 | 2,59084401,G,A,rs980183,G,0.750786192301261
 9 | 2,67651196,T,C,rs1430780,T,0.953168180513232
10 | 3,12443429,A,C,rs60710264,C,1.04050302917565
11 | 3,152675088,T,G,rs9828639,T,0.903080832873115
12 | 3,186927836,C,T,rs13095782,T,0.752962034876219
13 | 3,186947857,C,T,rs3887925,T,1.52187930901443
14 | 3,23414091,T,C,rs35352848,T,1.21643075310443
15 | 3,35626269,A,G,rs10490871,G,0.895805899824739
16 | 3,9472332,G,A,rs3872707,A,1.27207396389584
17 | 4,105127134,T,C,rs17035289,C,1.50731554574906
18 | 4,20209330,A,G,rs11940813,G,0.99297172533189
19 | 4,751184,G,T,rs1531583,T,1.12056880681728
20 | 5,14788235,T,C,rs31931,T,1.45529760950762
21 | 5,158599736,C,T,rs890940,T,0.773259704059205
22 | 6,153106967,A,G,rs9383649,G,0.812825108274305
23 | 6,20679478,A,G,rs7756992,G,2.4158851314958
24 | 6,21008508,G,A,rs12192642,G,1.0179259075545
25 | 6,21079126,A,G,rs11967298,A,0.786669456973421
26 | 6,41044666,A,G,rs4714422,G,0.785686443600627
27 | 6,43790159,C,A,rs998584,A,1.36807730642919
28 | 6,7231610,G,A,rs9379084,G,0.777504300052759
29 | 7,102849009,A,G,rs56269620,A,0.841545229287769
30 | 7,30688836,C,T,rs917195,C,1.25275539527058
31 | 7,74693803,G,A,rs67755137,A,1.00561625403194
32 | 8,117173494,A,G,rs11558471,A,1.57021450268091
33 | 8,14267300,A,C,rs17294565,C,0.772226910587326
34 | 8,41643342,C,T,rs59191643,T,1.58981160160702
35 | 8,41672271,C,T,rs80105613,T,0.836685731976527
36 | 9,114181077,A,G,rs1431819,G,0.91771140542696
37 | 9,123351733,T,C,rs4838049,C,0.988468432674962
38 | 9,19080354,A,G,rs10963942,G,1.20289365831587
39 | 9,22003368,G,A,rs1063192,A,1.4671350314182
40 | 9,22134095,T,C,rs10811661,T,1.35935876634001
41 | 10,112989975,G,A,rs35011184,A,1.09498269557365
42 | 10,112999686,G,A,rs11196187,A,0.99848985803851
43 | 10,122433665,T,G,rs2280141,T,0.771338774506973
44 | 10,12265895,C,T,rs11257655,T,0.754770534442257
45 | 10,69692529,T,C,rs2812533,T,0.848763744071059
46 | 10,92105487,C,T,rs72807217,T,1.21863058213275
47 | 10,92703125,C,T,rs1111875,C,2.19850973740941
48 | 10,92740354,G,A,rs11187152,G,0.919680097388424
49 | 10,97299888,C,A,rs10882891,C,0.921215873844895
50 | 11,1675619,C,T,rs2334499,T,1.16349952469302
51 | 11,17388025,T,C,rs5219,T,0.751504045374263
52 | 11,2080053,G,A,rs11600952,A,1.07055990704825
53 | 11,2176056,A,G,rs4929965,A,1.13083153510612
54 | 11,2835964,A,C,rs2237895,C,0.885974644001596
55 | 11,2887735,A,G,rs450563,G,1.10786668524409
56 | 11,43855148,C,T,rs11555762,T,0.94971081927632
57 | 11,45834447,C,T,rs6485644,C,1.26005361446507
58 | 11,61798436,T,C,rs174541,T,1.59006227316493
59 | 11,92983658,G,A,rs11020132,A,0.775554598095402
60 | 12,108224853,C,T,rs3764002,C,1.55270649486474
61 | 12,121017786,G,A,rs56158042,G,1.08594768430691
62 | 12,121483887,C,T,rs7977709,C,1.25973625944639
63 | 12,21628312,G,T,rs10841868,G,1.14015482497832
64 | 12,26138216,C,T,rs56008051,C,0.830199393097582
65 | 12,6572620,A,G,rs67013744,G,0.809827435852229
66 | 13,32980164,G,A,rs576674,G,0.769566338445002
67 | 14,22819744,A,G,rs17122776,G,0.820396791694036
68 | 14,76834520,C,T,rs2056857,C,0.793738038091963
69 | 15,62099409,C,T,rs7163757,C,2.47960484122266
70 | 15,89836982,C,A,rs8031576,C,1.01613891151716
71 | 15,90968837,G,A,rs2290203,A,1.05530558031703
72 | 16,250389,T,C,rs55857387,T,1.0330434186243
73 | 17,36486677,A,G,rs4796224,G,0.762622711438937
74 | 17,37740776,A,G,rs11657964,A,0.806954666517041
75 | 17,48046280,A,C,rs9895554,A,0.990992463718845
76 | 17,9884641,G,A,rs7219033,A,1.53943529075368
77 | 18,57013486,G,A,rs12969494,G,0.966317439945646
78 | 19,45653979,G,A,rs10407429,G,1.14864321922069
79 | 20,41203988,T,C,rs17265513,C,0.986958226806199
80 | 20,62649310,C,T,rs2427363,C,1.10892377064271
81 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Beta Cell 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,117605762,G,A,rs41276588,A,0.748458007347135
 3 | 1,213985913,T,C,rs340874,C,1.50768670293772
 4 | 1,229537208,G,A,rs348330,G,0.761342766656576
 5 | 2,27508073,T,C,rs1260326,C,0.785198311558966
 6 | 2,43444037,C,T,rs13414140,C,1.32635211961424
 7 | 3,123346931,A,G,rs11708067,A,2.19165673547182
 8 | 3,125202613,C,T,rs9873519,T,0.790351147190566
 9 | 3,141415727,C,T,rs73872717,C,0.892883540217284
10 | 3,171015287,G,A,rs9873618,G,2.0205957229683
11 | 3,185803160,A,G,rs9854769,G,0.918261576228066
12 | 3,188024054,A,G,rs6777684,G,1.01919813122508
13 | 4,184792454,G,A,rs72695645,G,0.821010028344514
14 | 4,6291188,A,G,rs10010131,G,0.743525812495279
15 | 5,77131486,G,A,rs6878122,G,1.14389148167193
16 | 6,20679478,A,G,rs7756992,G,1.28626568929162
17 | 6,7275708,T,C,rs11243150,T,0.928903505017285
18 | 7,14858657,C,T,rs17168486,T,1.57737700043141
19 | 7,15024684,G,T,rs2191349,T,2.31443791524668
20 | 7,28160478,C,T,rs1513272,C,0.796075848951056
21 | 7,44161285,T,C,rs2041547,C,1.79951127425708
22 | 7,44195138,C,T,rs2908286,T,3.96133647523467
23 | 7,44325950,C,T,rs116913033,C,0.790033277345729
24 | 8,117173494,A,G,rs11558471,A,2.6136452141312
25 | 8,144285362,T,C,rs7828303,T,0.830068773578635
26 | 8,94672919,A,C,rs11786992,A,0.813994537199422
27 | 9,22134095,T,C,rs10811661,T,1.38544207775808
28 | 9,22137686,T,G,rs7018475,G,1.06669271435742
29 | 9,4291928,A,C,rs10974438,C,1.37497690065346
30 | 10,111288563,C,T,rs10885123,C,1.28455257339798
31 | 10,112989975,G,A,rs35011184,A,2.62472848176576
32 | 10,112999686,G,A,rs11196187,A,0.793197108006303
33 | 10,113064714,G,A,rs10885410,G,0.757624060468176
34 | 10,113101545,A,G,rs10885414,A,0.756106327158006
35 | 10,12265895,C,T,rs11257655,T,0.963638260283328
36 | 10,69340132,G,A,rs10159477,G,1.24198869315766
37 | 11,2671019,C,T,rs231360,T,0.741101217668221
38 | 11,2835964,A,C,rs2237895,C,0.856599374358303
39 | 11,45834447,C,T,rs6485644,C,0.893196006972234
40 | 11,72721940,G,A,rs11603334,G,1.39565082078472
41 | 12,132493708,C,T,rs11614914,T,0.895318119515759
42 | 12,4275678,T,G,rs76895963,T,0.8896763935819
43 | 12,97454449,C,T,rs113036477,C,1.05579805257086
44 | 13,32980164,G,A,rs576674,G,1.18368599542561
45 | 15,62099409,C,T,rs7163757,C,0.814269257625806
46 | 15,77489993,A,G,rs12910361,G,0.964617012002076
47 | 20,58812207,A,G,rs911300,G,0.791727162282264
48 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Bilirubin.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 2,233414615,A,G,rs838733,G,1.5238203831105
3 | 2,233660318,C,T,rs2602374,T,12.179891843664
4 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Cholesterol.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 5,75660016,G,A,rs253412,A,0.783887678173866
3 | 11,61798436,T,C,rs174541,T,0.73969606852391
4 | 19,19268740,C,T,rs58542926,T,1.8861528421618
5 | 19,44854120,T,C,rs4803764,T,2.88351258749252
6 | 19,44908684,T,C,rs429358,T,7.76835350626357
7 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Hyper Insulin.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,117544762,G,A,rs2767323,G,1.06850776010211
 3 | 1,147649198,G,A,rs79489938,G,0.758406347323169
 4 | 1,205075959,G,A,rs1572993,A,1.5781307777311
 5 | 2,164838388,C,T,rs355799,C,0.995210581126864
 6 | 2,25412902,A,G,rs34845373,A,1.39200791735617
 7 | 2,60328479,A,G,rs35824707,G,0.792662982698415
 8 | 2,60357684,G,A,rs243021,A,0.865499331610328
 9 | 2,60491212,C,T,rs7599488,T,0.988819438304368
10 | 2,65049318,T,C,rs1009358,T,1.22892931755776
11 | 3,12070747,C,T,rs12489177,T,0.812081073731205
12 | 3,141415727,C,T,rs73872717,C,0.822404304660683
13 | 3,15699882,T,C,rs4465929,T,0.888925538334548
14 | 3,36828739,A,G,rs11129735,A,1.45011873486622
15 | 3,49943163,C,T,rs4688760,T,1.05742855785914
16 | 3,71606894,C,T,rs844215,C,1.18706070735424
17 | 4,156731601,C,A,rs28819812,C,1.32965890684405
18 | 4,48892964,G,A,rs2289065,G,1.21424859452003
19 | 5,134529762,A,G,rs329124,G,1.00130765376286
20 | 5,54029680,A,G,rs31229,G,0.965661922747786
21 | 5,56503357,C,A,rs157843,A,1.02135333483549
22 | 5,56569218,C,T,rs66485956,T,0.946921308917374
23 | 5,68418419,A,G,rs4976033,G,1.04637295206295
24 | 5,88370824,G,T,rs2410763,T,1.1199061768283
25 | 6,39048860,C,T,rs10305420,C,0.963576281930092
26 | 6,39089153,T,C,rs9296291,T,0.964564282312171
27 | 7,150840547,G,A,rs62492368,A,0.956430787690121
28 | 7,23523649,C,T,rs76365198,C,1.02125747627562
29 | 7,2721116,C,A,rs798549,C,1.05903011044634
30 | 7,50514274,A,G,rs2876826,G,0.754799359198824
31 | 7,50960703,T,C,rs1018942,T,1.18560012451619
32 | 8,10342743,C,T,rs73195303,C,0.839306814640715
33 | 8,10928869,C,A,rs28566988,C,1.19498619603307
34 | 8,144285362,T,C,rs7828303,T,1.03031649213372
35 | 8,9324101,G,A,rs2126263,G,1.07378550456749
36 | 8,9404091,G,A,rs62493853,A,0.765689614863192
37 | 9,1039939,G,A,rs756145,A,0.852062118451503
38 | 9,94180777,G,A,rs10821311,A,0.758765388754933
39 | 10,63545492,C,T,rs12263348,T,1.33440212033686
40 | 10,68583018,C,T,rs10998304,C,1.11279715156381
41 | 10,73839369,A,G,rs2675662,A,0.88512430991802
42 | 11,32439327,C,T,rs7943101,T,0.882817814928975
43 | 11,65527328,C,T,rs1783541,T,0.758466325974499
44 | 11,76528106,C,T,rs2513523,T,0.73522821888135
45 | 12,117974568,G,A,rs34965774,A,1.00621468152664
46 | 12,123973455,C,A,rs12823740,C,1.2112050480976
47 | 12,95534337,C,A,rs11108094,A,1.03981494779862
48 | 13,41114265,A,G,rs4397977,A,0.786186211380992
49 | 13,58562303,C,T,rs4886092,C,0.744234357277956
50 | 14,102910984,G,A,rs10133111,A,1.27553182821105
51 | 14,29275326,A,G,rs8005994,A,0.959587336467102
52 | 15,39354022,A,G,rs17622532,A,0.997518161298793
53 | 15,52807109,T,C,rs75332279,C,1.0533509136327
54 | 16,81501185,T,C,rs2925979,T,0.827427027290875
55 | 16,81576875,G,A,rs11642655,A,0.858903664903455
56 | 16,918292,G,A,rs4984980,A,0.771560526620746
57 | 17,31310290,G,A,rs12602834,G,1.25445170819062
58 | 17,4149155,T,G,rs11652572,T,0.795253836224119
59 | 17,77390827,A,G,rs1656794,G,1.50510338691836
60 | 17,78765957,G,A,rs62075585,G,0.928655402338812
61 | 18,13566625,C,T,rs113780182,T,0.854306244187556
62 | 18,42507133,G,T,rs1431841,T,0.965322255476429
63 | 19,4949909,G,A,rs12977104,A,0.745338342172867
64 | 19,7903283,G,A,rs2115107,A,1.38635303732756
65 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Lipodystrophy 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,11239454,C,T,rs7544489,T,1.062952593457
 3 | 1,172391892,G,T,rs4916253,G,1.500246740117
 4 | 1,219568478,G,A,rs2820444,G,2.17849243941319
 5 | 1,39354638,C,T,rs61779275,T,0.905656530439659
 6 | 2,145590469,G,A,rs7609422,G,0.787122369319404
 7 | 2,164672366,C,T,rs13389219,C,2.85569882509556
 8 | 2,164838388,C,T,rs355799,C,0.959670396756692
 9 | 2,226087049,A,G,rs17231538,A,1.12833654721908
10 | 2,226236593,T,C,rs2972145,C,2.56106579833414
11 | 2,25312674,T,C,rs34048824,T,0.977629369910311
12 | 2,65434334,A,C,rs12185610,A,0.786637992723619
13 | 3,12252703,A,G,rs9826367,A,0.846334489463612
14 | 3,12288284,C,T,rs17036160,C,1.46090054738595
15 | 3,12443429,A,C,rs60710264,C,1.11936111015193
16 | 3,185159838,A,G,rs10937208,G,0.971056303335598
17 | 3,185701201,A,G,rs73175562,A,0.86637571501157
18 | 3,185803160,A,G,rs9854769,G,1.15644342672621
19 | 3,46948414,G,A,rs12491473,G,1.2111506687999
20 | 3,47652174,C,T,rs62262091,T,1.01388288647304
21 | 3,53089257,A,G,rs891368,G,0.898953307776491
22 | 3,64722438,C,T,rs4132228,C,1.74665906967227
23 | 4,156731601,C,A,rs28819812,C,1.28080873150431
24 | 4,88818328,C,T,rs13131633,T,1.91915702228461
25 | 4,88830707,C,T,rs7660000,C,1.04564316356407
26 | 5,158599736,C,T,rs890940,T,0.759547663080347
27 | 5,56566067,G,A,rs9687846,A,1.53921910750697
28 | 5,68418419,A,G,rs4976033,G,0.970214727491333
29 | 6,139515991,T,C,rs11155073,T,2.74724318371907
30 | 6,163711969,C,T,rs4709746,C,1.16406196348262
31 | 6,34279270,C,T,rs77136196,T,1.82551588591726
32 | 6,43790159,C,A,rs998584,A,3.28618195974912
33 | 7,130768623,A,G,rs1596972,G,0.805694198492982
34 | 7,28160478,C,T,rs1513272,C,0.802795869424777
35 | 8,19973410,C,T,rs10096633,C,1.33295639349242
36 | 10,63545492,C,T,rs12263348,T,0.875831654885228
37 | 11,64263769,C,T,rs35169799,T,2.10171892843801
38 | 11,65638129,G,T,rs2306363,G,1.12035933468448
39 | 12,121222984,T,C,rs25643,C,0.789629638410106
40 | 12,123008576,A,G,rs12820906,A,0.994715586903685
41 | 12,123863518,C,T,rs11057376,T,1.39159451808673
42 | 12,123973455,C,A,rs12823740,C,2.9872520617637
43 | 12,124025844,C,T,rs10773051,C,2.3417345784784
44 | 12,26138216,C,T,rs56008051,C,0.907557365091295
45 | 12,26312652,C,T,rs11048458,T,2.20455814061685
46 | 12,65965972,C,A,rs8756,A,0.83117843176366
47 | 12,71129263,C,A,rs1705263,C,0.808572961413311
48 | 15,39354022,A,G,rs17622532,A,0.74772412052731
49 | 15,40088651,G,A,rs2242186,A,0.883526716548812
50 | 16,53395261,C,T,rs2908797,C,0.832470382095215
51 | 16,81501185,T,C,rs2925979,T,2.09487020762773
52 | 16,81576875,G,A,rs11642655,A,0.775114585172354
53 | 17,17807956,A,G,rs4925114,A,1.14223858165401
54 | 17,7646363,C,T,rs1641523,C,0.740951629444799
55 | 18,63178651,T,C,rs12454712,T,0.811591683451073
56 | 19,7235135,A,G,rs17175860,G,0.754031453229013
57 | 19,8364439,G,A,rs116843064,G,0.83176142135224
58 | 20,34008898,G,A,rs2268078,A,0.886389370907059
59 | 20,52383088,T,C,rs2426439,C,0.87250292231854
60 | 20,64080106,C,T,rs8126001,C,0.837788372743015
61 | 22,38204535,C,T,rs2267373,T,1.23484184455549
62 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Lipodystrophy 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,26958422,C,T,rs79598313,T,0.888015136432011
 3 | 2,164672366,C,T,rs13389219,C,1.14063104078903
 4 | 2,226236593,T,C,rs2972145,C,1.89015160924628
 5 | 2,65049318,T,C,rs1009358,T,0.862733719467642
 6 | 3,12252703,A,G,rs9826367,A,0.92830771830743
 7 | 3,12288284,C,T,rs17036160,C,2.02911253590886
 8 | 3,135906656,C,T,rs9852406,T,1.22995669281764
 9 | 5,53976834,G,A,rs4865796,A,0.929463278915424
10 | 5,56503357,C,A,rs157843,A,1.51438235945776
11 | 5,68418419,A,G,rs4976033,G,0.927069934053808
12 | 7,130768623,A,G,rs1596972,G,0.998770029334821
13 | 8,10138879,A,G,rs34990153,A,0.789724773751201
14 | 8,19973410,C,T,rs10096633,C,1.47334889996029
15 | 8,8342448,G,A,rs62496027,A,0.749405889748518
16 | 8,9324101,G,A,rs2126263,G,1.17044882791933
17 | 9,1039939,G,A,rs756145,A,0.78001272191752
18 | 10,100152307,T,C,rs2862954,T,2.45036397686714
19 | 11,47870031,T,C,rs11604324,C,0.824748485040415
20 | 15,60646617,C,A,rs8033609,A,0.751884212448393
21 | 18,63178651,T,C,rs12454712,T,0.845392522961538
22 | 19,19268740,C,T,rs58542926,T,1.82222860052993
23 | 19,33444196,C,T,rs2287821,C,0.745393407987891
24 | 19,7903283,G,A,rs2115107,A,0.813833592469361
25 | 20,33720469,C,T,rs67611724,T,0.775716633669226
26 | 20,44413724,C,T,rs1800961,T,1.11008238172632
27 | 20,46966072,G,A,rs6066138,G,0.91985248758557
28 | 22,43928975,G,A,rs3747207,A,3.84592717411791
29 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Liver-Lipid.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,39354638,C,T,rs61779275,T,0.745213024949263
 3 | 2,27508073,T,C,rs1260326,C,7.41431942024394
 4 | 6,126470949,A,G,rs11759026,G,0.782090004684815
 5 | 8,115627247,T,C,rs2737226,T,0.746153081743436
 6 | 8,9324101,G,A,rs2126263,G,2.39961390738293
 7 | 11,61798436,T,C,rs174541,T,1.50296407059363
 8 | 12,120979061,C,T,rs1800574,T,0.913991554177009
 9 | 12,121017786,G,A,rs56158042,G,1.35618944275262
10 | 20,44413724,C,T,rs1800961,T,1.13321439663169
11 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Obesity.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,111749681,T,C,rs197379,C,0.906686937859058
 3 | 1,177904706,G,A,rs490689,A,1.7232856957805
 4 | 1,201794371,C,T,rs567185,T,0.836071559295798
 5 | 1,62114219,G,T,rs12140153,G,0.806830055368476
 6 | 1,72374091,C,A,rs2613503,A,1.04128134326271
 7 | 2,160289101,A,G,rs6432613,G,0.862847696815228
 8 | 2,174332817,C,A,rs12992995,C,0.83215696271914
 9 | 2,180753927,A,G,rs6741676,A,1.11660443016913
10 | 2,228108944,A,G,rs7561798,G,0.8976388932199
11 | 2,422144,T,C,rs62107261,T,1.98298069078581
12 | 2,58706456,C,T,rs1861410,C,1.43799570093576
13 | 2,59084401,G,A,rs980183,G,1.14525943633913
14 | 2,653874,C,T,rs10188334,C,1.78172903748151
15 | 3,131926093,A,G,rs1225052,G,0.94314067454209
16 | 3,15699882,T,C,rs4465929,T,0.966285047478682
17 | 3,173401978,A,G,rs686998,G,1.01381352259294
18 | 3,35626269,A,G,rs10490871,G,1.15278867636112
19 | 3,47652174,C,T,rs62262091,T,0.851811627881173
20 | 3,49943163,C,T,rs4688760,T,1.26999868676804
21 | 3,89937130,T,C,rs11716527,C,0.804234986757685
22 | 3,9472332,G,A,rs3872707,A,0.780789589036755
23 | 4,136162038,A,C,rs1296328,A,1.05797603780357
24 | 4,139958775,G,T,rs2604918,G,0.859608018407233
25 | 4,45184122,G,A,rs10938398,A,1.69150857286116
26 | 4,90322714,G,A,rs7656001,A,0.832106478481
27 | 5,75660016,G,A,rs253412,A,1.15329123782529
28 | 5,79141082,C,A,rs12519500,C,0.877663146272336
29 | 5,88370824,G,T,rs2410763,T,0.817225490295907
30 | 6,125731213,A,G,rs2008027,G,0.77606784551336
31 | 6,40441504,T,C,rs34298980,T,1.05321873125
32 | 6,50821065,A,C,rs3798519,C,1.3835338791592
33 | 7,30688836,C,T,rs917195,C,0.778750849720167
34 | 7,50514274,A,G,rs2876826,G,0.78236721159589
35 | 7,70320618,G,A,rs1880368,A,0.763004684876532
36 | 8,115627247,T,C,rs2737226,T,0.866919137696046
37 | 8,115876242,A,C,rs1569339,C,0.789808624557141
38 | 8,30996517,A,G,rs2725371,A,1.17904485455876
39 | 9,123824284,C,A,rs1752169,A,1.02267196970459
40 | 9,28410685,T,C,rs1412234,C,1.28360402194409
41 | 10,33729802,T,C,rs36051838,C,0.834743288007079
42 | 10,75887349,G,A,rs7099048,A,0.880274244123929
43 | 11,43855148,C,T,rs11555762,T,1.24429723286945
44 | 11,47659818,T,G,rs1056387,T,0.866516283324162
45 | 11,65527328,C,T,rs1783541,T,0.78347857436091
46 | 11,8632981,T,C,rs10769936,C,0.916255412126923
47 | 12,41444433,T,C,rs2733289,C,0.894063867574765
48 | 12,49869365,G,A,rs7132908,A,1.59394719860947
49 | 13,30468315,G,T,rs11842871,G,1.02320162169402
50 | 13,58082465,T,C,rs9563574,T,1.0218543205169
51 | 14,102910984,G,A,rs10133111,A,1.0774972017038
52 | 14,103393972,T,G,rs12890750,G,0.926623481724374
53 | 14,32833676,G,T,rs17522122,T,0.951612312983814
54 | 14,46844338,G,A,rs2933211,A,0.814660384231547
55 | 14,69059590,C,T,rs4899280,T,0.904985376354747
56 | 14,79318962,A,G,rs10498536,A,0.825727843742782
57 | 14,79473182,C,T,rs10145154,T,1.24531115706448
58 | 16,15059860,T,C,rs9927842,T,0.748716106908275
59 | 16,28906323,T,C,rs7188071,T,1.67962453569743
60 | 16,29946895,G,A,rs8054556,A,1.31040484740787
61 | 16,53767042,T,C,rs1421085,C,4.06996774228195
62 | 16,53794125,C,T,rs73612051,C,1.01961505829021
63 | 16,53832063,C,T,rs9302652,C,1.12501007204517
64 | 16,69534400,G,T,rs2032912,G,1.34594325721168
65 | 17,36486677,A,G,rs4796224,G,0.891280448890062
66 | 17,48982960,C,A,rs35895680,C,0.760610664245459
67 | 17,67853811,T,G,rs2046323,G,0.908559679700781
68 | 18,23503774,C,T,rs303760,T,1.11100743455543
69 | 18,60178844,C,T,rs663640,T,2.12136439115411
70 | 18,60383735,C,T,rs79688165,C,1.5795552438972
71 | 19,18723704,C,T,rs10404726,C,1.16284431655652
72 | 19,44908684,T,C,rs429358,T,1.00539943715663
73 | 19,47076928,A,G,rs11667244,G,0.877008931378147
74 | 20,52383088,T,C,rs2426439,C,0.967245182676206
75 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Proinsulin.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,117605762,G,A,rs41276588,A,1.20856189101032
 3 | 1,6612669,A,C,rs11583755,C,1.05467736031182
 4 | 1,72374091,C,A,rs2613503,A,0.817293538109438
 5 | 2,157478828,G,A,rs7568172,G,0.77594787045051306
 6 | 2,212971253,A,G,rs17354348,A,0.732103309684394
 7 | 2,262553,T,C,rs7596678,T,1.011889366208
 8 | 2,43202667,G,A,rs10174764,A,0.91526286448804
 9 | 2,58706456,C,T,rs1861410,C,1.15013355887749
10 | 3,186947857,C,T,rs3887925,T,1.34817619568173
11 | 3,35626269,A,G,rs10490871,G,1.22625669641648
12 | 4,184792454,G,A,rs72695645,G,0.749231424200112
13 | 5,103250706,G,A,rs75432112,A,1.34673511603681
14 | 5,75660016,G,A,rs253412,A,0.785428511797446
15 | 6,20679478,A,G,rs7756992,G,1.05783101135753
16 | 6,20736006,C,A,rs9358361,A,0.979907797810997
17 | 6,43843237,G,A,rs9369425,G,1.43699979087686
18 | 7,157147880,A,G,rs7798816,A,1.04550740766286
19 | 8,10138879,A,G,rs34990153,A,0.779160010387261
20 | 8,41643342,C,T,rs59191643,T,1.9137822511338
21 | 8,41651058,C,A,rs13262861,C,0.752403657263152
22 | 9,133269828,C,T,rs495203,T,0.979071076171371
23 | 9,136355594,T,C,rs78503878,C,0.861585019257345
24 | 9,19080354,A,G,rs10963942,G,0.788975886826281
25 | 10,112989259,G,A,rs11196181,G,1.32323478820699
26 | 10,112989975,G,A,rs35011184,A,0.909537760703753
27 | 10,112999686,G,A,rs11196187,A,0.977230036516327
28 | 10,12265895,C,T,rs11257655,T,1.4133021037223
29 | 10,70888579,C,T,rs827237,T,0.762494267706961
30 | 10,75887349,G,A,rs7099048,A,0.999874939413022
31 | 10,92196795,T,G,rs7071943,G,0.990836509897838
32 | 10,92703125,C,T,rs1111875,C,1.34288155106174
33 | 10,97299888,C,A,rs10882891,C,0.768376635725208
34 | 11,17388025,T,C,rs5219,T,1.06653488409401
35 | 11,2835964,A,C,rs2237895,C,1.2430368002492
36 | 11,45834447,C,T,rs6485644,C,1.24648413352621
37 | 11,69234580,C,T,rs55974245,C,0.878253973987723
38 | 11,72721940,G,A,rs11603334,G,1.20194371183669
39 | 11,93280221,T,G,rs7943372,G,0.82025363003658
40 | 12,121017786,G,A,rs56158042,G,1.34790357076606
41 | 12,4275530,C,T,rs3217792,C,1.36302549295646
42 | 13,32980164,G,A,rs576674,G,1.3948446847927
43 | 13,50519978,T,G,rs9316500,T,1.01955253387866
44 | 13,91302938,G,A,rs9523299,G,0.734808662675904
45 | 14,76834520,C,T,rs2056857,C,0.821735190803588
46 | 15,89836982,C,A,rs8031576,C,0.978977844759263
47 | 16,53767042,T,C,rs1421085,C,0.7641299913021
48 | 17,42544897,C,T,rs684214,T,0.872152701713213
49 | 17,48982960,C,A,rs35895680,C,0.956498495420977
50 | 20,44366172,T,C,rs12625671,C,0.795998517724989
51 | 20,63691346,A,G,rs6011033,G,0.733908926452423
52 | 20,63839432,G,A,rs4809369,G,0.801811875421717
53 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/SHBG-LpA.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 1,26958422,C,T,rs79598313,T,1.76988798874768
3 | 6,160354985,C,A,rs543159,C,2.58672811443119
4 | 10,63545492,C,T,rs12263348,T,2.20027042335222
5 | 12,120979061,C,T,rs1800574,T,0.765632283390125
6 | 12,121017786,G,A,rs56158042,G,0.748315576819208
7 | 12,4275678,T,G,rs76895963,T,1.04077446531587
8 | 17,7646363,C,T,rs1641523,C,5.63473967075984
9 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/Total_GRS.csv:
--------------------------------------------------------------------------------
  1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
  2 | 1,111749681,T,C,rs197379,C,0.0269
  3 | 1,11239454,C,T,rs7544489,T,0.027
  4 | 1,117544762,G,A,rs2767323,G,0.0339
  5 | 1,117605762,G,A,rs41276588,A,0.0375
  6 | 1,147649198,G,A,rs79489938,G,0.1131
  7 | 1,155299985,A,G,rs3020781,G,0.0333
  8 | 1,172391892,G,T,rs4916253,G,0.0274
  9 | 1,177904706,G,A,rs490689,A,0.0335
 10 | 1,201794371,C,T,rs567185,T,0.037
 11 | 1,20404624,T,C,rs10916785,T,0.0271
 12 | 1,205075959,G,A,rs1572993,A,0.031
 13 | 1,213985913,T,C,rs340874,C,0.0668
 14 | 1,219568478,G,A,rs2820444,G,0.0472
 15 | 1,229537208,G,A,rs348330,G,0.0525
 16 | 1,26156065,T,C,rs10794522,C,0.0261
 17 | 1,26958422,C,T,rs79598313,T,0.084
 18 | 1,39354638,C,T,rs61779275,T,0.0753
 19 | 1,50972693,C,T,rs3176466,C,0.0642
 20 | 1,62114219,G,T,rs12140153,G,0.0461
 21 | 1,6612669,A,C,rs11583755,C,0.0387
 22 | 1,72374091,C,A,rs2613503,A,0.0392
 23 | 2,120568332,G,A,rs9784137,G,0.0608
 24 | 2,144503607,A,C,rs2033159,C,0.0352
 25 | 2,145590469,G,A,rs7609422,G,0.0295
 26 | 2,157478828,G,A,rs7568172,G,0.0674
 27 | 2,160289101,A,G,rs6432613,G,0.0393
 28 | 2,164672366,C,T,rs13389219,C,0.0648
 29 | 2,164838388,C,T,rs355799,C,0.0439
 30 | 2,174332817,C,A,rs12992995,C,0.0312
 31 | 2,180753927,A,G,rs6741676,A,0.0324
 32 | 2,18540396,G,A,rs7558413,A,0.0286
 33 | 2,212971253,A,G,rs17354348,A,0.0294
 34 | 2,226087049,A,G,rs17231538,A,0.076
 35 | 2,226236593,T,C,rs2972145,C,0.09
 36 | 2,226340889,A,G,rs12694695,A,0.0291
 37 | 2,228108944,A,G,rs7561798,G,0.0281
 38 | 2,233414615,A,G,rs838733,G,0.0289
 39 | 2,233660318,C,T,rs2602374,T,0.0291
 40 | 2,25312674,T,C,rs34048824,T,0.0323
 41 | 2,25412902,A,G,rs34845373,A,0.0372
 42 | 2,262553,T,C,rs7596678,T,0.0271
 43 | 2,27508073,T,C,rs1260326,C,0.0638
 44 | 2,422144,T,C,rs62107261,T,0.0749
 45 | 2,43202667,G,A,rs10174764,A,0.0392
 46 | 2,43444037,C,T,rs13414140,C,0.1183
 47 | 2,58706456,C,T,rs1861410,C,0.0353
 48 | 2,59084401,G,A,rs980183,G,0.0363
 49 | 2,60328479,A,G,rs35824707,G,0.0685
 50 | 2,60357684,G,A,rs243021,A,0.054
 51 | 2,60491212,C,T,rs7599488,T,0.0255
 52 | 2,65049318,T,C,rs1009358,T,0.0493
 53 | 2,653874,C,T,rs10188334,C,0.0503
 54 | 2,65434334,A,C,rs12185610,A,0.0451
 55 | 2,67651196,T,C,rs1430780,T,0.0273
 56 | 3,12070747,C,T,rs12489177,T,0.0745
 57 | 3,12252703,A,G,rs9826367,A,0.0402
 58 | 3,12288284,C,T,rs17036160,C,0.1028
 59 | 3,123346931,A,G,rs11708067,A,0.0782
 60 | 3,12443429,A,C,rs60710264,C,0.0374
 61 | 3,125202613,C,T,rs9873519,T,0.0375
 62 | 3,131926093,A,G,rs1225052,G,0.0271
 63 | 3,135906656,C,T,rs9852406,T,0.0375
 64 | 3,141415727,C,T,rs73872717,C,0.0856
 65 | 3,152675088,T,G,rs9828639,T,0.0204
 66 | 3,15699882,T,C,rs4465929,T,0.0304
 67 | 3,171015287,G,A,rs9873618,G,0.0581
 68 | 3,173401978,A,G,rs686998,G,0.0267
 69 | 3,185159838,A,G,rs10937208,G,0.0443
 70 | 3,185701201,A,G,rs73175562,A,0.0531
 71 | 3,185803160,A,G,rs9854769,G,0.1075
 72 | 3,186927836,C,T,rs13095782,T,0.0285
 73 | 3,186947857,C,T,rs3887925,T,0.0424
 74 | 3,188024054,A,G,rs6777684,G,0.0569
 75 | 3,196083316,T,C,rs7619708,T,0.0334
 76 | 3,23414091,T,C,rs35352848,T,0.0623
 77 | 3,35626269,A,G,rs10490871,G,0.0272
 78 | 3,36828739,A,G,rs11129735,A,0.026
 79 | 3,46948414,G,A,rs12491473,G,0.0215
 80 | 3,47652174,C,T,rs62262091,T,0.0557
 81 | 3,49943163,C,T,rs4688760,T,0.034
 82 | 3,53089257,A,G,rs891368,G,0.0247
 83 | 3,64722438,C,T,rs4132228,C,0.0471
 84 | 3,71606894,C,T,rs844215,C,0.0262
 85 | 3,89937130,T,C,rs11716527,C,0.0492
 86 | 3,9472332,G,A,rs3872707,A,0.0446
 87 | 4,105127134,T,C,rs17035289,C,0.0428
 88 | 4,136162038,A,C,rs1296328,A,0.0251
 89 | 4,139958775,G,T,rs2604918,G,0.0313
 90 | 4,152583143,C,T,rs6819331,C,0.0404
 91 | 4,156731601,C,A,rs28819812,C,0.0384
 92 | 4,18023861,G,A,rs2011603,A,0.0383
 93 | 4,184792454,G,A,rs72695645,G,0.0611
 94 | 4,20209330,A,G,rs11940813,G,0.0371
 95 | 4,45184122,G,A,rs10938398,A,0.0425
 96 | 4,48892964,G,A,rs2289065,G,0.0272
 97 | 4,6291188,A,G,rs10010131,G,0.0816
 98 | 4,751184,G,T,rs1531583,T,0.0994
 99 | 4,88818328,C,T,rs13131633,T,0.0231
100 | 4,88830707,C,T,rs7660000,C,0.0309
101 | 4,90322714,G,A,rs7656001,A,0.026
102 | 5,103250706,G,A,rs75432112,A,0.1344
103 | 5,134529762,A,G,rs329124,G,0.0259
104 | 5,14788235,T,C,rs31931,T,0.038
105 | 5,158599736,C,T,rs890940,T,0.0484
106 | 5,53478680,G,A,rs62370480,A,0.0335
107 | 5,53976834,G,A,rs4865796,A,0.0466
108 | 5,54029680,A,G,rs31229,G,0.0391
109 | 5,56503357,C,A,rs157843,A,0.0688
110 | 5,56566067,G,A,rs9687846,A,0.0659
111 | 5,56569218,C,T,rs66485956,T,0.0385
112 | 5,68418419,A,G,rs4976033,G,0.028
113 | 5,75660016,G,A,rs253412,A,0.0459
114 | 5,77131486,G,A,rs6878122,G,0.0554
115 | 5,79141082,C,A,rs12519500,C,0.038
116 | 5,88370824,G,T,rs2410763,T,0.031
117 | 6,125731213,A,G,rs2008027,G,0.0272
118 | 6,126470949,A,G,rs11759026,G,0.0646
119 | 6,127093693,A,G,rs719727,A,0.056
120 | 6,139515991,T,C,rs11155073,T,0.0304
121 | 6,153106967,A,G,rs9383649,G,0.0329
122 | 6,160354985,C,A,rs543159,C,0.0324
123 | 6,163711969,C,T,rs4709746,C,0.0581
124 | 6,20679478,A,G,rs7756992,G,0.1217
125 | 6,20736006,C,A,rs9358361,A,0.043
126 | 6,21008508,G,A,rs12192642,G,0.0545
127 | 6,21079126,A,G,rs11967298,A,0.0378
128 | 6,34279270,C,T,rs77136196,T,0.0662
129 | 6,39048860,C,T,rs10305420,C,0.0316
130 | 6,39089153,T,C,rs9296291,T,0.0325
131 | 6,40441504,T,C,rs34298980,T,0.0381
132 | 6,41044666,A,G,rs4714422,G,0.0293
133 | 6,43790159,C,A,rs998584,A,0.0365
134 | 6,43834141,C,T,rs2894536,C,0.0384
135 | 6,43843237,G,A,rs9369425,G,0.0384
136 | 6,50821065,A,C,rs3798519,C,0.0495
137 | 6,7084624,G,T,rs4959424,G,0.0343
138 | 6,7231610,G,A,rs9379084,G,0.0747
139 | 6,7275708,T,C,rs11243150,T,0.0447
140 | 7,100715101,A,G,rs534043,G,0.0447
141 | 7,102849009,A,G,rs56269620,A,0.0316
142 | 7,130768623,A,G,rs1596972,G,0.0403
143 | 7,14858657,C,T,rs17168486,T,0.069
144 | 7,15024684,G,T,rs2191349,T,0.0661
145 | 7,150840547,G,A,rs62492368,A,0.0342
146 | 7,157147880,A,G,rs7798816,A,0.0489
147 | 7,15886603,T,C,rs38221,T,0.033
148 | 7,23523649,C,T,rs76365198,C,0.0496
149 | 7,2721116,C,A,rs798549,C,0.0295
150 | 7,28160478,C,T,rs1513272,C,0.0811
151 | 7,30688836,C,T,rs917195,C,0.0467
152 | 7,40777054,C,T,rs17439448,T,0.0404
153 | 7,44161285,T,C,rs2041547,C,0.0385
154 | 7,44195138,C,T,rs2908286,T,0.0683
155 | 7,44325950,C,T,rs116913033,C,0.0408
156 | 7,4643627,G,A,rs62450857,A,0.0388
157 | 7,50514274,A,G,rs2876826,G,0.0306
158 | 7,50960703,T,C,rs1018942,T,0.0711
159 | 7,70320618,G,A,rs1880368,A,0.0311
160 | 7,74693803,G,A,rs67755137,A,0.0333
161 | 8,10138879,A,G,rs34990153,A,0.0384
162 | 8,10342743,C,T,rs73195303,C,0.0314
163 | 8,10928869,C,A,rs28566988,C,0.0392
164 | 8,115627247,T,C,rs2737226,T,0.0383
165 | 8,115876242,A,C,rs1569339,C,0.0347
166 | 8,117173494,A,G,rs11558471,A,0.1031
167 | 8,14267300,A,C,rs17294565,C,0.0274
168 | 8,144285362,T,C,rs7828303,T,0.0427
169 | 8,19973410,C,T,rs10096633,C,0.0404
170 | 8,26015118,A,G,rs17818197,G,0.0346
171 | 8,26031751,C,T,rs7834679,C,0.0306
172 | 8,30996517,A,G,rs2725371,A,0.0371
173 | 8,41643342,C,T,rs59191643,T,0.0444
174 | 8,41651058,C,A,rs13262861,C,0.1016
175 | 8,41672271,C,T,rs80105613,T,0.0641
176 | 8,8342448,G,A,rs62496027,A,0.0276
177 | 8,9324101,G,A,rs2126263,G,0.0608
178 | 8,9404091,G,A,rs62493853,A,0.0563
179 | 8,94672919,A,C,rs11786992,A,0.0319
180 | 8,94919116,C,T,rs75912292,T,0.0783
181 | 9,1039939,G,A,rs756145,A,0.0286
182 | 9,114181077,A,G,rs1431819,G,0.0293
183 | 9,123351733,T,C,rs4838049,C,0.027
184 | 9,123824284,C,A,rs1752169,A,0.0317
185 | 9,133269828,C,T,rs495203,T,0.0493
186 | 9,136355594,T,C,rs78503878,C,0.0614
187 | 9,19080354,A,G,rs10963942,G,0.0371
188 | 9,22003368,G,A,rs1063192,A,0.0434
189 | 9,22134095,T,C,rs10811661,T,0.1379
190 | 9,22137686,T,G,rs7018475,G,0.1094
191 | 9,28410685,T,C,rs1412234,C,0.044
192 | 9,4291928,A,C,rs10974438,C,0.047
193 | 9,94180777,G,A,rs10821311,A,0.0358
194 | 10,100152307,T,C,rs2862954,T,0.0291
195 | 10,111288563,C,T,rs10885123,C,0.0316
196 | 10,112989259,G,A,rs11196181,G,0.1381
197 | 10,112989975,G,A,rs35011184,A,0.2608
198 | 10,112999686,G,A,rs11196187,A,0.1943
199 | 10,113064714,G,A,rs10885410,G,0.0895
200 | 10,113101545,A,G,rs10885414,A,0.0787
201 | 10,122433665,T,G,rs2280141,T,0.0453
202 | 10,12265895,C,T,rs11257655,T,0.0907
203 | 10,33729802,T,C,rs36051838,C,0.044
204 | 10,63545492,C,T,rs12263348,T,0.0282
205 | 10,68583018,C,T,rs10998304,C,0.0306
206 | 10,69340132,G,A,rs10159477,G,0.0408
207 | 10,69692529,T,C,rs2812533,T,0.0383
208 | 10,70888579,C,T,rs827237,T,0.0365
209 | 10,73839369,A,G,rs2675662,A,0.0267
210 | 10,75887349,G,A,rs7099048,A,0.0282
211 | 10,92105487,C,T,rs72807217,T,0.0478
212 | 10,92196795,T,G,rs7071943,G,0.0443
213 | 10,92703125,C,T,rs1111875,C,0.0923
214 | 10,92740354,G,A,rs11187152,G,0.0537
215 | 10,97299888,C,A,rs10882891,C,0.0313
216 | 11,1675619,C,T,rs2334499,T,0.0283
217 | 11,17388025,T,C,rs5219,T,0.0685
218 | 11,2080053,G,A,rs11600952,A,0.0356
219 | 11,2176056,A,G,rs4929965,A,0.0619
220 | 11,2671019,C,T,rs231360,T,0.0506
221 | 11,2835964,A,C,rs2237895,C,0.0725
222 | 11,28513351,G,A,rs4923543,A,0.0238
223 | 11,2887735,A,G,rs450563,G,0.0341
224 | 11,32439327,C,T,rs7943101,T,0.0358
225 | 11,43855148,C,T,rs11555762,T,0.0407
226 | 11,45834447,C,T,rs6485644,C,0.0294
227 | 11,47659818,T,G,rs1056387,T,0.0352
228 | 11,47870031,T,C,rs11604324,C,0.0736
229 | 11,61798436,T,C,rs174541,T,0.0292
230 | 11,64263769,C,T,rs35169799,T,0.0504
231 | 11,65527328,C,T,rs1783541,T,0.0484
232 | 11,65638129,G,T,rs2306363,G,0.0438
233 | 11,69234580,C,T,rs55974245,C,0.037
234 | 11,72721940,G,A,rs11603334,G,0.0899
235 | 11,76528106,C,T,rs2513523,T,0.0255
236 | 11,8632981,T,C,rs10769936,C,0.0349
237 | 11,92983658,G,A,rs11020132,A,0.1424
238 | 11,93280221,T,G,rs7943372,G,0.0382
239 | 12,108224853,C,T,rs3764002,C,0.0396
240 | 12,117974568,G,A,rs34965774,A,0.0521
241 | 12,120979061,C,T,rs1800574,T,0.147
242 | 12,121017786,G,A,rs56158042,G,0.0566
243 | 12,121222984,T,C,rs25643,C,0.0248
244 | 12,121483887,C,T,rs7977709,C,0.0261
245 | 12,123008576,A,G,rs12820906,A,0.0433
246 | 12,123863518,C,T,rs11057376,T,0.0394
247 | 12,123973455,C,A,rs12823740,C,0.041
248 | 12,124025844,C,T,rs10773051,C,0.035
249 | 12,12718165,T,G,rs2066827,G,0.0351
250 | 12,132493708,C,T,rs11614914,T,0.0389
251 | 12,21628312,G,T,rs10841868,G,0.0315
252 | 12,26138216,C,T,rs56008051,C,0.032
253 | 12,26312652,C,T,rs11048458,T,0.0458
254 | 12,41444433,T,C,rs2733289,C,0.0301
255 | 12,4265207,A,G,rs11063069,G,0.0458
256 | 12,4275530,C,T,rs3217792,C,0.1108
257 | 12,4275678,T,G,rs76895963,T,0.4388
258 | 12,48342520,C,A,rs2732480,C,0.0337
259 | 12,49869365,G,A,rs7132908,A,0.033
260 | 12,6572620,A,G,rs67013744,G,0.035
261 | 12,65965972,C,A,rs8756,A,0.0432
262 | 12,71129263,C,A,rs1705263,C,0.0398
263 | 12,95534337,C,A,rs11108094,A,0.0601
264 | 12,97454449,C,T,rs113036477,C,0.0715
265 | 13,30468315,G,T,rs11842871,G,0.0299
266 | 13,32980164,G,A,rs576674,G,0.0612
267 | 13,41114265,A,G,rs4397977,A,0.0291
268 | 13,50519978,T,G,rs9316500,T,0.0473
269 | 13,58082465,T,C,rs9563574,T,0.0406
270 | 13,58562303,C,T,rs4886092,C,0.0198
271 | 13,80143021,G,A,rs1359790,G,0.0795
272 | 13,91302938,G,A,rs9523299,G,0.0418
273 | 14,102910984,G,A,rs10133111,A,0.0341
274 | 14,103393972,T,G,rs12890750,G,0.0284
275 | 14,22819744,A,G,rs17122776,G,0.029
276 | 14,29275326,A,G,rs8005994,A,0.0265
277 | 14,32833676,G,T,rs17522122,T,0.0343
278 | 14,46844338,G,A,rs2933211,A,0.0268
279 | 14,69059590,C,T,rs4899280,T,0.0276
280 | 14,76834520,C,T,rs2056857,C,0.0261
281 | 14,79318962,A,G,rs10498536,A,0.043
282 | 14,79473182,C,T,rs10145154,T,0.0546
283 | 15,39354022,A,G,rs17622532,A,0.026
284 | 15,40088651,G,A,rs2242186,A,0.0643
285 | 15,40106553,T,C,rs484943,T,0.0332
286 | 15,52807109,T,C,rs75332279,C,0.0559
287 | 15,60646617,C,A,rs8033609,A,0.0265
288 | 15,62099409,C,T,rs7163757,C,0.0405
289 | 15,75522047,C,T,rs6495182,C,0.0406
290 | 15,77489993,A,G,rs12910361,G,0.0716
291 | 15,89836982,C,A,rs8031576,C,0.057
292 | 15,90968837,G,A,rs2290203,A,0.0557
293 | 16,15059860,T,C,rs9927842,T,0.0375
294 | 16,250389,T,C,rs55857387,T,0.0524
295 | 16,28906323,T,C,rs7188071,T,0.0292
296 | 16,29946895,G,A,rs8054556,A,0.0364
297 | 16,53395261,C,T,rs2908797,C,0.0363
298 | 16,53767042,T,C,rs1421085,C,0.1177
299 | 16,53794125,C,T,rs73612051,C,0.0565
300 | 16,53832063,C,T,rs9302652,C,0.044
301 | 16,69534400,G,T,rs2032912,G,0.0421
302 | 16,81501185,T,C,rs2925979,T,0.0448
303 | 16,81576875,G,A,rs11642655,A,0.0288
304 | 16,918292,G,A,rs4984980,A,0.0351
305 | 17,17807956,A,G,rs4925114,A,0.033
306 | 17,31310290,G,A,rs12602834,G,0.0289
307 | 17,36486677,A,G,rs4796224,G,0.0252
308 | 17,37703678,G,A,rs2189301,G,0.0462
309 | 17,37740776,A,G,rs11657964,A,0.0589
310 | 17,4149155,T,G,rs11652572,T,0.1151
311 | 17,42544897,C,T,rs684214,T,0.0424
312 | 17,48046280,A,C,rs9895554,A,0.0549
313 | 17,48982960,C,A,rs35895680,C,0.0559
314 | 17,578364,C,T,rs11870735,T,0.0339
315 | 17,67853811,T,G,rs2046323,G,0.0484
316 | 17,7646363,C,T,rs1641523,C,0.0255
317 | 17,77390827,A,G,rs1656794,G,0.0311
318 | 17,78765957,G,A,rs62075585,G,0.0295
319 | 17,9884641,G,A,rs7219033,A,0.0288
320 | 18,13566625,C,T,rs113780182,T,0.0309
321 | 18,23503774,C,T,rs303760,T,0.0342
322 | 18,42507133,G,T,rs1431841,T,0.0309
323 | 18,55383415,A,C,rs72926932,C,0.0749
324 | 18,57013486,G,A,rs12969494,G,0.0292
325 | 18,60178844,C,T,rs663640,T,0.0503
326 | 18,60383735,C,T,rs79688165,C,0.1216
327 | 18,63178651,T,C,rs12454712,T,0.0412
328 | 19,18723704,C,T,rs10404726,C,0.0277
329 | 19,19268740,C,T,rs58542926,T,0.089
330 | 19,33401503,C,T,rs889138,C,0.0312
331 | 19,33444196,C,T,rs2287821,C,0.0239
332 | 19,44854120,T,C,rs4803764,T,0.0342
333 | 19,44908684,T,C,rs429358,T,0.073
334 | 19,45653979,G,A,rs10407429,G,0.053900000000000003
335 | 19,47076928,A,G,rs11667244,G,0.0354
336 | 19,4949909,G,A,rs12977104,A,0.0412
337 | 19,7235135,A,G,rs17175860,G,0.0445
338 | 19,7903283,G,A,rs2115107,A,0.0382
339 | 19,8364439,G,A,rs116843064,G,0.0965
340 | 20,33720469,C,T,rs67611724,T,0.0443
341 | 20,34008898,G,A,rs2268078,A,0.0387
342 | 20,41203988,T,C,rs17265513,C,0.0325
343 | 20,44366172,T,C,rs12625671,C,0.0653
344 | 20,44413724,C,T,rs1800961,T,0.1077
345 | 20,46966072,G,A,rs6066138,G,0.0451
346 | 20,52383088,T,C,rs2426439,C,0.0365
347 | 20,58812207,A,G,rs911300,G,0.0348
348 | 20,62649310,C,T,rs2427363,C,0.027
349 | 20,63691346,A,G,rs6011033,G,0.0338
350 | 20,63839432,G,A,rs4809369,G,0.0337
351 | 20,64080106,C,T,rs8126001,C,0.0281
352 | 22,38204535,C,T,rs2267373,T,0.0284
353 | 22,43928975,G,A,rs3747207,A,0.0471
354 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/EUR/hg38_score_info/VAT Neg.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 2,144503607,A,C,rs2033159,C,0.807589704525295
 3 | 2,226340889,A,G,rs12694695,A,1.05866105098753
 4 | 3,12288284,C,T,rs17036160,C,0.969348804976055
 5 | 3,171015287,G,A,rs9873618,G,0.830976371594637
 6 | 4,152583143,C,T,rs6819331,C,0.821925217224353
 7 | 5,134529762,A,G,rs329124,G,0.933352162688514
 8 | 5,158599736,C,T,rs890940,T,0.76040754992184
 9 | 5,56503357,C,A,rs157843,A,1.88634886045515
10 | 6,126470949,A,G,rs11759026,G,1.10123173241437
11 | 6,127093693,A,G,rs719727,A,1.13808379282431
12 | 6,160354985,C,A,rs543159,C,1.59514925179458
13 | 6,20736006,C,A,rs9358361,A,0.964379582992306
14 | 6,43843237,G,A,rs9369425,G,1.43338766867525
15 | 6,7084624,G,T,rs4959424,G,0.883157584356791
16 | 7,130768623,A,G,rs1596972,G,1.03325167441624
17 | 8,26031751,C,T,rs7834679,C,1.21208127201891
18 | 8,94919116,C,T,rs75912292,T,0.734691383702554
19 | 9,1039939,G,A,rs756145,A,0.851590615916217
20 | 11,1675619,C,T,rs2334499,T,0.80137380662311
21 | 11,28513351,G,A,rs4923543,A,0.924268516061603
22 | 11,72721940,G,A,rs11603334,G,1.77517413008634
23 | 12,4265207,A,G,rs11063069,G,0.776917146095885
24 | 13,80143021,G,A,rs1359790,G,0.806173341281547
25 | 15,62099409,C,T,rs7163757,C,0.766860570318094
26 | 17,37703678,G,A,rs2189301,G,0.890959695586082
27 | 17,578364,C,T,rs11870735,T,0.803746718455257
28 | 19,33401503,C,T,rs889138,C,1.78066725909926
29 | 19,33444196,C,T,rs2287821,C,0.845286039772586
30 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/ALP Neg.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 2,27508073,T,C,rs1260326,C,0.810841422500325
3 | 6,139515991,T,C,rs11155073,T,1.27808171314604
4 | 6,43846888,G,A,rs10456526,A,0.783560908019758
5 | 9,133274084,T,C,rs529565,C,7.5980383148254
6 | 11,61798436,T,C,rs174541,T,1.97989469578866
7 | 19,19268740,C,T,rs58542926,T,1.23063141179833
8 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Beta Cell 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,117626841,G,A,rs2282456,A,0.790593075157008
 3 | 1,147257439,A,G,rs3766524,G,0.785182467856915
 4 | 1,201880798,A,G,rs41304257,A,0.786528889062839
 5 | 1,21767674,C,T,rs12122128,C,1.23150726989541
 6 | 1,46072648,T,C,rs785513,C,0.957021530441465
 7 | 1,6612669,A,C,rs11583755,C,1.37031857925042
 8 | 2,120579620,A,G,rs9308614,A,0.794281728405421
 9 | 2,148675516,A,G,rs7558502,A,0.788501335050814
10 | 2,15958271,A,G,rs6750986,G,1.06347249633723
11 | 2,212964997,T,C,rs4673712,T,1.1692230131902
12 | 2,233395552,A,G,rs838722,G,1.25460247019725
13 | 2,67651196,T,C,rs1430780,T,0.943112677071227
14 | 3,186947857,C,T,rs3887925,T,1.75581631491611
15 | 3,23415589,T,C,rs13094957,T,1.21016941892711
16 | 3,28690319,G,A,rs9869477,G,0.825613746430223
17 | 3,35628658,G,A,rs1470560,A,1.07868296862592
18 | 3,9472332,G,A,rs3872707,A,1.29719164698694
19 | 4,105127134,T,C,rs17035289,C,1.60712536039401
20 | 4,155776632,T,C,rs2125799,C,1.04059530188695
21 | 4,726892,G,A,rs73221116,A,1.17145106687441
22 | 5,151945039,T,G,rs302395,G,0.862187129765993
23 | 5,158602726,G,A,rs1650505,A,0.801378806152145
24 | 6,138887596,C,A,rs1188832,C,0.895490707331789
25 | 6,20686765,C,A,rs9368222,A,2.49835691340481
26 | 6,41044378,A,C,rs9367093,C,0.780216120296282
27 | 6,43790159,C,A,rs998584,A,1.34310954078254
28 | 7,102793231,A,G,rs112613078,A,0.823742646266636
29 | 7,30688836,C,T,rs917195,C,1.29952150472681
30 | 7,36703281,C,T,rs6978327,T,0.806240575751213
31 | 7,74662156,A,G,rs13238568,G,1.32515294362145
32 | 7,90174320,T,C,rs6956980,C,0.781549811483159
33 | 8,117172544,C,T,rs13266634,C,1.53444442512662
34 | 8,134763303,C,T,rs4294149,T,1.25838719583287
35 | 8,14291481,A,C,rs35753840,C,0.819747448050646
36 | 8,38485494,T,C,rs328301,T,1.07567462759089
37 | 9,114181077,A,G,rs1431819,G,0.854451885941679
38 | 9,123331143,C,T,rs10739629,C,1.38927139145449
39 | 9,136353630,A,G,rs28642213,G,0.807196935889104
40 | 9,136360621,G,A,rs11145958,A,1.15528474059506
41 | 9,19074540,A,G,rs12380322,G,1.36006311916359
42 | 9,22134095,T,C,rs10811661,T,1.36383382139055
43 | 10,114054165,C,T,rs10787516,C,0.822099927248126
44 | 10,122433665,T,G,rs2280141,T,0.83747716545817
45 | 10,12265895,C,T,rs11257655,T,0.962620651889189
46 | 10,13498869,A,C,rs11258422,C,0.964423963268342
47 | 10,92703125,C,T,rs1111875,C,2.35424533937019
48 | 10,97331612,G,A,rs945187,G,1.01454202820145
49 | 11,1675619,C,T,rs2334499,T,1.22212251401522
50 | 11,17396930,C,A,rs757110,C,0.812644009592181
51 | 11,30586586,G,A,rs11031140,A,1.14106624642707
52 | 11,43856909,G,A,rs35251247,A,1.02641196863762
53 | 11,61798436,T,C,rs174541,T,1.52395304158992
54 | 11,69071719,T,C,rs3750957,C,1.410036755746
55 | 11,8655516,C,A,rs7941510,C,0.834137037387623
56 | 12,108236003,G,A,rs1426371,G,1.60222736004863
57 | 12,121017786,G,A,rs56158042,G,1.35511608016868
58 | 12,121544257,A,G,rs4981013,G,0.947263136685008
59 | 12,21690642,C,T,rs11046164,C,1.64158566799243
60 | 12,57574955,G,A,rs11172254,G,0.826223443528635
61 | 12,6582286,G,A,rs7316626,A,0.787098928431362
62 | 12,80923335,A,G,rs11114655,G,0.9771018307699
63 | 14,22819980,T,C,rs17122782,C,0.786334241817425
64 | 14,76834520,C,T,rs2056857,C,0.924686226918637
65 | 15,62099409,C,T,rs7163757,C,2.397207778958
66 | 15,83878470,C,T,rs1812707,T,1.03292104876316
67 | 15,89838046,C,T,rs893617,C,1.07569550074564
68 | 15,90968837,G,A,rs2290203,A,1.02709147510979
69 | 16,250389,T,C,rs55857387,T,1.10768821641707
70 | 16,78955,C,T,rs1013358,C,0.880647004389011
71 | 16,89564222,C,T,rs12932337,T,1.15003648357614
72 | 17,36506381,T,C,rs1109442,C,0.847605408953373
73 | 17,37739849,C,T,rs11651755,C,0.832093852088434
74 | 17,48101312,G,A,rs3744347,A,0.858605143270236
75 | 17,67649976,G,A,rs11658220,A,1.2145404391293
76 | 17,9884528,A,G,rs17810376,G,1.4907865287698
77 | 18,34002926,T,C,rs17747955,C,1.1213800974715
78 | 19,1646713,C,T,rs4807125,T,0.907214707837185
79 | 19,45655159,G,A,rs8107527,A,0.793392743864734
80 | 20,2119449,G,A,rs6137042,G,1.10261097898468
81 | 20,33848172,C,T,rs7274168,T,0.821809381249592
82 | 20,62649310,C,T,rs2427363,C,1.08007592675979
83 | 22,35309366,A,G,rs138771,A,1.37711769959318
84 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Beta Cell 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,117626841,G,A,rs2282456,A,0.93032919071777
 3 | 1,213985913,T,C,rs340874,C,1.6762660744519
 4 | 1,229537208,G,A,rs348330,G,0.851039929325742
 5 | 3,123346931,A,G,rs11708067,A,2.46744557309717
 6 | 3,125202613,C,T,rs9873519,T,0.917203516585986
 7 | 3,141382997,A,C,rs56243018,A,0.937403954956027
 8 | 3,160434511,C,A,rs13403,C,0.877194450937002
 9 | 3,171007094,T,C,rs8192675,T,2.55532848821731
10 | 3,185816694,G,A,rs9859406,A,1.02159387010531
11 | 3,188024054,A,G,rs6777684,G,1.18297263294537
12 | 5,77139179,G,A,rs7732130,G,1.11690554030416
13 | 6,20686765,C,A,rs9368222,A,1.45549628524273
14 | 6,7231610,G,A,rs9379084,G,0.975638874942903
15 | 6,7275693,A,G,rs11243149,A,0.934005086925184
16 | 7,14858657,C,T,rs17168486,T,1.50947366516019
17 | 7,15024630,G,T,rs2191348,T,2.30492784565317
18 | 7,28154778,C,A,rs860262,C,1.01713840341402
19 | 7,44184122,G,A,rs730497,A,4.56659061893173
20 | 7,44322946,C,T,rs11772021,C,0.904170136644996
21 | 7,90174320,T,C,rs6956980,C,0.849658415498303
22 | 8,117172544,C,T,rs13266634,C,2.56574126174382
23 | 8,94672919,A,C,rs11786992,A,0.960998820377985
24 | 8,94955144,A,G,rs10808671,A,0.785767900374254
25 | 9,136353630,A,G,rs28642213,G,0.980326704726829
26 | 9,22134095,T,C,rs10811661,T,1.44895718662821
27 | 9,22137686,T,G,rs7018475,G,0.931188856932379
28 | 9,4290541,C,A,rs10116772,A,1.33235499392144
29 | 10,111279909,C,T,rs11195502,C,1.39063168449064
30 | 10,112989975,G,A,rs35011184,A,2.66073173372723
31 | 10,12265895,C,T,rs11257655,T,1.28382554339938
32 | 11,2836003,G,A,rs60808706,G,1.08302359948751
33 | 11,45837033,G,A,rs12419690,G,0.815094115612855
34 | 11,61798436,T,C,rs174541,T,0.879486392485044
35 | 11,72721940,G,A,rs11603334,G,1.51934651195302
36 | 12,132493708,C,T,rs11614914,T,0.92752312567208
37 | 12,48342520,C,A,rs2732480,C,0.859374774395466
38 | 13,32988367,T,C,rs7997912,C,1.17886278318088
39 | 15,77489993,A,G,rs12910361,G,1.033996056745
40 | 16,250389,T,C,rs55857387,T,0.838644792384136
41 | 18,55383415,A,C,rs72926932,C,0.889741746333478
42 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Bilirubin.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 2,233395552,A,G,rs838722,G,1.27131145158399
3 | 2,233759924,C,T,rs887829,T,14.3539837522901
4 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Cholesterol.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 5,75581978,T,C,rs5744672,T,1.08710434354778
3 | 15,58384622,G,A,rs11858759,G,0.946624771314544
4 | 19,19268740,C,T,rs58542926,T,1.96176642040467
5 | 19,44854120,T,C,rs4803764,T,2.86428721519638
6 | 19,44908684,T,C,rs429358,T,7.71724647555388
7 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Hyper Insulin.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,205075211,A,G,rs11240351,G,1.38958402750427
 3 | 2,211410212,A,G,rs3828242,A,0.926494175774555
 4 | 2,25412902,A,G,rs34845373,A,1.29579628911929
 5 | 2,60357684,G,A,rs243021,A,1.11076642972781
 6 | 3,36828739,A,G,rs11129735,A,1.12517756322701
 7 | 3,49958085,T,C,rs6792892,C,1.76664185358465
 8 | 3,88080986,C,T,rs73146095,C,1.32242042514351
 9 | 4,44501486,G,A,rs34617913,A,0.948875110203746
10 | 4,51932458,T,C,rs1996617,C,1.17388003767607
11 | 5,134525973,C,T,rs329118,T,0.913841075351517
12 | 5,68418419,A,G,rs4976033,G,1.07548669590129
13 | 5,88401716,C,T,rs6870983,C,1.09002759214344
14 | 6,39048860,C,T,rs10305420,C,1.0232830052884
15 | 7,131889849,A,C,rs12667919,A,1.10651010188789
16 | 7,150840547,G,A,rs62492368,A,0.829251653337375
17 | 7,50510270,T,C,rs73121277,C,0.907315236654959
18 | 7,55734370,T,C,rs6972291,C,0.840283826446018
19 | 8,125618324,G,T,rs72724622,T,1.1973906210298
20 | 8,144301594,C,T,rs13268508,T,1.14805880248489
21 | 8,56583505,G,A,rs3887059,A,0.856144433795765
22 | 9,95033139,G,A,rs6479591,A,0.943154663623236
23 | 10,68622422,A,G,rs10998338,A,1.29665590452712
24 | 10,86357561,C,A,rs11201992,C,0.823809510571429
25 | 11,20930691,G,A,rs16907058,A,1.15171901943159
26 | 11,65558683,G,A,rs12789028,A,0.979598620500255
27 | 12,123943784,T,C,rs4930726,T,1.08154227375956
28 | 12,132060098,C,T,rs11830241,T,1.30895050959448
29 | 12,20438398,G,A,rs7134150,A,1.9221010387598
30 | 12,50489375,T,C,rs4519166,C,0.965545904776462
31 | 14,102909694,C,T,rs4906272,T,1.44933193891777
32 | 14,29257736,C,A,rs2333486,C,0.963458317119539
33 | 15,49501823,T,C,rs7169799,C,0.985610373873054
34 | 15,52790274,G,T,rs2440317,T,0.874242975234916
35 | 15,58384622,G,A,rs11858759,G,0.97286839395891
36 | 15,98733292,G,T,rs59646751,T,1.16109993287073
37 | 16,56437498,T,C,rs7189122,C,0.794036018387664
38 | 16,917241,A,G,rs12918782,G,0.880733714884338
39 | 17,31301531,C,A,rs2040792,C,1.32680097001544
40 | 17,77377118,T,C,rs312827,C,0.927961953624098
41 | 19,13065206,G,T,rs76567647,G,0.824970466633634
42 | 19,7903283,G,A,rs2115107,A,1.26973952886874
43 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Lipodystrophy 1.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,11257875,T,C,rs7554251,C,1.12657196230347
 3 | 1,172399170,A,G,rs7546252,A,1.6480616896423
 4 | 1,205075211,A,G,rs11240351,G,0.829713414422773
 5 | 1,219577375,G,T,rs4846567,G,2.24461761071688
 6 | 1,50743476,T,C,rs79090772,T,0.87587766177136706
 7 | 2,111496274,T,C,rs1345203,T,1.40731557289665
 8 | 2,164651879,C,T,rs10184004,C,2.85459330697619
 9 | 2,218303709,G,A,rs1877712,G,0.812724620542421
10 | 2,226241205,C,T,rs2943650,T,2.34046222220295
11 | 2,65434334,A,C,rs12185610,A,0.800130752673367
12 | 3,12288284,C,T,rs17036160,C,1.32664508848256
13 | 3,129301935,A,G,rs6765930,A,0.793869800817226
14 | 3,185816694,G,A,rs9859406,A,1.20220197510624
15 | 3,64717718,G,T,rs66815886,G,1.98375391538072
16 | 4,144692655,G,A,rs6812830,G,1.2017416879526
17 | 4,156731601,C,A,rs28819812,C,1.42558363265606
18 | 4,88791970,C,T,rs9991328,T,1.95389709474007
19 | 5,158602726,G,A,rs1650505,A,0.848002161817244
20 | 5,56564954,A,G,rs3936511,G,1.9015115805769
21 | 5,68418419,A,G,rs4976033,G,0.930751438979016
22 | 6,139515991,T,C,rs11155073,T,2.85825785444522
23 | 6,160349886,T,G,rs501470,T,0.975867970776777
24 | 6,163711969,C,T,rs4709746,C,1.1573485541831
25 | 6,34276355,T,C,rs115245297,C,2.07064637140282
26 | 6,43790159,C,A,rs998584,A,3.58426411434303
27 | 7,130748625,C,T,rs4731702,C,0.817693983117954
28 | 7,25931533,G,A,rs6951827,G,0.835501837852443
29 | 7,28154778,C,A,rs860262,C,0.905773237340398
30 | 8,125618324,G,T,rs72724622,T,0.804415283565313
31 | 8,19973410,C,T,rs10096633,C,1.24960854946381
32 | 8,71507263,T,G,rs10096191,G,1.07475948476029
33 | 11,64333304,A,G,rs1662185,A,1.41127179514205
34 | 12,123943784,T,C,rs4930726,T,3.14739730540445
35 | 12,20317265,A,G,rs11045171,A,1.08591356581262
36 | 12,26310241,G,A,rs11048457,G,2.19887439257176
37 | 12,65965972,C,A,rs8756,A,0.950683509338436
38 | 12,71126981,T,G,rs10879261,G,0.859815702817528
39 | 15,58384622,G,A,rs11858759,G,0.884350253265618
40 | 15,63579093,C,T,rs7178762,C,0.928750119188819
41 | 16,81500184,C,A,rs56823429,C,2.0551961455962
42 | 17,17804815,T,C,rs11654081,T,1.30611229962082
43 | 18,63178651,T,C,rs12454712,T,0.862867594267646
44 | 19,33405526,A,C,rs4805881,A,1.00166925096288
45 | 19,7293108,T,C,rs8101064,T,0.898086333992418
46 | 20,52417142,G,A,rs4809906,G,0.935518590529031
47 | 20,64060707,A,C,rs6090040,A,0.818319211773247
48 | 22,38204535,C,T,rs2267373,T,1.17890503451983
49 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Lipodystrophy 2.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 2,120691632,C,T,rs62167015,C,0.797398616538251
 3 | 2,164651879,C,T,rs10184004,C,1.29449369783007
 4 | 2,226241205,C,T,rs2943650,T,2.13241645864393
 5 | 2,65052280,A,G,rs2723065,A,1.08764223224661
 6 | 3,12288284,C,T,rs17036160,C,2.22637661452041
 7 | 3,136350630,G,T,rs667920,T,1.2460137382691
 8 | 4,156731601,C,A,rs28819812,C,0.829824733359207
 9 | 5,158602726,G,A,rs1650505,A,0.815428674289436
10 | 5,53976834,G,A,rs4865796,A,1.37888331795972
11 | 5,56511543,C,T,rs464605,T,1.94940914048045
12 | 5,56564954,A,G,rs3936511,G,0.960913815167076
13 | 5,68418419,A,G,rs4976033,G,1.11410145533654
14 | 6,163711969,C,T,rs4709746,C,0.946221868858236
15 | 7,130748625,C,T,rs4731702,C,1.04704307479767
16 | 8,10117074,A,G,rs60384372,A,0.839422035421327
17 | 8,19973410,C,T,rs10096633,C,1.12398176692462
18 | 8,37000965,A,G,rs13365225,G,0.860056466802468
19 | 9,1036552,C,T,rs7856320,T,0.863105213838364
20 | 9,134025582,G,A,rs379417,A,0.820532423562114
21 | 9,95516131,C,T,rs113154802,C,1.031773196178
22 | 10,100152437,C,T,rs1408579,C,1.83862303926759
23 | 12,65965972,C,A,rs8756,A,0.871596104642325
24 | 15,60646617,C,A,rs8033609,A,0.860433493797303
25 | 15,73681321,T,C,rs57909886,C,1.08695886258769
26 | 18,63178651,T,C,rs12454712,T,1.45357229233753
27 | 19,19268740,C,T,rs58542926,T,1.72865989211715
28 | 19,33405526,A,C,rs4805881,A,1.96702300087785
29 | 20,33710480,C,T,rs13042148,T,1.20194405879043
30 | 22,43928975,G,A,rs3747207,A,3.15066074218633
31 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Liver-Lipid.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 2,27508073,T,C,rs1260326,C,7.65474930376802
3 | 6,126470949,A,G,rs11759026,G,0.879411210989312
4 | 11,61798436,T,C,rs174541,T,1.5345778005327
5 | 12,121017786,G,A,rs56158042,G,1.29916653195848
6 | 15,43557688,A,G,rs2470134,G,1.31302163060977
7 | 15,58384622,G,A,rs11858759,G,1.23315729718332
8 | 15,62099409,C,T,rs7163757,C,0.850792322763108
9 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Obesity.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,111747361,C,T,rs197374,T,0.944977870373138
 3 | 1,177919890,A,C,rs539515,C,1.77618274987658
 4 | 1,72285869,A,G,rs2613499,A,1.07509681204075
 5 | 2,160477361,A,C,rs6710938,A,1.10942198764396
 6 | 2,174332817,C,A,rs12992995,C,0.888638338965529
 7 | 2,180753927,A,G,rs6741676,A,1.12851090982705
 8 | 2,204511186,C,A,rs4482463,C,0.907626096223435
 9 | 2,228107168,T,C,rs13415288,C,0.868571244585216
10 | 2,58748008,T,C,rs12986742,C,1.11690750554327
11 | 2,59084401,G,A,rs980183,G,1.09369174741551
12 | 2,60057965,T,C,rs980329,C,0.869351547833271
13 | 2,653874,C,T,rs10188334,C,1.81854752841946
14 | 3,132032000,G,A,rs9857204,A,1.10991821922017
15 | 3,15664617,G,A,rs924753,G,1.05467560029361
16 | 3,173389653,T,C,rs247975,C,1.03582004807104
17 | 3,173992905,C,T,rs59489841,C,0.80456892598911
18 | 3,35628658,G,A,rs1470560,A,1.10218002850575
19 | 3,49958085,T,C,rs6792892,C,1.53620660614253
20 | 3,94262216,G,T,rs978444,G,0.998587558821075
21 | 3,9472332,G,A,rs3872707,A,0.785057832791062
22 | 4,136162038,A,C,rs1296328,A,1.03256581572728
23 | 4,45184122,G,A,rs10938398,A,1.68364771299316
24 | 4,90328687,T,C,rs17227797,C,0.887036500437549
25 | 5,154167849,G,A,rs7701886,G,0.869335843477036
26 | 5,79250470,G,A,rs2591392,G,0.839894318205273
27 | 5,88401716,C,T,rs6870983,C,0.908904807832778
28 | 5,88647527,C,T,rs13162708,T,0.851267963168398
29 | 5,96514546,A,C,rs261967,C,0.876782871919807
30 | 6,154033965,A,G,rs6936615,G,0.881816019309221
31 | 6,40441504,T,C,rs34298980,T,1.02735434805859
32 | 6,50821065,A,C,rs3798519,C,1.37909130885911
33 | 7,147961447,G,A,rs1922879,G,0.807265898202514
34 | 7,36703281,C,T,rs6978327,T,0.807848279510678
35 | 7,70184697,C,A,rs6975279,A,0.80829482242000406
36 | 7,78199422,G,A,rs3779274,G,0.803907382534401
37 | 8,115553138,T,C,rs3802219,T,0.789739152097947
38 | 8,30995310,T,C,rs2725370,T,1.14259309273326
39 | 8,34645053,A,C,rs4463416,A,0.829601893953105
40 | 9,130911265,C,T,rs6597649,T,0.922589397655005
41 | 9,28410685,T,C,rs1412234,C,1.24339704923915
42 | 10,33708299,A,C,rs71495046,C,0.840352973389949
43 | 10,75799410,T,C,rs11001500,C,0.898393574857749
44 | 11,27664649,G,T,rs10767659,G,1.78723583091228
45 | 11,43856909,G,A,rs35251247,A,1.26178987772888
46 | 11,47589600,C,T,rs11039307,T,1.13128399924468
47 | 11,8655516,C,A,rs7941510,C,0.915074992836882
48 | 12,105894667,G,A,rs12825669,G,0.797567267463839
49 | 12,41469591,C,T,rs2730827,T,0.812427188756336
50 | 12,49869365,G,A,rs7132908,A,1.66007378335109
51 | 12,60857620,T,C,rs12372209,T,0.862095802597203
52 | 13,30443131,G,A,rs12856169,G,1.02779513580648
53 | 13,53533448,G,T,rs9568868,T,0.852299702586262
54 | 13,58112626,T,C,rs7988244,T,1.0455155197036
55 | 14,102909694,C,T,rs4906272,T,0.930476424007877
56 | 14,103493689,A,G,rs56365443,A,0.812300807455789
57 | 14,32834334,C,T,rs12883788,T,0.929601085192436
58 | 15,83878470,C,T,rs1812707,T,0.963076983625625
59 | 16,15059860,T,C,rs9927842,T,0.80581894840142
60 | 16,20359659,A,C,rs9929710,A,0.803482209794921
61 | 16,287691,C,T,rs34665498,C,0.801311307225093
62 | 16,28886131,G,A,rs8056890,A,1.73326630512239
63 | 16,29946895,G,A,rs8054556,A,1.27926519064507
64 | 16,53767042,T,C,rs1421085,C,3.93479302022367
65 | 16,69632780,G,A,rs244415,G,1.16597243080594
66 | 17,36506381,T,C,rs1109442,C,0.860462806182295
67 | 17,67829132,T,C,rs12603589,C,1.02962326612309
68 | 17,80783826,A,G,rs11150745,A,0.930964254926842
69 | 18,23503774,C,T,rs303760,T,1.08178030111587
70 | 18,60161902,T,C,rs6567160,C,2.22011682246798
71 | 18,65759743,G,A,rs2032217,A,0.809834682625973
72 | 19,18723704,C,T,rs10404726,C,1.17123647879523
73 | 19,44908684,T,C,rs429358,T,0.859499197841359
74 | 19,47093845,T,C,rs10408163,C,0.835333895244067
75 | 20,52417142,G,A,rs4809906,G,0.88512812164516
76 | 22,41197577,C,T,rs11913442,T,0.845173784382441
77 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/Proinsulin.csv:
--------------------------------------------------------------------------------
 1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
 2 | 1,117626841,G,A,rs2282456,A,0.781777368119471
 3 | 4,152599323,C,T,rs6813195,C,0.972242142406481
 4 | 5,134525973,C,T,rs329118,T,1.12383552565093
 5 | 5,56511543,C,T,rs464605,T,0.860181484456669
 6 | 6,126470949,A,G,rs11759026,G,0.785301222900212
 7 | 6,127091583,G,A,rs12192275,G,1.01236936342738
 8 | 6,160349886,T,G,rs501470,T,2.0120436508473
 9 | 6,43846888,G,A,rs10456526,A,2.16139981815442
10 | 8,25607154,G,T,rs73221948,T,1.81084320190896
11 | 8,41651058,C,A,rs13262861,C,0.926474260567206
12 | 8,94672919,A,C,rs11786992,A,0.85945345793197
13 | 9,95033139,G,A,rs6479591,A,0.816311385231373
14 | 11,32459631,C,T,rs7927401,T,0.914822126342247
15 | 11,72721940,G,A,rs11603334,G,2.57463496862983
16 | 13,80141758,G,A,rs1215451,G,0.871278308670939
17 | 19,33405526,A,C,rs4805881,A,0.825142311008953
18 | 


--------------------------------------------------------------------------------
/Smith_Deutsch_NatureMedicine_2024/MultiAncestry/hg38_score_info/SHBG-LpA.csv:
--------------------------------------------------------------------------------
1 | Chr,Pos_hg38,Ref,Alt,RSID,Risk_Allele,Weight
2 | 3,136350630,G,T,rs667920,T,1.36652016101725
3 | 6,160349886,T,G,rs501470,T,2.13268926815861
4 | 17,7628647,T,C,rs858519,T,6.61457407629734
5 | 


--------------------------------------------------------------------------------
/doc/Variant clustering preprocessing pipeline_plan_KW.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gwas-partitioning/bnmf-clustering/8f0590e82dff74655450cc87bf3f62c00e074545/doc/Variant clustering preprocessing pipeline_plan_KW.docx


--------------------------------------------------------------------------------
/example_data/clustering_data_sources_example.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gwas-partitioning/bnmf-clustering/8f0590e82dff74655450cc87bf3f62c00e074545/example_data/clustering_data_sources_example.xlsx


--------------------------------------------------------------------------------
/scripts/.Rhistory:
--------------------------------------------------------------------------------
# Interactive R session history: loads packages, reads the GWAS source
# spreadsheet, and collects the variants that pass the p-value cutoff in each
# main-trait GWAS. Several commands appear twice because they were re-entered
# during the session.

# Record the session start time
start=Sys.time()
# load required packages
install.packages("pacman")
pacman::p_load(tidyverse, data.table, readxl, magrittr, dplyr, strex,
rstudioapi, DT, kableExtra, GenomicRanges)
if (!require("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("GenomicRanges")
# SECTION 1: PULL IN GWAS INFORMATION
data_dir = "../example_data/"
rsID_map_file <- file.path(data_dir, "rsID_map_example.txt")  # From dbSNP v1.38 -- maps positional IDs to rsIDs
# GWAS for main trait
gwas <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
sheet="main_gwas") %>%
data.frame()
# NOTE(review): the working directory is changed here and the same commands
# are then repeated -- presumably the relative data_dir did not resolve on the
# first attempt; confirm. The path is specific to one user's machine.
setwd("~/Partners HealthCare Dropbox/Kirk Smith/MGH/bnmf-clustering/scripts")
data_dir = "../example_data/"
rsID_map_file <- file.path(data_dir, "rsID_map_example.txt")  # From dbSNP v1.38 -- maps positional IDs to rsIDs
# GWAS for main trait
gwas <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
sheet="main_gwas") %>%
data.frame()
# GWAS for clustering traits
gwas_traits <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
sheet="trait_gwas")
# GWAS to be used for final allele alignment
# (the path of the study whose `largest` column is "Yes")
main_ss_filepath <- gwas %>% filter(largest=="Yes") %>% pull(full_path)
# Named vectors: summary-statistic paths keyed by study / trait, and
# per-trait sample sizes keyed by trait
gwas_ss_files <- setNames(gwas$full_path, gwas$study)
trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait)
trait_ss_size <- setNames(gwas_traits$sample_size, gwas_traits$trait)
# Inspect both sheets in the data viewer
View(gwas)
View(gwas_traits)
# GWAS to be used for final allele alignment
# (same four assignments as above, entered again)
main_ss_filepath <- gwas %>% filter(largest=="Yes") %>% pull(full_path)
gwas_ss_files <- setNames(gwas$full_path, gwas$study)
trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait)
trait_ss_size <- setNames(gwas_traits$sample_size, gwas_traits$trait)
# SECTION 2: PULL SIGNIFICANT VARIANTS FROM MAIN TRAIT GWAS
# P-value threshold for variants in main trait
PVCUTOFF = 5e-8
n_gwas <- length(gwas_ss_files)
# Empty accumulator with the four columns kept from each GWAS
vars_sig = data.frame(VAR_ID = as.character(),
P_VALUE = as.numeric(),
Risk_Allele=as.character(),
GWAS=as.character())
print(sprintf("Pulling significant SNPs w/ pval<%.1e from %i T2D GWAS...", PVCUTOFF, n_gwas))
# One pass per main-trait GWAS file; the surviving rows are appended to vars_sig
for(i in 1:n_gwas) {
print(paste0("...Reading ", names(gwas_ss_files)[i], "..."))
vars <- fread(gwas_ss_files[i], data.table = F, stringsAsFactors=F)
# When the file has no BETA column, derive it as log(ODDS_RATIO)
if (!"BETA" %in% colnames(vars)){
print("Converting Odds Ratio to Log Odds Ratio...")
vars <- vars %>%
mutate(BETA = log(as.numeric(ODDS_RATIO)))
}
# Keep rows with P_VALUE <= PVCUTOFF whose VAR_ID starts with
# CHR_POS_REF_<base> (the pattern is not anchored at the end), split VAR_ID
# into its four parts, and take ALT as the risk allele when BETA >= 0,
# otherwise REF
vars <- vars %>%
filter(as.numeric(P_VALUE) <= PVCUTOFF) %>%
subset(grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]", VAR_ID)) %>%
separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_", remove = F) %>%
mutate(Risk_Allele = ifelse(BETA>=0, ALT, REF)) %>%
mutate(GWAS = gwas$study[i]) %>%
select(VAR_ID, P_VALUE, Risk_Allele, GWAS)
print(nrow(vars))
vars_sig = rbind(vars_sig, vars)
}
print(paste("No. total SNPs below pval cutoff:",nrow(vars_sig)))
# remove duplicates
# Rows are sorted by VAR_ID and then P_VALUE, so the first row kept for each
# VAR_ID is the one with the smallest p-value; P_VALUE is then renamed PVALUE
vars_sig_uniq <- vars_sig %>%
arrange(VAR_ID, P_VALUE) %>%
filter(!duplicated(VAR_ID)) %>% # so we remove duplicates with the higher pvalue
rename(PVALUE = P_VALUE)
71 | 


--------------------------------------------------------------------------------
/scripts/archive/choose_variants_2021.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(data.table)
  3 | 
  4 | 
  5 | # CURRENT ASSUMPTIONS ABOUT FORMATTING:
  6 | # - Genome build is hg19/GRCh37
  7 | # - Summary statistic datasets are whitespace-delimited with columns: VAR_ID, BETA, SE, N_PH
  8 | # - Variant IDs are all of the format: CHR_POS_REF_ALT 
  9 | 
 10 | 
 11 | ld_pruning <- function(gwas_variants, rsID_map_file, r2=0.1) {
 12 |   
 13 |   # Given a data frame of original GWAS variants (VAR_ID) and p-values (PVALUE), 
 14 |   # prune to a set of independent variants based on some LD threshold
 15 |   # Leverage the LDlinkR package to fetch LD relationships for a set of input SNPs
 16 |   
 17 |   write(gwas_variants$VAR_ID, "all_gwas_varid.tmp")
 18 |   all_var_df <- fread(cmd=paste0("grep -wFf all_gwas_varid.tmp ",
 19 |                                  rsID_map_file),
 20 |                       header=F, col.names=c("VAR_ID", "rsID"),
 21 |                       data.table=F, stringsAsFactors=F) %>%
 22 |     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), 
 23 |              sep="_", remove=F) %>%
 24 |     inner_join(gwas_variants, by="VAR_ID") %>%
 25 |     arrange(PVALUE)  # This ordering is important for the pruning steps below!
 26 |   system("rm all_gwas_varid.tmp")
 27 |   
 28 |   pruned_vars <- c()
 29 |   
 30 |   for (i in 1:22) {
 31 |     print(paste0("Pruning chromosome ", i, "..."))
 32 |     var_df <- filter(all_var_df, CHR == i)
 33 |     
 34 |     if (nrow(var_df) == 0) {
 35 |       next
 36 |     } else if (nrow(var_df) == 1) {
 37 |       pruned_vars <- c(pruned_vars, var_df$rsID)
 38 |       next
 39 |     }
 40 |     
 41 |     ld_mat <- LDlinkR::LDmatrix(snps=var_df$rsID,
 42 |                                 pop="CEU",
 43 |                                 r2d="r2",
 44 |                                 token="20ff5a8454d7")  ## This should be replaced by each user's own token (retrieve at: https://ldlink.nci.nih.gov/?tab=apiaccess)
 45 |     rownames(ld_mat) <- ld_mat$RS_number
 46 |     ld_mat$RS_number <- NULL
 47 |     ld_mat <- as.matrix(ld_mat)
 48 |     ld_mat <- ld_mat[rowSums(is.na(ld_mat)) != ncol(ld_mat),
 49 |                      colSums(is.na(ld_mat)) != nrow(ld_mat)]
 50 | 
 51 |     remaining_snps <- var_df$rsID
 52 |     
 53 |     while(length(remaining_snps) > 0) {
 54 |       pruned_vars <- c(pruned_vars, remaining_snps[1])
 55 |       if (remaining_snps[1] %in% rownames(ld_mat)) {
 56 |         remaining_snps <- setdiff(
 57 |           remaining_snps,
 58 |           rownames(ld_mat)[ld_mat[, remaining_snps[1]] >= r2]
 59 |         )
 60 |       } else {
 61 |         remaining_snps <- setdiff(remaining_snps, remaining_snps[1])
 62 |       }
 63 |     }
 64 |   }
 65 |   
 66 |   print(paste0(length(pruned_vars), " variants remain after pruning."))
 67 |   filter(all_var_df, rsID %in% pruned_vars)
 68 | }
 69 | 
 70 | 
 71 | count_traits_per_variant <- function(gwas_variants, ss_files) {
 72 | 
 73 |   # Given a vector of variants and a named vector of summary statistics files 
 74 |   # for traits to be clustered, output a vector of non-missing trait fractions
 75 |   # per variant
 76 |   
 77 |   print("Assessing variant missingness across traits...")
 78 |   
 79 |   variant_df_list <- lapply(1:length(ss_files), function(i) {
 80 |     print(paste0("...Reading ", names(ss_files)[i], "..."))
 81 |     fread(ss_files[i], data.table=F, stringsAsFactors=F) %>%
 82 |       filter(VAR_ID %in% gwas_variants)
 83 |   })
 84 |   variant_counts_df <- bind_rows(variant_df_list) %>%
 85 |     group_by(VAR_ID) %>%
 86 |     summarise(frac=n() / length(ss_files))
 87 |   variant_counts <- ifelse(
 88 |     gwas_variants %in% variant_counts_df$VAR_ID,
 89 |     variant_counts_df$frac[match(gwas_variants, variant_counts_df$VAR_ID)],  # If in counts data frame, take the non-missing fraction
 90 |     0  # If not in data frame, then the non-missing fraction is 0
 91 |   )
 92 |   setNames(variant_counts, gwas_variants)
 93 | }
 94 | 
 95 | 
 96 | find_variants_needing_proxies <- function(gwas_variant_df, var_nonmissingness,
 97 |                                           rsID_map_file) {
 98 |   
 99 |   # Given a data frame containing GWAS variants and alleles as well as a vector
100 |   # of trait missingness fractions per variant (from count_traits_per_variant),
101 |   # output a vector of variants that need proxies
102 |   # Criteria (any of the following):
103 |   #   Strand-ambiguous (AT or GC)
104 |   #   Multi-allelic
105 |   #   Low-count (available in < 80% of traits)
106 |   # rsID_map_file should point to a whitespace-delimited file with columns
107 |   # corresponding to VAR_ID and rsID
108 |   
109 |   print("Choosing variants in need of proxies...")
110 |   
111 |   gwas_variant_df <- gwas_variant_df %>%
112 |       separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
113 |                sep="_", remove=F)
114 |   
115 |   need_proxies_varid <- with(gwas_variant_df, {
116 |     strand_ambig <- VAR_ID[paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")]
117 |     print(paste0("...", length(strand_ambig), " strand-ambiguous variants"))
118 |     
119 |     multi_allelic <- grep("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+,[ACGT]+$", VAR_ID, value=T)  # i.e. ALT allele has a comma
120 |     print(paste0("...", length(multi_allelic), " multi-allelic variants"))
121 |     
122 |     low_cnt <- VAR_ID[!(VAR_ID %in% names(var_nonmissingness)) |
123 |                         var_nonmissingness[VAR_ID] < 0.8]
124 |     print(paste0("...", length(low_cnt), " variants with excessive missingness"))
125 |     
126 |     unique(c(strand_ambig, multi_allelic, low_cnt))
127 |   })
128 |   print(paste0("...", length(need_proxies_varid), " unique variants in total"))
129 |   
130 |   if (length(need_proxies_varid) == 0) return(tibble(VAR_ID=c(), rsID=c()))
131 |   
132 |   write(need_proxies_varid, "need_proxies_varid.tmp")
133 |   varid_rsid_map <- fread(cmd=paste0("grep -wFf need_proxies_varid.tmp ",
134 |                                      rsID_map_file),
135 |                           header=F, col.names=c("VAR_ID", "rsID"),
136 |                           data.table=F, stringsAsFactors=F)
137 |   need_proxies_rsid <- varid_rsid_map$rsID[match(need_proxies_varid, 
138 |                                                  varid_rsid_map$VAR_ID)]
139 |   print(paste0("...", length(unique(varid_rsid_map$rsID)), 
140 |                " of these are mapped to rsIDs"))
141 |   system("rm need_proxies_varid.tmp")
142 |   
143 |   left_join(tibble(VAR_ID=need_proxies_varid), varid_rsid_map, by="VAR_ID")
144 | }
145 | 
146 | 
choose_proxies <- function(need_proxies, 
                           tabix_path, ld_file, 
                           rsID_map_file, trait_ss_files,
                           pruned_variants) {
  
  # Given the variants needing proxies (from find_variants_needing_proxies)
  # and an LD reference file, choose one proxy per variant and output the
  # final set of variant IDs.
  # Criteria for proxy eligibility:
  #   Not strand-ambiguous
  #   Not multi-allelic
  #   Trait fraction >= 80%
  #   r^2 >= 0.8 with the index variant
  # Choose based on first trait count, then r^2
  #
  # Args:
  #   need_proxies:    data frame with columns VAR_ID and rsID
  #   tabix_path:      path to the tabix executable
  #   ld_file:         LD reference file queried with tabix by rsID
  #   rsID_map_file:   whitespace-delimited VAR_ID/rsID map
  #   trait_ss_files:  named vector of trait summary statistics files
  #   pruned_variants: data frame with a VAR_ID column
  #   (tabix_path, ld_file and rsID_map_file are inserted into shell commands
  #   unquoted, exactly as given)
  # Returns:
  #   Character vector of unique VAR_IDs: pruned variants that need no proxy
  #   plus the chosen proxies.
  # Side effect: writes no_proxies_found.txt when some variants get no proxy.
  
  need_varids <- need_proxies[["VAR_ID"]]
  need_rsids <- need_proxies[["rsID"]]
  # Variants without an rsID cannot be looked up in the LD reference
  query_rsids <- unique(need_rsids[!is.na(need_rsids)])
  
  find_best_proxies <- function() {
    no_proxies <- tibble(rsID=character(), proxy_VAR_ID=character())
    if (length(query_rsids) == 0) return(no_proxies)
    
    # Session temp files, removed on any exit path (including errors)
    ld_tmp <- tempfile("ld_ref_")
    proxy_rsid_tmp <- tempfile("potential_proxies_rsid_")
    proxy_map_tmp <- tempfile("potential_proxies_map_")
    on.exit(unlink(c(ld_tmp, proxy_rsid_tmp, proxy_map_tmp)), add=TRUE)
    
    # Run "/path/to/tabix /path/to/LDfile rsID_1 rsID_2 ..."
    system(paste0(tabix_path, " ", ld_file, " ",
                  paste(query_rsids, collapse=" "),
                  " > ", shQuote(ld_tmp)))
    if (!isTRUE(file.size(ld_tmp) > 0)) return(no_proxies)
    
    proxy_df <- read_tsv(ld_tmp, col_names=c("rsID", "proxy_data"),
                         col_types=cols(.default=col_character())) %>%
      separate_rows(proxy_data, sep=";") %>%
      separate(proxy_data, into=c("proxy_rsID", "r2", "D"), sep=",") %>%
      # separate() yields character columns; without this the r2 threshold
      # and ordering below would compare strings (e.g. "1e-05" >= 0.8)
      mutate(r2=as.numeric(r2))
    
    proxy_rsids <- unique(proxy_df$proxy_rsID[!is.na(proxy_df$proxy_rsID)])
    if (length(proxy_rsids) == 0) return(no_proxies)
    
    writeLines(proxy_rsids, proxy_rsid_tmp)
    system(paste0("grep -wFf ", shQuote(proxy_rsid_tmp), " ", rsID_map_file,
                  " > ", shQuote(proxy_map_tmp)))
    # grep writes nothing when no proxy is in the map; fread cannot parse that
    if (!isTRUE(file.size(proxy_map_tmp) > 0)) return(no_proxies)
    potential_proxies_map <- fread(
      proxy_map_tmp,
      header=FALSE, col.names=c("proxy_VAR_ID", "proxy_rsID"),
      data.table=FALSE, stringsAsFactors=FALSE
    )
    
    proxy_missingness <- count_traits_per_variant(
      unique(potential_proxies_map$proxy_VAR_ID),
      trait_ss_files
    )
    proxy_missingness_df <- tibble(
      proxy_VAR_ID=names(proxy_missingness),
      frac_nonmissing=unname(proxy_missingness)
    )
    
    proxy_df %>%
      inner_join(potential_proxies_map, by="proxy_rsID") %>%
      separate(proxy_VAR_ID, into=c("CHR", "POS", "REF", "ALT"), 
               sep="_", remove=FALSE) %>%
      inner_join(proxy_missingness_df, by="proxy_VAR_ID") %>%
      filter(
        !(paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")),  # Not strand-ambiguous
        !grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+,[ACGT]+$", proxy_VAR_ID),  # Not multi-allelic
        frac_nonmissing >= 0.8,  # Sufficient fraction of traits non-missing
        r2 >= 0.8  # Sufficient LD with the proxied variant
      ) %>%
      group_by(rsID) %>%
      arrange(desc(frac_nonmissing),
              desc(r2),
              CHR) %>%  # Arbitrary sort for reproducibility in case of missingness + r2 ties
      dplyr::slice(1) %>%
      ungroup()
  }
  final_proxy_df <- find_best_proxies()
  
  proxies_found <- final_proxy_df$rsID
  # NOTE: variants without an rsID appear here as a single NA entry
  no_proxies_found <- setdiff(need_rsids, proxies_found)
  no_proxy_needed <- setdiff(pruned_variants$VAR_ID, need_varids)
  print(paste0("No proxies needed for ", 
               length(no_proxy_needed), 
               " variants."))
  print(paste0("Proxies found for ", length(proxies_found), " variants."))
  print(paste0("No adequate proxies found for ", length(no_proxies_found), 
               " variants."))
  if (length(no_proxies_found) > 0) {
    write(no_proxies_found, "no_proxies_found.txt")
    print("See no_proxies_found.txt for a list of these variants.")
  }
  
  final_variant_set <- c(
    no_proxy_needed,  # Original pruned variants that don't need proxies
    final_proxy_df$proxy_VAR_ID  # Proxy variants fulfilling the necessary criteria
  )
  unique(final_variant_set) 
  # NOTE: the unique() above simply discards duplicate proxies 
  # (same proxy for multiple variants and/or proxy variant that is already a primary GWAS variant).
  # There may be a better way to deal with this.

}
229 | 
230 | 
231 | 


--------------------------------------------------------------------------------
/scripts/archive/gwas_variant_selection.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | 
  3 | 
  4 | diamante_gwas <- read_tsv(
  5 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAMANTE_eu_UKBB_dv2/T2D/DATA/GWAS_DIAMANTE_eu_UKBB_dv2.T2D.1.txt",
  6 | ) %>%
  7 |   select(varID=VAR_ID, p=P_VALUE, N=N_PH) %>%
  8 |   filter(p < 5e-8) %>%
  9 |   separate(varID, into=c("chr", "pos", "EA", "NEA"), sep="_", remove=F)
 10 | 
 11 | exchip_gwas <- read_tsv(
 12 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/ExChip_ExTexT2D_dv1/T2D/DATA/ExChip_ExTexT2D_dv1.T2D.1.txt",
 13 | ) %>%
 14 |   select(varID=VAR_ID, p=P_VALUE, N=Neff) %>%
 15 |   filter(p < 5e-8) %>%
 16 |   separate(varID, into=c("chr", "pos", "EA", "NEA"), sep="_", remove=F)
 17 |   
 18 | diagram_gwas <- read_tsv(
 19 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAGRAM_eu_dv2/T2D/DATA/GWAS_DIAGRAM_eu_dv2.T2D.1.txt",
 20 | ) %>%
 21 |   select(varID=VAR_ID, p=P_VALUE, N=N_PH) %>%
 22 |   filter(p < 5e-8) %>%
 23 |   separate(varID, into=c("chr", "pos", "EA", "NEA"), sep="_", remove=F)
 24 | 
 25 | wgs_got2d_gwas <- read_tsv(
 26 |   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/WGS_GoT2Dimputed_dv1/T2D/DATA/WGS_GoT2Dimputed_dv1.T2D.1.txt",
 27 | ) %>%
 28 |   select(varID=VAR_ID, p=P_VALUE, N=N_PH) %>%
 29 |   filter(p < 5e-8) %>%
 30 |   separate(varID, into=c("chr", "pos", "EA", "NEA"), sep="_", remove=F)
 31 | 
 32 | mahajan_gwas <- read_tsv(
 33 |   "/humgen/diabetes2/users/clairekim/Mahajan.NatGenet2018b.T2D.European.txt",
 34 |   col_types=cols(SNP="c")
 35 | ) %>%
 36 |   select(varID=SNP, chr=Chr, pos=Pos, p=Pvalue, N=Neff, EA, NEA) %>%
 37 |   mutate(varID=paste(chr, pos, EA, NEA, sep="_")) %>%
 38 |   filter(p < 5e-8)
 39 | 
 40 | t2d_eur_gwas <- read_tsv(
 41 |   "/humgen/diabetes2/users/clairekim/T2D_European.BMIunadjusted.txt",
 42 |   col_types=cols(SNP="c")
 43 | ) %>%
 44 |   select(varID=SNP, chr=CHR, pos=BP, p=Pvalue, N=Neff, 
 45 |          EA=EFFECT_ALLELE, NEA=OTHER_ALLELE) %>%
 46 |   mutate(varID=paste(chr, pos, EA, NEA, sep="_")) %>%
 47 |   filter(p < 5e-8)
 48 | 
 49 | 
 50 | choose_gwas_variants <- function(ss_df_list) {
 51 |   
 52 |   # Given a list of summary statistic data frames, choose variants to take
 53 |   # forward for clustering
 54 |   # Summary statistic data frames should contain the following fields:
 55 |   # chr, pos, N, p, EA, NEA
 56 |   
 57 |   # Filter out studies with N < 10k
 58 |   low_N <- sapply(ss_df_list, function(ss_df) median(ss_df$N) < 10000)
 59 |   ss_df_list <- ss_df_list[which(!low_N)]
 60 |   
 61 |   # Standardize column types for binding
 62 |   ss_df_list <- lapply(ss_df_list, function(df) {
 63 |     mutate(df, 
 64 |            chr=as.character(chr),
 65 |            pos=as.integer(pos))
 66 |   })
 67 |   
 68 |   # Bind summary stats from each GWAS and select variants
 69 |   do.call(bind_rows, c(ss_df_list, .id="gwas")) %>%
 70 |     group_by(chr, pos) %>%
 71 |     dplyr::slice(which.max(N)) %>%
 72 |     ungroup()
 73 | }
 74 | 
 75 | gw_variants_list <- list(
 76 |   diamante=diamante_gwas,
 77 |   exchip=exchip_gwas,
 78 |   diagram=diagram_gwas,
 79 |   wgs_got2d=wgs_got2d_gwas,
 80 |   mahajan=mahajan_gwas,
 81 |   t2d_eur=t2d_eur_gwas
 82 | )
 83 | 
 84 | initial_gwas_variants_df <- choose_gwas_variants(gw_variants_list)
 85 | 
 86 | # input_file_list <- list(
 87 | #   c(varID="VAR_ID", p="P_VALUE", N="N_PH"),
 88 | #   c(varID="VAR_ID", p="P_VALUE", N="Neff"),
 89 | #   c(varID="VAR_ID", p="P_VALUE", N="N_PH"),
 90 | #   c(varID="VAR_ID", p="P_VALUE", N="N_PH"),
 91 | #   c(varID="VAR_ID", p="Pvalue", n="Neff"),
 92 | #   c(varID="VAR_ID", p=)
 93 | # )
 94 | # 
 95 | # names(input_file_list) <- c(
 96 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAMANTE_eu_UKBB_dv2/T2D/DATA/GWAS_DIAMANTE_eu_UKBB_dv2.T2D.1.txt",
 97 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/ExChip_ExTexT2D_dv1/T2D/DATA/ExChip_ExTexT2D_dv1.T2D.1.txt",
 98 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/GWAS_DIAGRAM_eu_dv2/T2D/DATA/GWAS_DIAGRAM_eu_dv2.T2D.1.txt",
 99 | #   "/humgen/diabetes2/users/mvg/portal/scripts/VARIANTS/PHENOTYPES/WGS_GoT2Dimputed_dv1/T2D/DATA/WGS_GoT2Dimputed_dv1.T2D.1.txt",
100 | #   "/humgen/diabetes2/users/clairekim/Mahajan.NatGenet2018b.T2D.European.txt",
101 | #   "/humgen/diabetes2/users/clairekim/T2D_European.BMIunadjusted.txt"
102 | # )
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/scripts/archive/main.BayesNMF.script_to_Jaeyoon_edit_claire_T2D.R:
--------------------------------------------------------------------------------
  1 | ############################################################################################
  2 | ############################################################################################
  3 | #### Copyright (c) 2017, Broad Institute
  4 | #### Redistribution and use in source and binary forms, with or without
  5 | #### modification, are permitted provided that the following conditions are
  6 | #### met:
  7 | ####     Redistributions of source code must retain the above copyright
  8 | ####     notice, this list of conditions and the following disclaimer.
  9 | ####     Redistributions in binary form must reproduce the above copyright
 10 | ####     notice, this list of conditions and the following disclaimer in
 11 | ####     the documentation and/or other materials provided with the
 12 | ####     distribution.
 13 | ####     Neither the name of the Broad Institute nor the names of its
 14 | ####     contributors may be used to endorse or promote products derived
 15 | ####     from this software without specific prior written permission.
 16 | #### THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 17 | #### "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 18 | #### LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 19 | #### A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 20 | #### HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 21 | #### SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 22 | #### LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 23 | #### DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 24 | #### THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 25 | #### (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 26 | #### OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 27 | ############################################################################################
 28 | ############################################################################################
 29 | 
 30 | ######################################################################################################
 31 | ####### Bayesian NMF algorithms for clustering
 32 | ######################################################################################################
 33 | ####### For implementation details see the paper 
 34 | ####### Udler MS, Kim J, von Grotthuss M,
 35 | ####### Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018)
 36 | ####### Type 2 diabetes genetic loci informed by multi-trait
 37 | ####### associations point to disease mechanisms and
 38 | ####### subtypes: A soft clustering analysis. PLoS Med 15
 39 | ####### (9): e1002654.
 40 | #################################
 41 | ####### For details on the original algorithms 
 42 | ####### see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence.
 43 | ####### IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013).
 44 | ######################################################################################################
 45 | 
 46 | ###########################
 47 | ###########################
 48 | ##### Bayesian NMF with half-normal priors for W and H
 49 | BayesNMF.L2EU <- function(V0,n.iter,a0,tol,K,K0,phi) {
 50 |         eps <- 1.e-50
 51 |         del <- 1.0
 52 |         active_nodes <- colSums(V0) != 0
 53 |         V0 <- V0[,active_nodes]
 54 |         V <- V0-min(V0)
 55 |         Vmin <- min(V)
 56 |         Vmax <- max(V)
 57 |         N <- dim(V)[1]
 58 |         M <- dim(V)[2]
 59 | 
 60 |         W <- matrix(runif(N * K)*Vmax,ncol=K)
 61 |         H <- matrix(runif(M * K)*Vmax,ncol=M)
 62 |         I <- array(1,dim=c(N,M))
 63 |         V.ap <- W%*%H+eps
 64 | 
 65 |         phi <- sd(V)^2*phi
 66 |         C <- (N+M)/2+a0+1
 67 |         b0 <- 3.14*(a0-1)*mean(V)/(2*K0)
 68 |         lambda.bound <- b0/C
 69 |         lambda <- (0.5*colSums(W^2)+0.5*rowSums(H^2)+b0)/C
 70 |         lambda.cut <- lambda.bound*1.5
 71 | 
 72 |         n.like <- list()
 73 |         n.evid <- list()
 74 |         n.error <- list()
 75 |         n.lambda <- list()
 76 |         n.lambda[[1]] <- lambda
 77 |         iter <- 2
 78 |         count <- 1
 79 |         while (del >= tol & iter < n.iter) {
 80 |                 H <- H*(t(W)%*%V)/(t(W)%*%V.ap+phi*H*matrix(rep(1/lambda,M),ncol=M)+eps)
 81 |                 V.ap <- W %*% H + eps
 82 |                 W <- W*(V%*%t(H))/(V.ap%*%t(H)+phi*W*t(matrix(rep(1/lambda,N),ncol=N))+eps)
 83 |                 V.ap <- W %*% H + eps
 84 |                 lambda <- (0.5*colSums(W^2)+0.5*rowSums(H^2)+b0)/C
 85 |                 del <- max(abs(lambda-n.lambda[[iter-1]])/n.lambda[[iter-1]])
 86 |                 like <- sum((V-V.ap)^2)/2
 87 |                 n.like[[iter]] <- like
 88 |                 n.evid[[iter]] <- like + phi*sum((0.5*colSums(W^2)+0.5*rowSums(H^2)+b0)/lambda+C*log(lambda))
 89 |                 n.lambda[[iter]] <- lambda
 90 |                 n.error[[iter]] <- sum((V-V.ap)^2)
 91 |                 if (iter %% 100 == 0) {
 92 |                         cat(iter,n.evid[[iter]],n.like[[iter]],n.error[[iter]],del,sum(colSums(W)!=0),sum(lambda>=lambda.cut),'\n')
 93 |                 }
 94 |                 iter <- iter+1
 95 |         }
 96 |         return(list(W,H,n.like,n.evid,n.lambda,n.error))
 97 | }
 98 | 
 99 | plot.heatmap.ggplot.new <- function(mat) {
100 | 	scale0 <- 0.8
101 |         scale <- 1
102 |         g.ordering <- c("G4","G3","G2","G1")
103 |         color.axis <- "black"
104 |         .theme_ss <- theme_bw(base_size=12) +
105 |                 theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8*scale, family="mono",face='bold',color=color.axis),
106 |                 axis.text.y = element_text(hjust = 0.5,size=8*scale, family="mono",face='bold',color=color.axis),
107 |                 axis.text = element_text(size = 12*scale, family = "mono",color=color.axis),
108 |                 axis.title=element_text(face="bold",size=12*scale,color="black"),
109 |                 plot.title=element_text(face="bold",size=12*scale))
110 |         mat[mat < 1.e-10] <- 0
111 |         hc <- hclust(dist(mat,method="euclidean"),method="ward.D")
112 |         feature.ordering <- hc$labels[hc$order]
113 |         df <- melt(mat)
114 |         colnames(df) <- c("feature","signature","activity")
115 |         #df$feature <- factor(df$feature,levels=feature.ordering)
116 |         #df$signature <- factor(df$signature,levels=c("W4","W3","W2","W1"))
117 |         p = ggplot(df,aes(y=feature,x=signature,fill=activity))+geom_tile() #geom_tile(colour="yellow")
118 |         p = p + scale_fill_gradient2(low="white",high ="black",name=paste("Activity",sep=""))
119 |         #p = p + scale_fill_gradientn(values=c(0,0.1,0.2,0.5,0.7,1.0),colours=c("yellow","green","black","red","magenta"),limit=c(0,1.0))
120 |         p = p + .theme_ss
121 |         p = p + ggtitle("Feature Assoication to Clusters")
122 |         #p = p + ylab("Contributions") + xlab("Feature")
123 |         p = p + theme(axis.title.y = element_text(face="bold",colour="black",size=12*scale0))
124 |         p = p + theme(axis.title.x = element_text(face="bold",colour="black",size=12*scale0))
125 |         p = p + theme(legend.position="right")
126 |         p = p + theme(legend.key.size = unit(0.5, "cm"))
127 |         #pdf(file=paste(OUTPUT,"feature.association_to_clusters.pdf",sep=""),width=10,height=4)
128 |                 plot(p)
129 |         #dev.off()
130 | }
131 | 
plot.heatmap.2 <- function(x,rowTF,colTF) {
        ## Draw a green-red heatmap of `x` with gplots::heatmap.2, using
        ## Euclidean distance and Ward (ward.D) clustering.
        ## rowTF / colTF are passed on as heatmap.2's Rowv / Colv arguments.
        s1 <- 0.75
        mydist <- function(c) {dist(c,method="euclidean")}
        myclust <- function(c) {hclust(c,method="ward.D")}
        heatmap.2(as.matrix(x), hclustfun=myclust, distfun=mydist, na.rm = TRUE, scale="none", dendrogram="both",margins=c(8,8),
                Rowv=rowTF, Colv=colTF, symbreaks=FALSE, key=TRUE, symkey=FALSE,
                density.info="none", trace="none",labCol=colnames(x),labRow=rownames(x),col=greenred(40),cex.lab=s1,cexRow=0.7,cexCol=0.7,keysize=s1)
}
142 | 
143 | library(gplots)
144 | library(RColorBrewer)
145 | library(ggplot2)
146 | library(reshape)
147 | library(reshape2)
148 | 
149 | #CURRENT <- paste(getwd(),"/",sep="")
150 | #OUTPUT <- paste(CURRENT,"OUTPUT/DIST_test/",sep="")
151 | #system(paste("mkdir",OUTPUT,sep=" "))
152 | 
153 | ##### mat.all: the trait by genotype matrix; the positive and negative association of each trait to genes are separately handled with distinct features.
154 | ##### mat.all in this script is not exactly the same as the one used in the paper; it is included as an example input.
155 | #load("t2d_data.example.RData")
156 | #mat_all <- mat.all
157 | # load data
158 | library(tidyverse)
# All output paths below are built from this working directory.
setwd("C:/Users/hk745/Dropbox (Partners HealthCare)")

# CURRENT: working directory with a trailing slash
CURRENT <- paste0(getwd(), "/")
# OUTPUT: directory prefix for every file written by this script
OUTPUT <- paste0(CURRENT, "OUTPUT/DIST_test/T2D/trait_test/")
163 | 
164 | #inputtraits <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/traitlist_pval_0.05_snpcnt_filterGIANT.txt",stringsAsFactors = FALSE, header = F)
165 | #inputtraits$V1 <- gsub("-", ".", inputtraits$V1)
166 | 
167 | 
168 | #t2d_snps <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2D_98snps_23traits.txt",stringsAsFactors = FALSE)
169 | #filter <- t2d_snps
170 | #t2d_snps <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2D_98snps_noloop_minusCHOL.txt",stringsAsFactors = FALSE)
171 | #t2d_snps <- t2d_snps[,names(t2d_snps) %in% names(filter)]
172 | #gbd_snps <- t2d_snps
173 | 
174 | 
175 | #filtered <- read.delim("C:/Users/hk745/Dropbox (Partners HealthCare)/T2D_filter.txt",stringsAsFactors = FALSE, header = F)
176 | #filtered <- filtered[filtered$V2<0.05,]
177 | 
# Read the locus-by-trait table (one row per locus)
gbd_snps <- read.delim(
  "C:/Users/hk745/Dropbox (Partners HealthCare)/T2Donly_eu_45traits_412snps.txt",
  stringsAsFactors = FALSE
)

#gbd_snps <- gbd_snps[gbd_snps$VAR_ID_hg19 %in% filtered$V1,]

#gbd_snps$GLGC_dv2.CHOL.ZN <- NULL

# Label rows by locus, drop the first four columns, then transpose so that
# the remaining columns become rows and the loci become columns
gbd_t <- `rownames<-`(gbd_snps, gbd_snps$locus)
gbd_t <- t(gbd_t[, -seq_len(4)])
# "." marks a missing value in the input file
is.na(gbd_t) <- which(gbd_t == ".")
188 | 
189 | 
190 | 
191 | #gbd_t <- gbd_t[rownames(gbd_t) %in% inputtraits$V1,]
192 | #write.table(rownames(gbd_t),file=paste(CURRENT,paste("traitlist_0.05_snpcnt.txt",sep="."),sep=""), append = F, quote = F, sep = "\t",
193 | #            eol = "\n", na = "NA", dec = ".", row.names = F,
194 |  #           col.names = F, qmethod = c("escape", "double"))
195 | 
gbd_t <- as.data.frame(gbd_t,stringsAsFactors = FALSE)
# Drop columns (loci) that are missing in more than 20% of the rows.
# Columns are selected by position: going through as.data.frame() on a named
# list rewrites non-syntactic column names (check.names), so a name-based
# lookup could silently fail to drop such columns.
na.per.col <- colSums(is.na(gbd_t))
keep.col <- na.per.col <= nrow(gbd_t)*0.2
# drop=FALSE keeps a data frame even if a single column survives
gbd_t <- gbd_t[, keep.col, drop = FALSE]
202 | 
203 | #gbd <- as.data.frame(t(gbd_t))
204 | #nat <- as.data.frame(as.data.frame(map(gbd, ~sum(is.na(.)))))
205 | #nadropt <- nat[,nat>dim(gbd)[1]*0.2]
206 | 
# Remaining missing values are treated as 0
gbd_t[is.na(gbd_t)] <- 0

# Numeric matrix with the same row and column names as gbd_t; converting the
# whole matrix at once (instead of apply() over data-frame rows) also works
# when only one column is left.
mat.num <- as.matrix(gbd_t)
storage.mode(mat.num) <- "double"

# Split every row into a non-negative "pos" part and a sign-flipped "neg"
# part, so that all entries of mat.all are >= 0
mat.pos <- ifelse(mat.num >= 0, mat.num, 0)
mat.neg <- ifelse(mat.num >= 0, 0, -mat.num)
rownames(mat.pos) <- paste(rownames(mat.pos), "pos", sep = "_")
rownames(mat.neg) <- paste(rownames(mat.neg), "neg", sep = "_")
mat.all <- rbind(mat.pos,mat.neg)
222 | 
223 | 
224 | 
225 | ##### simple hierarchical clustering ############## not working
226 | library(pheatmap)
# Ward clustering of the rows and of the columns of mat.all; the resulting
# leaf orders are used to arrange the matrix before plotting
hc1 <- hclust(dist(mat.all,method="euclidean"),method="ward.D")
hc1.ordering <- hc1$labels[hc1$order]
hc2 <- hclust(dist(t(mat.all),method="euclidean"),method="ward.D")
hc2.ordering <- hc2$labels[hc2$order]
order1 <- match(hc1.ordering,rownames(mat.all),nomatch=0)
order2 <- match(hc2.ordering,colnames(mat.all),nomatch=0)
pdf(file=paste(OUTPUT,"hierarchicalc.mat.pdf",sep=""),width=40,height=10)
fontsize_row <- 8
fontsize_col <- 0.5
# pheatmap() does not return a ggplot object, so a ggplot2 scale cannot be
# added to it with "+"; the former "+ scale_fill_gradient2(...)" is removed.
# NOTE(review): pheatmap clusters rows and columns itself by default, which
# presumably overrides the order1/order2 arrangement - confirm the intent.
pheatmap(mat.all[order1,order2], fontsize_col = fontsize_col, fontsize_row = fontsize_row)
#plot.heatmap.2(mat.all[order1,order2],F,F)
dev.off()
239 | 
240 | 
241 | 
242 | ##### Running Bayesian NMF with half-normal priors for W and H 
#n.iter <- 100 ### number of runs.. you can increase this number
n.iter <- 30 ### number of runs.. you can increase this number

### Each run starts from a fresh random initialisation; the result of run i
### is saved as res.L2EU.Bayes.<i>.RData under OUTPUT
for (i in seq_len(n.iter)) {
        res <- BayesNMF.L2EU(as.matrix(mat.all), n.iter=200000, a0=10,
                             tol=1.e-07, K=10, K0=10, phi=1.0)
        save(res,file=paste0(OUTPUT,paste("res.L2EU.Bayes",i,"RData",sep=".")))
}
252 | 
### Summarise every saved run: K (number of lambda values above the smallest
### one), the last recorded "evid" value, and the run index
tmpK <- numeric(n.iter)
tmpE <- numeric(n.iter)
tmpRUN <- numeric(n.iter)
for (i in seq_len(n.iter)) {
        load(file=paste0(OUTPUT,paste("res.L2EU.Bayes",i,"RData",sep=".")))
        lambda <- res[[5]]
        lambda <- lambda[[length(lambda)]]
        lambda <- lambda-min(lambda)
        tmpK[i] <- sum(lambda > 0)
        evid <- res[[4]]
        ### [[ ]] extracts the number itself; evid[length(evid)] is a
        ### one-element list and would turn tmpE into a list
        tmpE[i] <- evid[[length(evid)]]
        tmpRUN[i] <- i
}
df.run <- data.frame(K=tmpK,evid=tmpE,run=tmpRUN)
df.run <- df.run[order(df.run$evid,decreasing=TRUE),]
write.table(df.run,file=paste0(OUTPUT,"summary.run.txt"), append = FALSE, quote = FALSE, sep = "\t",
        eol = "\n", na = "NaN", dec = ".", row.names = FALSE,
        col.names = TRUE, qmethod = c("escape", "double"))
272 | 
273 | #### df.run - the summary data-frame for bNMF runs; K = number of clusters, evid = -log(posterior), run = the index of bNMF run
274 | #### How to choose K: (i) We usually prefer the most probable K. For example, here 57% K=5 and 43% K=4, so we will consider K=5.
275 | #### (ii) After selecting K, look at "evid" for all runs with the selected K (here K=5) and choose the run with the lowest "evid" 
276 | #### corresponding to the maximum posterior solution
277 | #### (iii) Sometimes you may need a manual inspection for other solutions based on your prior knowledge or biological consideration. 
278 | #### Specially, when your most probable solution corresponds to the lowest K, it is recommended to examine the solution with (K+1) and check which solution 
279 | #### is more biologically plausible. 
280 | 
281 | #### Below we will generate outputs of the maximum posterior solutions at different K
unique.K <- table(df.run$K)
n.K <- length(unique.K) ### number of distinct K
### For every observed K, the index of the bNMF run with the lowest "evid"
MAP.K <- vapply(names(unique.K), function(k) {
        runs.k <- df.run[df.run$K==as.numeric(k),]
        runs.k$run[which.min(runs.k$evid)]
}, numeric(1), USE.NAMES = FALSE)
289 | 
### Draw a tile plot of cluster activities and write it to a PDF file.
###   mat      matrix with the items to display (features or genes) in rows
###            and the clusters (W1, W2, ...) in columns
###   title    plot title
###   x.label  label of the x axis
###   pdf.file path of the PDF file to write
### Items are ordered along the x axis by Ward clustering of the rows of mat.
plot.association.tiles <- function(mat, title, x.label, pdf.file) {
        scale0 <- 0.8
        scale <- 1
        color.axis <- "black"
        .theme_ss <- theme_bw(base_size=12) +
                theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8*scale, family="mono",face='bold',color=color.axis),
                axis.text.y = element_text(hjust = 0.5,size=12*scale, family="mono",face='bold',color=color.axis),
                axis.text = element_text(size = 12*scale, family = "mono",color=color.axis),
                axis.title=element_text(face="bold",size=12*scale,color="black"),
                plot.title=element_text(face="bold",size=12*scale))
        mat[mat < 1.e-10] <- 0
        hc <- hclust(dist(mat,method="euclidean"),method="ward.D")
        df <- melt(mat)
        colnames(df) <- c("item","signature","activity")
        df$item <- factor(df$item,levels=hc$labels[hc$order])
        df$signature <- factor(df$signature,levels=colnames(mat))
        p <- ggplot(df,aes(x=item,y=signature,fill=activity))+geom_tile()
        p <- p + scale_fill_gradient2(low="white",high ="black",name="Activity")
        p <- p + .theme_ss
        p <- p + ggtitle(title)
        p <- p + ylab("Contributions") + xlab(x.label)
        p <- p + theme(axis.title.x = element_text(face="bold",colour="black",size=12*scale0))
        p <- p + theme(axis.title.y = element_text(face="bold",colour="black",size=12*scale0))
        p <- p + theme(legend.position="right")
        p <- p + theme(legend.key.size = unit(0.5, "cm"))
        pdf(file=pdf.file,width=10,height=4)
        on.exit(dev.off(), add = TRUE) ### close the device even if plotting fails
        plot(p)
        invisible(p)
}

### For every observed K, load the selected run and write its W and H matrices
### and the two association plots
for (m in seq_len(n.K)) {

        index.m <- as.numeric(names(unique.K)[m])

        load(file=paste0(OUTPUT,paste("res.L2EU.Bayes",MAP.K[m],"RData",sep=".")))
        W <- res[[1]]
        H <- res[[2]]
        ### keep non-empty components; drop=FALSE keeps W and H matrices even
        ### when a single component remains
        W <- W[,colSums(W)!=0,drop=FALSE]
        H <- H[rowSums(H)!=0,,drop=FALSE]
        colnames(W) <- paste0("W",seq_len(ncol(W)))
        rownames(H) <- colnames(W)
        W[W < 1.e-10] <- 0 ### feature-cluster association matrix
        H[H < 1.e-10] <- 0 ### cluster-gene association matrix

        W0 <- data.frame(W)
        W0[,"feature"] <- rownames(W)
        H0 <- data.frame(H)
        H0[,"cluster"] <- rownames(H)

        write.table(W0,file=paste0(OUTPUT,paste("L2EU.W.mat",index.m,"txt",sep=".")), append = FALSE, quote = FALSE, sep = "\t",
                eol = "\n", na = "NaN", dec = ".", row.names = FALSE,
                col.names = TRUE, qmethod = c("escape", "double"))
        write.table(H0,file=paste0(OUTPUT,paste("L2EU.H.mat",index.m,"txt",sep=".")), append = FALSE, quote = FALSE, sep = "\t",
                eol = "\n", na = "NaN", dec = ".", row.names = FALSE,
                col.names = TRUE, qmethod = c("escape", "double"))

        mat.reconstructed <- W%*%H ### reconstructed matrix == approximation for the input matrix

        plot.association.tiles(
                W,
                title="Feature Assoication to Clusters",
                x.label="Feature",
                pdf.file=paste0(OUTPUT,paste("L2EU.feature.association_to_clusters",index.m,"pdf",sep="."))
        )
        ### H is cluster x gene; transpose so that genes are the plotted items
        plot.association.tiles(
                t(H),
                title="Gene Assoication to Clusters",
                x.label="Genes",
                pdf.file=paste0(OUTPUT,paste("L2EU.gene.association_to_clusters",index.m,"pdf",sep="."))
        )
}
407 | 
408 | 


--------------------------------------------------------------------------------
/scripts/archive/prep_bNMF_2021.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(data.table)
  3 | 
  4 | fetch_summary_stats <- function(variant_vec, gwas_ss_file, trait_ss_files) {
  5 |   
  6 |   # Given a final (pruned & proxied) set of variants to be clustered, 
  7 |   # fetch z-scores and sample size info from summary statistics for each of a 
  8 |   # series of traits
  9 |   # INPUTS:
 10 |   #   - variant_vec: vector of variants to be clustered
 11 |   #   - gwas_ss: filepath of  with VAR_IDs and betas from the original GWAS
 12 |   #   - trait_ss_vec: named vector of trait summary statistic filepaths
 13 |   # Final variant vector should be in VAR_ID format: [CHR]_[POS]_[REF]_[ALT] (using hg19)
 14 |   # GWAS summary statistic data frame must have at least the following 
 15 |   # columns: SNP (CHR:POS), REF, ALT, BETA
 16 |   # Each trait summary statistic dataset must have the following 
 17 |   # columns: VAR_ID, Effect_Allele_PH, BETA, SE, P_VALUE, N_PH
 18 |   
 19 |   # ISSUES:
 20 |   # - potential for strand-flip?
 21 |   
 22 |   read_single_trait <- function(trait, variant_df) {
 23 |     # Read/filter/process summary statistics for a single trait
 24 |     print(paste0("Processing ", trait, "..."))
 25 |     df <- fread(trait_ss_files[[trait]], data.table=F, stringsAsFactors=F)
 26 |     # if (grepl("\\/UKBB\\/", trait_ss_path)) {
 27 |     #   df <- df %>%
 28 |     #     mutate(VAR_ID=gsub(":", "_", variant)) %>%
 29 |     #     filter(VAR_ID %in% final_variant_ss$VAR_ID) %>%
 30 |     #     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_", remove=F) %>%
 31 |     #     select(VAR_ID, Effect_Allele_PH=ALT, BETA=beta, SE=se, P_VALUE=pval, N_PH=n_complete_samples)
 32 |     # }
 33 |     # if (!("N_PH" %in% names(df))) df$N_PH <- as.integer(NA)
 34 |     df %>%
 35 |       separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_") %>%
 36 |       mutate(SNP=paste(CHR, POS, sep=":")) %>%
 37 |       select(SNP, Effect_Allele_PH, N_PH, BETA, SE, P_VALUE) %>%
 38 |       right_join(variant_df, by="SNP", suffix=c(".gwas", ".trait")) %>%
 39 |       mutate(z=BETA / SE,  # First, calculate z-score magnitude
 40 |              z=case_when(  # Next, align z-score sign with GWAS phenotype-raising allele
 41 |                Effect_Allele_PH == Risk_Allele ~ z,
 42 |                Effect_Allele_PH == Nonrisk_Allele ~ -z,
 43 |                TRUE ~ as.numeric(NA)  # For example, if trait effect allele matches neither REF nor ALT from GWAS
 44 |              )) %>%
 45 |       select(SNP, z, N_PH, P_VALUE)
 46 |   }
 47 |   
 48 |   print("Retrieving risk alleles from the original GWAS summary statistics...")
 49 |   gwas_ss <- fread(gwas_ss_file, data.table=F, stringsAsFactors=F) %>%
 50 |     mutate(Risk_Allele=ifelse(BETA > 0, ALT, REF),
 51 |            Nonrisk_Allele=ifelse(BETA > 0, REF, ALT)) %>%
 52 |     select(SNP, Risk_Allele, Nonrisk_Allele)
 53 |   variant_df <- tibble(VAR_ID=variant_vec) %>%
 54 |     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_") %>%
 55 |     mutate(SNP=paste(CHR, POS, sep=":")) %>%
 56 |     select(SNP) %>%
 57 |     inner_join(gwas_ss, by="SNP")
 58 |   print(paste0(nrow(variant_df), " of ", length(variant_vec),
 59 |                " variants are available in the primary GWAS."))
 60 |     
 61 |   
 62 |   print("Retrieving z-scores and sample sizes for each trait...")
 63 |   trait_df_long <- lapply(names(trait_ss_files), read_single_trait, variant_df) %>%
 64 |     setNames(names(trait_ss_files)) %>%
 65 |     bind_rows(.id="trait")  # Bind all processed trait datasets into a single "long" data frame
 66 |   
 67 |   z_df_wide <- trait_df_long %>%
 68 |     select(trait, SNP, z) %>%
 69 |     pivot_wider(names_from="trait", values_from="z")
 70 |   z_mat <- as.matrix(z_df_wide[, -1])
 71 |   rownames(z_mat) <- z_df_wide$SNP
 72 |   
 73 |   N_df_wide <- trait_df_long %>%
 74 |     select(trait, SNP, N_PH) %>%
 75 |     pivot_wider(names_from="trait", values_from="N_PH")
 76 |   N_mat <- as.matrix(N_df_wide[, -1])
 77 |   rownames(N_mat) <- N_df_wide$SNP
 78 |   
 79 |   # P_df <- trait_df_long %>%
 80 |   #   group_by(trait) %>%
 81 |   #   summarise(minP=min(P_VALUE, na.rm=T))
 82 |   
 83 |   list(z_mat=z_mat, N_mat=N_mat)
 84 |        # minP_vec=setNames(P_df$minP, P_df$trait))
 85 | }
 86 | 
 87 | 
 88 | prep_z_matrix <- function(z_mat, N_mat) {
 89 |   
 90 |   # Given a matrix of z-scores (N_variants x M_traits) and vector of median
 91 |   # sample sizes per trait:
 92 |   # 1) perform final pre-processing steps before bNMF clustering:
 93 |   # trait filtering by p-value, trait pruning based on correlation,
 94 |   # and z-score scaling based on sample size
 95 |   # 2) expand N x M matrix into N x 2M non-negative matrix
 96 |   
 97 |   # Filter traits by p-value (min. p-value < 0.05/N_variants)
 98 |   minP_vec <- apply(z_mat, 2, function(x) min(2 * pnorm(abs(x), lower.tail=F), na.rm=T))
 99 |   print(paste0("Removing traits with no variant having p < 0.05 / # variants: ",
100 |                paste(colnames(z_mat)[minP_vec >= 0.05 / nrow(z_mat)], 
101 |                      collapse=", ")))
102 |   z_mat <- z_mat[, minP_vec < 0.05 / nrow(z_mat)]
103 |   
104 |   # Prune traits by correlation (remove traits with Pearson |r| > 0.85)
105 |   trait_cor_mat <- cor(z_mat, use="pairwise.complete.obs")  # Trait-trait correlation matrix
106 |   trait_min_pvals <- minP_vec[names(minP_vec) %in% colnames(z_mat)]  # Remove filtered traits
107 |   remaining_traits <- names(sort(trait_min_pvals))
108 |   keep_traits <- c()
109 |   while (length(remaining_traits) > 0) {
110 |     # print(remaining_traits[1])
111 |     keep_traits <- c(keep_traits, remaining_traits[1])
112 |     remaining_traits <- setdiff(
113 |       remaining_traits, 
114 |       rownames(trait_cor_mat)[abs(trait_cor_mat[, remaining_traits[1]]) >= 0.85]
115 |     )
116 |   }
117 |   pruned_traits <- setdiff(colnames(z_mat), keep_traits)
118 |   print(paste("Traits removed in pruning process:", 
119 |               paste(pruned_traits, collapse=", ")))
120 |   z_mat <- z_mat[, keep_traits]
121 |   
122 |   # Adjust z-scores by sample size for each variant-trait combo
123 |   # i.e. (z = z / sqrt(medN) * mean(sqrt(medN_all_traits)))
124 |   print("Performing sample size adjustment...")
125 |   medN_vec <- apply(N_mat[, colnames(z_mat)], 2, median, na.rm=T)
126 |   z_mat <- z_mat / sqrt(N_mat[, colnames(z_mat)]) * mean(sqrt(medN_vec))
127 | 
128 |   
129 |   # Replace missing values with zero
130 |   print("Replacing remaining missing values with zero...")
131 |   print(paste0(sum(is.na(z_mat)), " missing values were replaced."))
132 |   z_mat[is.na(z_mat)] <- 0
133 |   
134 |   # Expand into N x 2M non-negative matrix
135 |   print("Expanding z-score matrix into non-negative matrix (N-variants x 2M-traits)...")
136 |   z_mat_pos <- z_mat
137 |   z_mat_pos[z_mat_pos < 0] <- 0
138 |   colnames(z_mat_pos) <- paste0(colnames(z_mat), "_pos")
139 |   z_mat_neg <- -z_mat
140 |   z_mat_neg[z_mat_neg < 0] <- 0
141 |   colnames(z_mat_neg) <- paste0(colnames(z_mat), "_neg")
142 |   final_z_mat <- cbind(z_mat_pos, z_mat_neg)
143 |   
144 |   # Write N x M and N x 2M matrices
145 |   saveRDS(z_mat, "z_score_mat.rds")
146 |   saveRDS(final_z_mat, "z_score_mat_nonnegative.rds")
147 |   
148 |   final_z_mat
149 | }
150 | 


--------------------------------------------------------------------------------
/scripts/archive/process_traits.R:
--------------------------------------------------------------------------------
 1 | library(readxl)
 2 | library(tidyverse)
 3 | 
 4 | 
 5 | fetch_variant_fracs <- function(trait_filepath_list, gwas_varIDs) {
 6 |   
 7 |   # Given a named list of filepaths to trait GWAS summary stats and a list of
 8 |   # primary GWAS variant IDs, return a named vector indicating the fraction of 
 9 |   # those traits having each variant
10 |   
11 |   trait_variants <- lapply(trait_filepaths, function(f) {
12 |     read_tsv(f, col_types=cols_only(VAR_ID="c"), n_max=100000) %>%
13 |       filter(VAR_ID %in% gwas_varIDs)
14 |   })
15 |   var_presence_df <- do.call(bind_rows, c(trait_variants, .id="trait")) %>%
16 |     mutate(var_present=1) %>%
17 |     pivot_wider(names_from="trait", values_from="var_present", values_fill=0)  # 0/1 values: is variant (row) present in trait GWAS (column)?
18 |   var_presence_mat <- as.matrix(var_presence_df[, 2:ncol(var_presence_df)])
19 |   rownames(var_presence_mat) <- var_presence_df$VAR_ID
20 |   var_fracs <- rowSums(var_presence_mat) / length(trait_variants)  # Fraction of traits having each variant
21 |   var_fracs
22 | }
23 | 
24 | 
# Trait metadata is read from the third sheet of the data-source workbook;
# it supplies one summary-statistics path (full_path) per trait (trait_name).
traits_doc <- read_excel(path="../data/clustering_data_source.xlsx", sheet=3)
trait_filepaths <- traits_doc$full_path
names(trait_filepaths) <- traits_doc$trait_name

# Draw 100 variant IDs at random.
# NOTE(review): chosen_variants is not defined in this script -- it is
# presumably expected to exist in the calling environment; confirm.
gwas_varIDs <- sample(x=chosen_variants$varID, size=100)

# Fraction of traits in which each sampled variant is present
variant_fracs <- fetch_variant_fracs(trait_filepaths, gwas_varIDs)
31 | 
32 | 
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/scripts/archive/proximal_preprocessing.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | 
 3 | 
 4 | ##### PREPROCESSING #####
 5 | 
 6 | prep_z_matrix <- function(z_mat, minP_vec, medN_vec) {
 7 |   
 8 |   # Given a matrix of z-scores (N_variants x M_traits) and vector of median
 9 |   # sample sizes per trait:
10 |   # 1) perform final pre-processing steps before bNMF clustering:
11 |   # trait filtering by p-value, trait pruning based on correlation,
12 |   # and z-score scaling based on sample size
13 |   # 2) expand N x M matrix into 2N x M non-negative matrix
14 |   
15 |   # Filter traits by p-value (min. p-value < 0.05/N_variants)
16 |   stopifnot(all(colnames(z_mat) == names(minP_vec)))
17 |   print(paste0("Removing traits with no variant having p < 0.05 / # variants: ",
18 |                paste(colnames(z_mat)[minP_vec >= 0.05 / nrow(z_mat)], 
19 |                      collapse=", ")))
20 |   z_mat <- z_mat[, minP_vec < 0.05 / nrow(z_mat)]
21 |   
22 |   # Prune traits by correlation (remove traits with Pearson |r| > 0.85)
23 |   trait_cor_mat <- cor(z_mat, use="pairwise.complete.obs")  # Trait-trait correlation matrix
24 |   trait_min_pvals <- minP_vec[names(minP_vec) %in% colnames(z_mat)]  # Remove filtered traits
25 |   remaining_traits <- names(sort(trait_min_pvals))
26 |   keep_traits <- c()
27 |   while (length(remaining_traits) > 0) {
28 |     # print(remaining_traits[1])
29 |     keep_traits <- c(keep_traits, remaining_traits[1])
30 |     remaining_traits <- setdiff(
31 |       remaining_traits, 
32 |       rownames(trait_cor_mat)[abs(trait_cor_mat[, remaining_traits[1]]) >= 0.85]
33 |     )
34 |   }
35 |   pruned_traits <- setdiff(colnames(z_mat), keep_traits)
36 |   print(paste("Traits removed in pruning process:", 
37 |               paste(pruned_traits, collapse=", ")))
38 |   z_mat <- z_mat[, keep_traits]
39 |   
40 |   # Adjust z-scores by sample size for each variant-trait combo (z = z / sqrt(N))
41 |   # z_mat <- t(t(z_mat) / sqrt(N_vec[match(pruned_traits, colnames(z_mat))]))
42 |   # Multiply full matrix by mean(sqrt(median(N))) (a single number for the whole matrix)
43 |   print(paste0("Multiplying sample size-adjusted z-score matrix by ", 
44 |                round(mean(sqrt(medN_vec[pruned_traits]))), " (i.e. mean(sqrt(median(N))))"))
45 |   z_mat <- z_mat * mean(sqrt(medN_vec[pruned_traits]))
46 |   
47 |   # Replace missing values with zero
48 |   z_mat[is.na(z_mat)] <- 0
49 |   
50 |   # Expand into 2N x M non-negative matrix
51 |   z_mat_pos <- z_mat
52 |   z_mat_pos[z_mat_pos < 0] <- 0
53 |   colnames(z_mat_pos) <- paste0(colnames(z_mat), "_pos")
54 |   z_mat_neg <- -z_mat
55 |   z_mat_neg[z_mat_neg < 0] <- 0
56 |   colnames(z_mat_neg) <- paste0(colnames(z_mat), "_neg")
57 |   final_z_mat <- cbind(z_mat_pos, z_mat_neg)
58 |   
59 |   final_z_mat
60 | }


--------------------------------------------------------------------------------
/scripts/archive/run_bNMF_2021.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | 
  3 | 
  4 | ##########################################################################
  5 | # Copyright (c) 2017, Broad Institute
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are
  8 | # met:
  9 | #     Redistributions of source code must retain the above copyright
 10 | #     notice, this list of conditions and the following disclaimer.
 11 | #     Redistributions in binary form must reproduce the above copyright
 12 | #     notice, this list of conditions and the following disclaimer in
 13 | #     the documentation and/or other materials provided with the
 14 | #     distribution.
 15 | #     Neither the name of the Broad Institute nor the names of its
 16 | #     contributors may be used to endorse or promote products derived
 17 | #     from this software without specific prior written permission.
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #########################################################################
 30 | 
 31 | ######################################################################
 32 | # Bayesian NMF algorithms for clustering
 33 | ######################################################################
 34 | # For implementation details see the paper:
 35 | # Udler MS, Kim J, von Grotthuss M,
 36 | # Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018)
 37 | # Type 2 diabetes genetic loci informed by multi-trait
 38 | # associations point to disease mechanisms and
 39 | # subtypes: A soft clustering analysis. PLoS Med 15
 40 | # (9): e1002654.
 41 | ###########################
 42 | # For details on the original algorithms 
 43 | # see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence.
 44 | # IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013).
 45 | ######################################################################
 46 | 
 47 | BayesNMF.L2EU <- function(
 48 |   V0, n.iter=10000, a0=10, tol=1e-7, K=15, K0=10, phi=1.0
 49 | ) {
 50 |   
 51 |   # Bayesian NMF with half-normal priors for W and H
 52 |   # V0: input z-score matrix (variants x traits)
 53 |   # n.iter: Number of iterations for parameter optimization
 54 |   # a0: Hyper-parameter for inverse gamma prior on ARD relevance weights
 55 |   # tol: Tolerance for convergence of fitting procedure
 56 |   # K: Number of clusters to be initialized (algorithm may drive some to zero)
 57 |   # K0: Used for setting b0 (lambda prior hyper-parameter) -- should be equal to K
 58 |   # phi: Scaling parameter
 59 |   
 60 |   eps <- 1.e-50
 61 |   del <- 1.0
 62 |   active_nodes <- colSums(V0) != 0
 63 |   V0 <- V0[, active_nodes]
 64 |   V <- V0 - min(V0)
 65 |   Vmin <- min(V)
 66 |   Vmax <- max(V)
 67 |   N <- dim(V)[1]
 68 |   M <- dim(V)[2]
 69 |   
 70 |   W <- matrix(runif(N * K) * Vmax, ncol=K)
 71 |   H <- matrix(runif(M * K) * Vmax, ncol=M)
 72 |   I <- array(1, dim=c(N, M))
 73 |   V.ap <- W %*% H + eps
 74 |   
 75 |   phi <- sd(V)^2 * phi
 76 |   C <- (N + M) / 2 + a0 + 1
 77 |   b0 <- 3.14 * (a0 - 1) * mean(V) / (2 * K0)
 78 |   lambda.bound <- b0 / C
 79 |   lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
 80 |   lambda.cut <- lambda.bound * 1.5
 81 |   
 82 |   n.like <- list()
 83 |   n.evid <- list()
 84 |   n.error <- list()
 85 |   n.lambda <- list()
 86 |   n.lambda[[1]] <- lambda
 87 |   iter <- 2
 88 |   count <- 1
 89 |   while (del >= tol & iter < n.iter) {
 90 |     H <- H * (t(W) %*% V) / 
 91 |       (t(W) %*% V.ap + phi * H * matrix(rep(1 / lambda, M), ncol=M) + eps)
 92 |     V.ap <- W %*% H + eps
 93 |     W <- W * (V %*% t(H)) / 
 94 |       (V.ap %*% t(H) + phi * W * t(matrix(rep(1 / lambda, N), ncol=N)) + eps)
 95 |     V.ap <- W %*% H + eps
 96 |     lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
 97 |     del <- max(abs(lambda - n.lambda[[iter - 1]]) / n.lambda[[iter - 1]])
 98 |     like <- sum((V - V.ap)^2) / 2
 99 |     n.like[[iter]] <- like
100 |     n.evid[[iter]] <- like + phi * sum((0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / 
101 |                                          lambda + C * log(lambda))
102 |     n.lambda[[iter]] <- lambda
103 |     n.error[[iter]] <- sum((V - V.ap)^2)
104 |     if (iter %% 100 == 0) {
105 |       cat(iter, n.evid[[iter]], n.like[[iter]], n.error[[iter]], del, 
106 |           sum(colSums(W) != 0), sum(lambda >= lambda.cut), '\n')
107 |     }
108 |     iter <- iter + 1
109 |   }
110 |   return(list(
111 |     W,  # Variant weight matrix (N x K)
112 |     H,  # Trait weight matrix (K x M)
113 |     n.like,  # List of reconstruction errors (sum of squared errors / 2) per iteration
114 |     n.evid,  # List of negative log-likelihoods per iteration
115 |     n.lambda,  # List of lambda vectors (shared weights for each of K clusters, some ~0) per iteration
116 |     n.error  # List of reconstruction errors (sum of squared errors) per iteration
117 |   ))
118 | }
119 | 
120 | 
run_bNMF <- function(z_mat, n_reps=10, random_seed=1, ...) {

  # Given an input matrix as created by prep_z_matrix(), run the bNMF procedure
  # a series of times to generate results and evaluate cluster stability.
  # INPUTS:
  #   - z_mat: non-negative input matrix passed to BayesNMF.L2EU()
  #   - n_reps: number of independent bNMF runs
  #   - random_seed: seed set once before the first run
  #   - ...: further arguments forwarded to BayesNMF.L2EU()
  # RETURNS: list of length n_reps; each element is the BayesNMF.L2EU() output
  #   with its six elements named W, H, n.like, n.evid, n.lambda, n.error

  print(paste0("Running bNMF clustering procedure (", n_reps, " iterations)..."))
  set.seed(random_seed)
  # seq_len() yields no runs for n_reps = 0 (1:0 would run twice)
  lapply(seq_len(n_reps), function(r) {
    setNames(
      BayesNMF.L2EU(z_mat, ...),
      c("W", "H", "n.like", "n.evid", "n.lambda", "n.error")
    )
  })
}
135 | 
136 | 
summarize_bNMF <- function(bnmf_reps) {

  # Given output from bNMF (list of length N_iterations, as returned by
  # run_bNMF()), generate summary tables and plots.
  # SIDE EFFECTS (all files are written to the current working directory):
  #   - run_summary.txt: run index, chosen K and evidence for every run
  #   - for each distinct K, taken from the run with the lowest evidence value:
  #     L2EU.W.mat.K<k>.txt, L2EU.H.mat.K<k>.txt, W_plot_K<k>.pdf, H_plot_K<k>.pdf
  # RETURNS: (invisibly) the H-plot file names, named by K

  make_run_summary <- function(reps) {

    # Given a list of bNMF iteration outputs, summarize the K choices and associated likelihoods across runs

    run_summary <- map_dfr(seq_along(reps), function(i) {
      res <- reps[[i]]
      final_lambdas <- res$n.lambda[[length(res$n.lambda)]]
      tibble(
        run=i,
        K=sum(final_lambdas > min(final_lambdas)),  # Assume that lambdas equal to the minimum lambda are ~ 0
        evid=res$n.evid[[length(res$n.evid)]]  # Evidence = -log_likelihood
      )
    }) %>%
      arrange(evid)

    unique.K <- table(run_summary$K)
    MAP.K.run <- vapply(names(unique.K), function(k) {  # bNMF run index with the maximum posterior for given K
      tmp <- run_summary[run_summary$K == k, ]
      tmp$run[which.min(tmp$evid)]
    }, numeric(1))

    list(run_tbl=run_summary, unique.K=unique.K, MAP.K.run=MAP.K.run)
  }

  print("Summarizing bNMF results...")

  print("Writing table of chosen K across iterations...")
  run_summary <- make_run_summary(bnmf_reps)
  write_tsv(run_summary$run_tbl, "run_summary.txt")

  # Shared plot theme and font scaling
  scale0 <- 0.8
  scale <- 1
  color.axis <- "black"
  .theme_ss <- theme_bw(base_size=12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8 * scale,
                                     family="mono", face='bold', color=color.axis),
          axis.text.y = element_text(hjust = 0.5,size=12 * scale, family="mono",face='bold',color=color.axis),
          axis.text = element_text(size = 12 * scale, family = "mono",color=color.axis),
          axis.title=element_text(face="bold", size=12 * scale,color="black"),
          plot.title=element_text(face="bold", size=12 * scale))

  print("Plotting variant and trait contributions...")
  plot_files <- vapply(names(run_summary$unique.K), function(k) {  # Create heatmaps for MAP iteration for each K
    res <- bnmf_reps[[run_summary$MAP.K.run[[k]]]]
    # Keep only non-empty clusters; drop=FALSE preserves the matrix when K = 1
    W <- res$W[, colSums(res$W) != 0, drop=FALSE]  # variant-cluster association matrix
    H <- res$H[rowSums(res$H) != 0, , drop=FALSE]  # cluster-trait association matrix
    W[W < 1.e-10] <- 0
    H[H < 1.e-10] <- 0

    W0 <- data.frame(W)
    W0[, "variant"] <- rownames(W)
    H0 <- data.frame(H)
    H0[, "cluster"] <- rownames(H)

    # write_tsv() takes the data first and the output path second
    write_tsv(W0, paste0("L2EU.W.mat.K", k, ".txt"))
    write_tsv(H0, paste0("L2EU.H.mat.K", k, ".txt"))

    # Plot W matrix (variant activities), variants ordered by hierarchical clustering
    W_hc <- hclust(dist(W, method="euclidean"), method="ward.D")
    W_variant.ordering <- W_hc$labels[W_hc$order]
    W_plt_df <- W %>%
      as.data.frame() %>%
      rownames_to_column(var="variant") %>%
      pivot_longer(-variant, names_to="cluster", values_to="activity") %>%
      mutate(variant=factor(variant, levels=W_variant.ordering),
             cluster=factor(cluster,
                            levels=paste0("V", seq_len(ncol(W)))))
    W_plt <- ggplot(W_plt_df, aes(x=variant, y=cluster, fill=activity)) +
      geom_tile() +
      scale_fill_gradient2(low="white", high ="black", name="Activity") +
      .theme_ss +
      ggtitle(paste0("Variant Association to Clusters (k=", k, ")")) +
      ylab("Cluster") + xlab("Variant") +
      theme(axis.title.x = element_text(face="bold",colour="black", size=12 * scale0)) +
      theme(axis.title.y = element_text(face="bold",colour="black", size=12 * scale0)) +
      theme(legend.position="right") +
      theme(legend.key.size = unit(0.5, "cm"))
    ggsave(paste0("W_plot_K", k, ".pdf"), plot=W_plt)

    # Plot H matrix (trait activities), traits ordered by hierarchical clustering
    H_hc <- hclust(dist(t(H), method="euclidean"), method="ward.D")
    H_trait.ordering <- H_hc$labels[H_hc$order]
    H_plt_df <- t(H) %>%
      as.data.frame() %>%
      rownames_to_column(var="trait") %>%
      pivot_longer(-trait, names_to="cluster", values_to="activity") %>%
      mutate(cluster=factor(cluster, levels=paste0("V", seq_len(nrow(H)))),
             trait=factor(trait, levels=H_trait.ordering))
    H_plt <- ggplot(H_plt_df, aes(x=trait, y=cluster, fill=activity)) +
      geom_tile() +
      scale_fill_gradient2(low="white", high ="black", name="Activity") +
      .theme_ss +
      ggtitle(paste0("Trait Association to Clusters (k=", k, ")")) +
      ylab("Cluster") + xlab("Trait") +
      theme(axis.title.x=element_text(face="bold", colour="black", size=12 * scale0)) +
      theme(axis.title.y=element_text(face="bold", colour="black", size=12 * scale0)) +
      theme(legend.position="right") +
      theme(legend.key.size = unit(0.5, "cm"))
    ggsave(paste0("H_plot_K", k, ".pdf"), plot=H_plt)
  }, character(1))

  invisible(plot_files)
}
261 | 


--------------------------------------------------------------------------------
/scripts/archive/test_pipeline_2021.R:
--------------------------------------------------------------------------------
 1 | # This script is intended to run the full pipeline for bNMF clustering based
 2 | # on summary statistics and test its agreement with scripts that are
 3 | # currently in place.
 4 | 
 5 | # NOTE: to run Tabix on UGER, must start with "ssh gsa4; use .zlib-1.2.6"
 6 | 
 7 | source("choose_variants.R")  # fld_pruning, count_traits_per_variant, fina_variants_needing_proxies, & choose_potential_proxies
 8 | source("prep_bNMF.R")  # fetch_summary_stats & prep_z_matrix
 9 | source("run_bNMF.R")  # run_bNMF & summarize_bNMF
10 | 
11 | gwas_traits <- readxl::read_excel("../data/clustering_data_source.xlsx", sheet="gwas_traits")
12 | trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait_name)
13 | trait_ss_files <- trait_ss_files[!grepl("MAGIC", names(trait_ss_files))]  # Some MAGIC GWAS files don't have N_PH field
14 | 
15 | initial_t2d_snps <- read_tsv("../data/T2D_initial_vars_pval.txt")
16 | set.seed(1)
17 | initial_t2d_snps <- sample_n(initial_t2d_snps, size=1000) %>%
18 |   select(VAR_ID=VAR_ID_hg19, PVALUE)
19 | 
20 | rsID_map_file <- "/humgen/diabetes2/users/clairekim/list_VARID_rsID_updated.txt"  # From dbSNP v1.38 -- maps positional IDs to rsIDs
21 | 
22 | # Variant choice steps
23 | 
24 | pruned_variants <- ld_pruning(initial_t2d_snps, rsID_map_file)
25 | 
26 | var_nonmissingness <- count_traits_per_variant(pruned_variants$VAR_ID, trait_ss_files)
27 | 
28 | proxies_needed_df <- find_variants_needing_proxies(pruned_variants, var_nonmissingness,
29 |                                                    rsID_map_file)
30 | 
31 | tabix_path <- "/humgen/diabetes2/users/mvg/VariantClustering/tabix-0.2.6/tabix"
32 | ld_file <- "/humgen/diabetes2/users/mvg/VariantClustering/LD_EUR.tsv.bgz"
33 | final_variant_set <- choose_proxies(
34 |   proxies_needed_df,
35 |   tabix_path,
36 |   ld_file,
37 |   rsID_map_file,
38 |   trait_ss_files,
39 |   pruned_variants
40 | )
41 | 
42 | # Prep bNMF steps
43 | 
44 | t2d_ss_filepath <- "/humgen/diabetes2/users/clairekim/Mahajan.NatGenet2018b.T2D.European_formatted.txt"
45 | 
46 | initial_zscore_matrices <- fetch_summary_stats(
47 |   final_variant_set,
48 |   t2d_ss_filepath,
49 |   trait_ss_files
50 | )
51 | 
52 | final_zscore_matrix <- prep_z_matrix(initial_zscore_matrices$z_mat,
53 |                                      initial_zscore_matrices$N_mat)
54 | 
55 | # Run bNMF steps
56 | 
57 | bnmf_reps <- run_bNMF(final_zscore_matrix, n_reps=10)
58 | 
59 | summarize_bNMF(bnmf_reps)
60 | 


--------------------------------------------------------------------------------
/scripts/bNMF_example_pipeline.R:
--------------------------------------------------------------------------------
  1 | # This script shows how to run the bNMF clustering pipeline using toy datasets.
  2 | 
  3 | #----
  4 | start=Sys.time()
  5 | 
  6 | # load requires packages
  7 | install.packages("pacman")
  8 | pacman::p_load(tidyverse, data.table, readxl, magrittr, dplyr, strex,
  9 |                rstudioapi, DT, kableExtra, GenomicRanges)
 10 | 
 11 | if (!require("BiocManager", quietly = TRUE))
 12 |   install.packages("BiocManager")
 13 | 
 14 | BiocManager::install("GenomicRanges")
 15 | BiocManager::install("Homo.sapiens")
 16 | 
 17 | # load project scripts containing bNMF functions
 18 | source("../scripts/choose_variants.R")  # ld_pruning, count_traits_per_variant, fina_variants_needing_proxies, & choose_potential_proxies
 19 | source("../scripts/prep_bNMF.R")  # fetch_summary_stats & prep_z_matrix
 20 | source("../scripts/run_bNMF.R")  # run_bNMF & summarize_bNMF
 21 | 
 22 | setwd(dirname(getActiveDocumentContext()$path))
 23 | 
 24 | #----
 25 | 
 26 | # USER INPUTS!!!
 27 | project_dir = './test_results' # path to where you want results saved
 28 | user_token = 'YOUR_LDLINK_API_TOKEN' # token for LDlinkR api
 29 | 
 30 | # create project folder 
 31 | dir.create(project_dir)
 32 | 
 33 | 
 34 | #----
 35 | 
 36 | # SECTION 1: PULL IN GWAS INFORMATION
 37 | 
 38 | data_dir = "../example_data/"
 39 | rsID_map_file <- file.path(data_dir, "rsID_map_example.txt")  # From dbSNP v1.38 -- maps positional IDs to rsIDs
 40 | 
 41 | # GWAS for main trait
 42 | gwas <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
 43 |                    sheet="main_gwas") %>%
 44 |   data.frame()
 45 | 
 46 | # GWAS for clustering traits
 47 | gwas_traits <- read_excel(file.path(data_dir, "clustering_data_sources_example.xlsx"),
 48 |                           sheet="trait_gwas")
 49 | 
 50 | # GWAS to be used for final allele alignment
 51 | main_ss_filepath <- gwas %>% filter(largest=="Yes") %>% pull(full_path)
 52 | 
 53 | gwas_ss_files <- setNames(gwas$full_path, gwas$study)
 54 | trait_ss_files <- setNames(gwas_traits$full_path, gwas_traits$trait)
 55 | trait_ss_size <- setNames(gwas_traits$sample_size, gwas_traits$trait)
 56 | 
 57 | #----
 58 | 
 59 | # SECTION 2: PULL SIGNIFICANT VARIANTS FROM MAIN TRAIT GWAS
 60 | 
 61 | # P-value threshold for variants in main trait
 62 | PVCUTOFF = 5e-8
 63 | 
 64 | n_gwas <- length(gwas_ss_files)
 65 | 
 66 | vars_sig = data.frame(VAR_ID = as.character(),
 67 |                       P_VALUE = as.numeric(),
 68 |                       Risk_Allele=as.character(),
 69 |                       GWAS=as.character())
 70 | 
 71 | print(sprintf("Pulling significant SNPs w/ pval<%.1e from %i T2D GWAS...", PVCUTOFF, n_gwas))
 72 | 
 73 | for(i in 1:n_gwas) {
 74 |   print(paste0("...Reading ", names(gwas_ss_files)[i], "..."))
 75 | 
 76 |   vars <- fread(gwas_ss_files[i], data.table = F, stringsAsFactors=F)
 77 | 
 78 |   if (!"BETA" %in% colnames(vars)){
 79 |     print("Converting Odds Ratio to Log Odds Ratio...")
 80 |     vars <- vars %>%
 81 |       mutate(BETA = log(as.numeric(ODDS_RATIO)))
 82 |   }
 83 |   vars <- vars %>%
 84 |     filter(as.numeric(P_VALUE) <= PVCUTOFF) %>%
 85 |     subset(grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]", VAR_ID)) %>%
 86 |     separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"), sep="_", remove = F) %>%
 87 |     mutate(Risk_Allele = ifelse(BETA>=0, ALT, REF)) %>%
 88 |     mutate(GWAS = gwas$study[i]) %>%
 89 |     select(VAR_ID, P_VALUE, Risk_Allele, GWAS)
 90 | 
 91 |   print(nrow(vars))
 92 |   vars_sig = rbind(vars_sig, vars)
 93 | }
 94 | print(paste("No. total SNPs below pval cutoff:",nrow(vars_sig)))
 95 | 
 96 | # remove duplicates
 97 | vars_sig_uniq <- vars_sig %>%
 98 |   arrange(VAR_ID, P_VALUE) %>%
 99 |   filter(!duplicated(VAR_ID)) %>% # so we remove duplicates with the higher pvalue
100 |   rename(PVALUE = P_VALUE)
101 | print(paste("No. unique SNPs:",nrow(vars_sig_uniq)))
102 | 
103 | # remove indels
104 | vars_sig_noIndels <- vars_sig_uniq %>%
105 |   separate(VAR_ID, into=c("CHR","POS","REF","ALT"),sep="_",remove = F) %>%
106 |   mutate(alleles = paste0(REF,ALT)) %>%
107 |   subset(nchar(alleles)==2 | (nchar(alleles)<=4 & grepl(",",alleles))) %>%
108 |   select(VAR_ID, PVALUE, Risk_Allele, GWAS)
109 | print(paste("No. SNPs excluding indels:",nrow(vars_sig_noIndels)))
110 | 
111 | save.image(file = file.path(project_dir, "pipeline_data.RData"))
112 | 
113 | # #----
114 | 
# SECTION 3: VARIANT PRUNING (LD-BASED)

# LD pruning; ld_prune() writes one snpClip_results_<pop>_chr<N>.txt file per
# chromosome into project_dir
print("LD-pruning using EUR panel in LDlinkR::SNPclip...")
ld_prune(df_snps = vars_sig_noIndels,
         pop = "EUR",
         output_dir = project_dir,
         r2 = 0.05,
         maf = 0.001,
         my_token = user_token,
         chr = c(1:22))

#----

# combine LD-pruning results
print("Combining SNP.clip results...")
ld_files <- list.files(path = project_dir,
                       pattern = "^snpClip_results",
                       full.names = TRUE)

# Empty template, used as-is when no result files are found. The position
# column is named Position_grch37 to match the per-file tables after renaming.
df_clipped_res <- data.frame("RS_Number" = character(),
                             "Position_grch37" = character(),
                             "Alleles" = character(),
                             "Details" = character())

# Harmonise column names across result files (new_name = "old name")
rename_cols_clipped <- c(RS_Number = "RS Number",
                         Position_grch37 = "Position")

# Read every result file, then bind once instead of rbind()-ing inside a loop.
df_clipped_list <- lapply(ld_files, function(ld_file) {
  fread(ld_file, stringsAsFactors = FALSE, data.table = FALSE) %>%
    dplyr::rename(any_of(rename_cols_clipped))
})
df_clipped_res <- do.call(rbind, c(list(df_clipped_res), df_clipped_list))

df_clipped_kept <- df_clipped_res %>%
  filter(Details == "Variant kept.")

# Keep the variants whose chr:pos survived pruning. The column is referenced
# by its full name rather than relying on `$` partial matching of "Position".
pruned_vars <- vars_sig_noIndels %>%
  separate(VAR_ID, into = c("CHR", "POS", "REF", "ALT"), sep = "_", remove = FALSE) %>%
  mutate(ChrPos = paste0("chr", CHR, ":", POS)) %>%
  filter(ChrPos %in% df_clipped_kept$Position_grch37)
print(sprintf("T2D SNPs pruned from %i to %i...", nrow(vars_sig_noIndels), nrow(pruned_vars)))

save.image(file = file.path(project_dir, "pipeline_data.RData"))
159 | 
160 | #----
# SECTION 4: VARIANT MISSINGNESS

print("Searching for variants in trait GWAS...")
gwas_variants <- pruned_vars$VAR_ID
# one row per variant found, one sample-size column per trait
df_Ns <- count_traits_per_variant(gwas_variants,
                                  trait_ss_files)

# fix column names: move VAR_ID into the row names and label the remaining
# columns with the trait names
# NOTE(review): this assumes the columns come back in trait_ss_files order and
# that every trait contributed a column -- confirm.
df_Ns_rev <- df_Ns %>%
  column_to_rownames("VAR_ID") %>%
  set_colnames(names(trait_ss_files))

print("Calculating variant missingess in traits...")
# Fraction of traits with a non-missing N for each variant. drop = FALSE keeps
# a data frame even when there is a single trait, so rowSums() still works.
variant_counts_df <- data.frame(
  VAR_ID = rownames(df_Ns_rev),
  frac = rowSums(!is.na(df_Ns_rev[, names(trait_ss_files), drop = FALSE])) /
    length(trait_ss_files)
)
var_nonmissingness <- ifelse(
  gwas_variants %in% variant_counts_df$VAR_ID,
  # if in counts data frame, take the non-missing fraction:
  variant_counts_df$frac[match(gwas_variants, variant_counts_df$VAR_ID)],
  # else not in data frame, so non-missing fraction is 0:
  0
)
var_nonmissingness <- setNames(var_nonmissingness, gwas_variants)

save.image(file = file.path(project_dir, "pipeline_data.RData"))
186 | 
187 | #----
# SECTION 5: DETERMINE VARIANTS NEEDING PROXIES

print("Identifying variants needing proxies...")
proxies_needed_df <- find_variants_needing_proxies(pruned_vars,
                                                   var_nonmissingness,
                                                   rsID_map_file,
                                                   missing_cutoff = 0.8)

#----
# SECTION 6: PROXY SEARCH

# The search below uses method = "LDlink", so the message names that backend.
print("Searching for proxies with LDlinkR (LDproxy)...")
proxy_search_results <- choose_proxies(
  need_proxies = proxies_needed_df,
  method = "LDlink",
  LDlink_token = user_token,
  # Only evaluated when method = "TopLD" (R evaluates arguments lazily);
  # api_path must be defined before switching to that method.
  topLD_path = api_path,
  rsID_map_file = rsID_map_file,
  trait_ss_files = trait_ss_files,
  pruned_variants = pruned_vars,
  population = "EUR"
)

# choose_proxies() returns NULL when no candidate proxies were found; replace
# it with an empty table so the steps below still run.
if (is.null(proxy_search_results)) {
  proxy_search_results <- data.frame(VAR_ID = character(),
                                     proxy_VAR_ID = character())
}

# Pair each original variant with its proxy and carry over the source GWAS;
# risk allele and p-value are unknown for proxies, hence NA.
df_proxies <- proxy_search_results %>%
  dplyr::select(VAR_ID, proxy_VAR_ID) %>%
  dplyr::inner_join(pruned_vars[, c("VAR_ID", "GWAS")], by = "VAR_ID") %>%
  mutate(Risk_Allele = NA, PVALUE = NA)

save.image(file = file.path(project_dir, "pipeline_data.RData"))
216 | 
217 | #----
218 | 
# SECTION 7: Fetch summary statistics for SNPs in trait GWAS

print("Prepping input for fetch_summary_stats...")

# Remove SNPs from pruned_vars that needed proxies
df_orig_snps <- pruned_vars %>%
  filter(!VAR_ID %in% proxies_needed_df$VAR_ID)

# Join with pruned vars so we can get the original GWAS where the proxy came from
# and add the necessary columns (if you don't care about the original GWAS, can
# skip the inner_join step and just set GWAS=NA)
df_proxies <- proxy_search_results %>%
  dplyr::select(VAR_ID, proxy_VAR_ID) %>%
  dplyr::inner_join(pruned_vars[, c("VAR_ID", "GWAS")], by = "VAR_ID") %>%
  mutate(Risk_Allele = NA, PVALUE = NA)

# combine original SNPs with proxy SNP
# MAKE SURE assign proxy_VAR_ID to VAR_ID in df_proxies!!!
# Sorting by PVALUE puts the NA-p-value proxy rows last, so an original SNP
# wins over a proxy with the same VAR_ID when duplicates are dropped.
df_input_snps <- rbind(df_orig_snps %>% select(VAR_ID, PVALUE, Risk_Allele, GWAS),
                       df_proxies %>% select(VAR_ID = proxy_VAR_ID, PVALUE, Risk_Allele, GWAS)) %>%
  arrange(PVALUE) %>%
  filter(!duplicated(VAR_ID))

cat(sprintf("\n%i original SNPs...\n", nrow(df_orig_snps)))
cat(sprintf("\n%i proxy SNPs...\n", nrow(df_proxies)))
cat(sprintf("\n%i total unique SNPs!\n", nrow(df_input_snps)))

initial_zscore_matrices <- fetch_summary_stats(
  df_input_snps,
  main_ss_filepath,
  trait_ss_files,
  trait_ss_size,
  pval_cutoff = 0.05
)


save.image(file = file.path(project_dir, "pipeline_data.RData"))

# Move alignment_GWAS_summStats.csv (presumably written to the working
# directory by fetch_summary_stats) into the project folder. Base R file
# functions are used instead of a shell `mv`, so this works on any OS and with
# paths that contain spaces; the source is removed only after a good copy.
alignment_file <- "alignment_GWAS_summStats.csv"
if (file.exists(alignment_file)) {
  if (file.copy(alignment_file, file.path(project_dir, alignment_file),
                overwrite = TRUE)) {
    file.remove(alignment_file)
  }
} else {
  warning("alignment_GWAS_summStats.csv not found in working directory",
          call. = FALSE)
}
257 | 
258 | #----
259 | 
# Section 8: get rsIDs for final variant set

print("Getting rsIDs for final snps and saving to results...")
z_mat <- initial_zscore_matrices$z_mat
N_mat <- initial_zscore_matrices$N_mat

# Keep the input SNPs whose Chr:Pos appears among the z-score matrix row names
df_var_ids <- df_input_snps %>%
  separate(VAR_ID, into = c("Chr", "Pos", "Ref", "Alt"), sep = "_", remove = FALSE) %>%
  mutate(ChrPos = paste(Chr, Pos, sep = ":")) %>%
  subset(ChrPos %in% rownames(z_mat))
write(df_var_ids$VAR_ID, 'my_snps.tmp')

# Look the variants up in the rsID map with a single grep and read its output
# directly. rsID_map.txt is written once, further down, from the filtered
# table. The map path is shell-quoted so paths containing spaces work.
df_rsIDs <- fread(cmd = sprintf("grep -wFf my_snps.tmp %s",
                                shQuote(path.expand(rsID_map_file))),
                  header = FALSE,
                  col.names = c("VAR_ID", "rsID"))
print(sprintf("rsIDs found for %i of %i SNPs...", nrow(df_rsIDs), nrow(df_var_ids)))

# Drop any rows whose VAR_ID is not one of the queried variants
df_rsIDs_final <- df_rsIDs %>%
  filter(VAR_ID %in% df_var_ids$VAR_ID)
write_delim(x = df_rsIDs_final,
            file = file.path(project_dir, "rsID_map.txt"),
            col_names = TRUE)

save.image(file = file.path(project_dir, "pipeline_data.RData"))
287 | 
288 | #----
289 | 
# Section 9: Fill missing data in z-score and N matrices

# Final SNP table: only the input SNPs that were matched to an rsID
df_snps <- data.frame(
  inner_join(df_input_snps, df_rsIDs_final, by = "VAR_ID")
)

print("Searching for cover proxies for missing z-scores...")
initial_zscore_matrices_final <- fill_missing_zscores(
  initial_zscore_matrices,
  df_snps,
  trait_ss_files,
  trait_ss_size,
  main_ss_filepath,
  rsID_map_file,
  method_fill = "median",
  population = "EUR"
)
save.image(file = file.path(project_dir, "pipeline_data.RData"))
306 | 
307 | #----
308 | 
# Section 10.) Generate non-negative z-score matrix

prep_z_output <- prep_z_matrix(z_mat = initial_zscore_matrices_final$z_mat,
                               N_mat = initial_zscore_matrices_final$N_mat,
                               corr_cutoff = 0.8)

# prep_z_output has two outputs:

#   1.) The scaled, non-negative z-score matrix
final_zscore_matrix <- prep_z_output$final_z_mat

#   2.) Results from the trait filtering
df_traits_filtered <- prep_z_output$df_traits
write_csv(x = df_traits_filtered,
          file = file.path(project_dir, "df_traits.csv"))

# prep_z_matrix also saves the trait correlation matrix to the working dir, so
# move it to the project dir. Base R file functions replace the shell `mv`, so
# this works on any OS and with paths that contain spaces; the source is
# removed only after a successful copy.
trait_cor_file <- "trait_cor_mat.txt"
if (file.exists(trait_cor_file)) {
  if (file.copy(trait_cor_file, file.path(project_dir, trait_cor_file),
                overwrite = TRUE)) {
    file.remove(trait_cor_file)
  }
} else {
  warning("trait_cor_mat.txt not found in working directory", call. = FALSE)
}

# The column count is halved for the trait total reported here
print(sprintf("Final matrix: %i SNPs x %i traits",
      nrow(final_zscore_matrix),
      ncol(final_zscore_matrix)/2))

save.image(file = file.path(project_dir, "pipeline_data.RData"))
333 | 
334 | #----
335 | 
# Section 11.) Run bNMF
bnmf_reps <- run_bNMF(
  final_zscore_matrix,
  n_reps = 25,
  tolerance = 1e-6
)
summarize_bNMF(bnmf_reps, dir_save = project_dir)

save.image(file = file.path(project_dir, "pipeline_data.RData"))

# Report wall-clock time; `start` is expected to have been set earlier in the
# script (outside this section).
end <- Sys.time()
print("Total pipeline runtime:")
print(end - start)

#----

# format results
# k = NULL renders the report for the maximum K; set an integer to pick a K
k <- NULL
html_filename <- if (is.null(k)) {
  "results_for_maxK.html"
} else {
  sprintf("results_for_K_%i.html", k)
}

report_params <- list(main_dir = project_dir,
                      k = k,
                      loci_file = "query",
                      GTEx = FALSE,
                      my_traits = gwas_traits)

rmarkdown::render(
  './format_bNMF_results.Rmd',
  output_file = html_filename,
  params = report_params
)
367 | 
368 | 
369 | #----
370 | 


--------------------------------------------------------------------------------
/scripts/choose_variants.R:
--------------------------------------------------------------------------------
  1 | packages = c('tidyverse', 'data.table', 'LDlinkR')
  2 | invisible(lapply(packages, library, character.only = TRUE))
  3 | 
  4 | # CURRENT ASSUMPTIONS ABOUT FORMATTING:
  5 | # - Genome build is hg19/GrCh37
  6 | # - Summary statistic datasets are whitespace-delimited with columns: VAR_ID, BETA, SE, N_PH
  7 | # - Variant IDs are all of the format: CHR_POS_REF_ALT 
  8 | 
  9 | 
 10 | 
 11 | snp_clump <- function(df_snps,
 12 |                       id="VAR_ID",
 13 |                       window=500000,
 14 |                       chr=1:22,
 15 |                       pos_range=c(1,Inf)) {
 16 |   
 17 |   clumped_snps <- c()
 18 |   uniq_chr <- sort(as.integer(unique(df_snps$CHR)))
 19 |   
 20 |   for (i in uniq_chr) {
 21 |     tmp <- df_snps %>%
 22 |       filter(CHR==i) %>%
 23 |       arrange(PVALUE) %>%
 24 |       mutate(POS = as.integer(POS)) %>%
 25 |       data.frame()
 26 |     
 27 |     if (i %in% chr) {
 28 |       print(sprintf("Clumping Chr. %i...",i))
 29 |       
 30 |       do_clump <- tmp %>%
 31 |         filter(between(POS, pos_range[1], pos_range[2]))
 32 |       dont_clump <- tmp %>%
 33 |         filter(!between(POS, pos_range[1], pos_range[2]))
 34 |       clumped_snps <- c(clumped_snps, dont_clump[,id])
 35 |       print(sprintf("Clumping %i variants, not clumping %i variants...",
 36 |                     nrow(do_clump), nrow(dont_clump)))
 37 |       
 38 |       remaining_snps <- do_clump[,id]
 39 |       
 40 |       j=0
 41 |       while (length(remaining_snps)>0){
 42 |         clumped_snps <- c(clumped_snps, remaining_snps[1])
 43 |         
 44 |         cur_pos <- do_clump$POS[do_clump[id]==remaining_snps[1]]
 45 |         
 46 |         close_snps <- do_clump[abs(do_clump$POS-cur_pos)<=window, id]
 47 | 
 48 |         remaining_snps <- setdiff(remaining_snps, close_snps)
 49 | 
 50 |         j=j+1
 51 |       }
 52 |     num_clumped <- j + nrow(dont_clump)
 53 |     } else {
 54 |       print(sprintf("No clumping for Chr %i!",i))
 55 |       num_clumped=nrow(tmp)
 56 |       clumped_snps <- c(clumped_snps, tmp[,id])
 57 |     }
 58 |     cat(sprintf("Chr %i clumped from %i to %i SNPs\n\n",i, nrow(tmp), num_clumped))
 59 |   }
 60 |   print(sprintf("No. SNPs after clumping: %i",length(clumped_snps)))
 61 |   return(clumped_snps)
 62 | }
 63 | 
 64 | ld_prune <- function(df_snps,
 65 |                                 pop,
 66 |                                 my_token,
 67 |                                 r2=0.1,
 68 |                                 maf=0.01,
 69 |                                 chr=1:22,
 70 |                                 output_dir="./") {
 71 |   
 72 |   snp_clip_input <- df_snps %>% 
 73 |     separate(VAR_ID, into=c("CHR","POS","REF","ALT"),sep="_",remove = F) %>%
 74 |     mutate(ChrPos = paste0("chr",CHR,":",POS)) %>%
 75 |     arrange(PVALUE)
 76 |   
 77 |   df_clipped <- data.frame(ChrPos=as.character(),
 78 |                            rsID=as.character())
 79 |   for (i in chr){
 80 |     start = Sys.time()
 81 |     cur_chr <- snp_clip_input %>%
 82 |       filter(CHR==i)
 83 |     print(sprintf("Chr %i (%i SNPs)",i, nrow(cur_chr)))
 84 | 
 85 |     
 86 |     if (nrow(cur_chr) == 0) {
 87 |       next
 88 |     }
 89 |     else if (nrow(cur_chr) == 1) {
 90 | 
 91 |     cur_snps <- cur_chr %>%
 92 |       pull(ChrPos)
 93 |     # if only one SNP, use LDhap to get the variant info
 94 |     clipped_res <- LDlinkR::LDhap(snps = cur_snps, 
 95 |           pop = pop, 
 96 |           token = my_token,
 97 |           genome_build = "grch37",
 98 |           table_type = "variant"
 99 |     ) %>%
100 |       rename(Alleles=Allele_Frequency) %>%
101 |       mutate(Details="Variant kept.")
102 |   } else { # >1 SNP
103 |     if (between(nrow(cur_chr), 1, 5000)) { 
104 |       cur_snps <- cur_chr %>%
105 |         pull(ChrPos)
106 |       
107 |     } else {
108 |       print("Chromosome has >5000 SNPs; breaking into sections...")
109 |       var_df_list <- split(cur_chr, (seq(nrow(cur_chr))-1) %/% 5000)
110 |       cur_snps <- c()
111 |       
112 |       for (j in 1:length(var_df_list)){
113 |         
114 |         print(sprintf("Pruning subset %i for chromosome %i...", j, i))
115 |         var_df <- var_df_list[[j]]
116 |         cur_snps_j <- var_df %>%
117 |           pull(ChrPos)
118 |         
119 |         clipped_res_split <- LDlinkR::SNPclip(
120 |           cur_snps_j,
121 |           pop = pop,
122 |           r2_threshold = r2,
123 |           maf_threshold = maf,
124 |           token = my_token,
125 |           file = FALSE,
126 |           genome_build = "grch37")
127 |         
128 |         clipped_snps <- clipped_res_split %>%
129 |           filter(Details=="Variant kept.") %>%
130 |           pull(RS_Number)
131 |         print(sprintf("Subset %i pruned to %i SNPs...", j, length(clipped_snps)))
132 |         cur_snps <- c(cur_snps, clipped_snps)
133 |       }
134 |     }
135 |     
136 |     print(sprintf("Performing final chromosomal pruning for %i SNPs...", length(cur_snps)))
137 |     clipped_res <- LDlinkR::SNPclip(
138 |       cur_snps,
139 |       pop = pop,
140 |       r2_threshold = r2,
141 |       maf_threshold = maf,
142 |       token = my_token,
143 |       file = FALSE,
144 |       genome_build = "grch37")
145 |   }
146 |   
147 |     fwrite(x=clipped_res,
148 |            file = file.path(output_dir, sprintf("snpClip_results_%s_chr%i.txt", pop, i)),
149 |            quote = F,
150 |            sep = "\t")
151 |     
152 |     df_clipped_final <- clipped_res %>%
153 |       filter(Details=="Variant kept.")
154 |     print(sprintf("Chr%i pruned from %i to %i SNPs...",i, nrow(cur_chr), nrow(df_clipped_final)))
155 |     
156 |     end = Sys.time()
157 |     print(end-start)
158 |     
159 |   }
160 |   print("Done!")
161 |   
162 | }
163 | 
count_traits_per_variant <- function(gwas_variants, ss_files) {

  # Given a vector of variant IDs and a named vector of summary statistics
  # files for the traits to be clustered, return a data frame with one row per
  # variant found (VAR_ID) and one column per trait holding that trait's
  # sample size (N_PH) for the variant; NA where the variant is absent.
  # Side effect: writes the variant list to all_snps_varids.tmp in the working
  # directory (used as the grep pattern file).

  print("Assessing variant missingness across traits...")
  write(gwas_variants, "all_snps_varids.tmp")

  # files that call the sample-size column "N" are renamed to N_PH
  rename_cols <- c(N_PH="N")

  variant_df_list <- lapply(seq_along(ss_files), function(i) {
    print(sprintf("...Reading %s...", names(ss_files)[i]))

    # the first line of the file supplies the column names for the grep output
    headers <- as.character(fread(ss_files[i], nrows=1,
                                  data.table=FALSE, stringsAsFactors=FALSE,
                                  header=FALSE))

    # Pull only the rows that mention one of the variants. `grep -F` replaces
    # the obsolescent `fgrep`; the path is shell-quoted (after ~ expansion) so
    # file names containing spaces work.
    quoted_path <- shQuote(path.expand(ss_files[[i]]))
    grep_cmd <- if (endsWith(ss_files[[i]], ".gz")) {
      sprintf("gzip -cd %s | grep -wFf all_snps_varids.tmp", quoted_path)
    } else {
      sprintf("grep -wFf all_snps_varids.tmp %s", quoted_path)
    }

    df <- fread(cmd=grep_cmd,
                header=FALSE,
                col.names=headers,
                data.table=FALSE,
                stringsAsFactors=FALSE) %>%
      rename(any_of(rename_cols))

    print(nrow(df))
    df
  })

  # make dataframe of Ns: stack the per-trait tables, then spread the sample
  # sizes into one column per trait
  variant_df_list %>%
    setNames(names(ss_files)) %>%
    bind_rows(.id="trait") %>%
    select(trait, VAR_ID, N_PH) %>%
    pivot_wider(names_from="trait", values_from="N_PH") %>%
    data.frame()
}
210 | 
211 | 
find_variants_needing_proxies <- function(gwas_variant_df, var_nonmissingness,
                                          rsID_map_file, missing_cutoff=0.8) {

  # Given a data frame of GWAS variants (VAR_ID = CHR_POS_REF_ALT, plus PVALUE)
  # and a named vector of non-missing trait fractions per variant, return a
  # tibble (VAR_ID, rsID, PVALUE) of the variants that need proxies.
  # Criteria (any of the following):
  #   Strand-ambiguous (AT or GC)
  #   Multi-allelic
  #   Low-count (non-missing fraction < missing_cutoff, or not in the vector)
  # rsID_map_file should point to a whitespace-delimited file with columns
  # corresponding to VAR_ID and rsID

  print("Choosing variants in need of proxies...")

  gwas_variant_df <- gwas_variant_df %>%
    separate(VAR_ID, into=c("CHR", "POS", "REF", "ALT"),
             sep="_", remove=FALSE)

  need_proxies_varid <- with(gwas_variant_df, {
    strand_ambig <- VAR_ID[paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")]
    print(paste0("...", length(strand_ambig), " strand-ambiguous variants"))

    multi_allelic <- grep("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+,[ACGT]+$", VAR_ID, value=TRUE)  # i.e. ALT allele has a comma
    print(paste0("...", length(multi_allelic), " multi-allelic variants"))

    low_cnt <- VAR_ID[!(VAR_ID %in% names(var_nonmissingness)) |
                        var_nonmissingness[VAR_ID] < missing_cutoff]
    print(paste0("...", length(low_cnt), " variants with excessive missingness"))

    unique(c(strand_ambig, multi_allelic, low_cnt))
  })
  print(paste0("...", length(need_proxies_varid), " unique variants in total"))

  # Nothing needs a proxy: return an empty table that still has the columns
  # downstream code (e.g. choose_proxies) reads.
  if (length(need_proxies_varid) == 0) {
    return(tibble(VAR_ID=character(), rsID=character(), PVALUE=numeric()))
  }

  # Look the variants up in the rsID map; the pattern file is removed when the
  # function exits, even if the lookup fails.
  write(need_proxies_varid, "need_proxies_varid.tmp")
  on.exit(unlink("need_proxies_varid.tmp"), add = TRUE)
  varid_rsid_map <- fread(cmd=paste0("grep -wFf need_proxies_varid.tmp ",
                                     shQuote(path.expand(rsID_map_file))),
                          header=FALSE, col.names=c("VAR_ID", "rsID"),
                          data.table=FALSE, stringsAsFactors=FALSE)
  print(paste0("...", length(unique(varid_rsid_map$rsID)),
               " of these are mapped to rsIDs"))

  # variants without an rsID keep rsID = NA
  tibble(VAR_ID=need_proxies_varid) %>%
    left_join(varid_rsid_map, by="VAR_ID") %>%
    left_join(gwas_variant_df[,c("VAR_ID","PVALUE")], by="VAR_ID")
}
263 | 
264 | 
choose_proxies <- function(need_proxies, 
                           rsID_map_file,
                           trait_ss_files,
                           pruned_variants,
                           method="TopLD",
                           LDlink_token=NULL,
                           topLD_path=NULL,
                           population="EUR",
                           frac_nonmissing_num=0.8,
                           r2_num=0.8) {

  # Given the table of variants needing proxies (VAR_ID, rsID, PVALUE; from
  # find_variants_needing_proxies), query an LD service for candidate proxies
  # and return a data frame with the best proxy per variant, or NULL when no
  # candidates were found.
  #   method = "LDlink" / "LDlinkR" / "LDproxy": LDlinkR::LDproxy_batch
  #            (needs LDlink_token)
  #   method = "TopLD": the TopLD command given in topLD_path
  # Criteria for eligibility:
  #   Not strand-ambiguous, not multi-allelic
  #   Trait fraction >= frac_nonmissing_num
  #   r^2 >= r2_num with the index variant (the LDlink branch pre-filters with
  #   a strict R2 > r2_num)
  # Choose based on first trait count, then r^2
  # Side effects: writes several temporary/result files to the working dir.

  print(paste("Num rows need_proxies:",nrow(need_proxies)))
  if (method %in% c("LDlink","LDlinkR","LDproxy")) {

    print("Using LDlinkR:LDproxy_batch to find proxies...")
    # LDproxy is queried by coordinate ("chr<CHR>:<POS>")
    need_proxies <- need_proxies %>%
      separate(VAR_ID, into=c("CHR","POS","REF","ALT"), sep = "_", remove = FALSE) %>%
      mutate(query_snp = paste0("chr", CHR, ":", POS)) %>%
      select(-c(CHR, POS, REF, ALT))
    need_proxies_snps <- need_proxies$query_snp

    LDlinkR::LDproxy_batch(need_proxies_snps,
                           pop = population,
                           r2d = "r2",
                           token = LDlink_token,
                           append = TRUE,
                           genome_build = "grch37")
    # combined results file, presumably written by LDproxy_batch(append = TRUE)
    proxy_df <- read.table("./combined_query_snp_list_grch37.txt", sep = "\t", row.names = NULL) %>%
      filter(R2>r2_num) %>%
      filter(!Coord %in% need_proxies_snps) %>%  # a query SNP cannot serve as a proxy
      inner_join(need_proxies, by = "query_snp") %>%
      arrange(PVALUE) %>%
      filter(!duplicated(RS_Number)) %>%  # each proxy goes to its most significant query SNP
      dplyr::select(rsID, proxy_rsID=RS_Number, r2=R2)
    need_proxies <- need_proxies %>%
      select(-c(query_snp))

  } else if (method=="TopLD") { # use TopLD
    print(sprintf("Using TopLD to find proxies for %s!", population))
    if (nrow(need_proxies)<100) {
      write(need_proxies$rsID, "need_proxies_rsIDs.tmp")
      system(sprintf("%s -thres %.1f -pop %s -maf 0.01 -inFile need_proxies_rsIDs.tmp -outputLD outputLD.txt -outputInfo outputInfo.txt", topLD_path, r2_num, population))
    } else { # need to split up
      print("Splitting proxy df into subsets (more than 100 SNPs)...")
      proxy_df_list <- split(need_proxies, (seq(nrow(need_proxies))-1) %/% 100)

      # Start from an EMPTY outputLD.txt: file.create() truncates an existing
      # file, so results left over from an earlier run are not appended to.
      file.create("outputLD.txt")
      print("Running TopLD for proxy df segments...")
      for (j in seq_along(proxy_df_list)) {
        print(sprintf("Querying LD subset %i/%i",j, length(proxy_df_list)))
        df <- proxy_df_list[[j]]
        write(df$rsID, "need_proxies_rsIDs.tmp")
        system(sprintf("%s -thres %.1f -pop %s -maf 0.01 -inFile need_proxies_rsIDs.tmp -outputLD outputLD_temp.txt -outputInfo outputInfo.txt", topLD_path, r2_num, population))
        # NOTE(review): if each chunk's output starts with a header line, the
        # combined file will contain repeated headers -- confirm TopLD format.
        file.append("outputLD.txt", "outputLD_temp.txt")
      }
    }
    proxy_df <- fread("outputLD.txt", stringsAsFactors = FALSE, data.table = FALSE) %>%
      select(rsID=rsID1, proxy_rsID=rsID2, r2=R2) %>%
      subset(proxy_rsID %like% "rs")
  } else {
    stop("Enter appropriate proxy search method!") # Using stop function
  }
  print(paste("No. possible proxies found:",nrow(proxy_df))) # proxy_df should have columns (rsID, proxy_rsID, r2)
  write(proxy_df$proxy_rsID, "potential_proxies_rsid.tmp")

  if (nrow(proxy_df)>0) {
    print("Creating proxy rsID map...")
    # map candidate proxy rsIDs back to positional VAR_IDs
    potential_proxies_map <- fread(cmd=paste0("grep -wFf potential_proxies_rsid.tmp ",
                                              shQuote(path.expand(rsID_map_file))),
                                   header=FALSE, col.names=c("proxy_VAR_ID", "proxy_rsID"),
                                   data.table=FALSE, stringsAsFactors=FALSE)
    print(head(potential_proxies_map))

    proxy_variants <- potential_proxies_map$proxy_VAR_ID

    proxy_missingness <- count_traits_per_variant(
      proxy_variants,
      trait_ss_files
    )

    # get proxy missingness
    df_Ns_rev <- proxy_missingness %>%
      column_to_rownames("VAR_ID")
    df_Ns_rev[df_Ns_rev == 'NULL'] <- NA

    # get variant counts: fraction of traits with a non-missing N per proxy
    variant_counts_df <- data.frame(VAR_ID=rownames(df_Ns_rev),
                                    frac=rowSums(!is.na(df_Ns_rev))/length(trait_ss_files))

    proxy_missingness <- ifelse(
      proxy_variants %in% variant_counts_df$VAR_ID,
      variant_counts_df$frac[match(proxy_variants, variant_counts_df$VAR_ID)],  # If in counts data frame, take the non-missing fraction
      0  # If not in data frame, then the non-missing fraction is 0
    )
    proxy_missingness <- setNames(proxy_missingness, proxy_variants)

    proxy_missingness_df <- tibble(
      proxy_VAR_ID=names(proxy_missingness),
      frac_nonmissing=proxy_missingness
    )

    final_proxy_df <- proxy_df %>%
      inner_join(potential_proxies_map, by="proxy_rsID") %>%
      separate(proxy_VAR_ID, into=c("CHR", "POS", "REF", "ALT"), 
               sep="_", remove=FALSE) %>%
      inner_join(proxy_missingness_df, by="proxy_VAR_ID") %>%
      filter(
        !(paste0(REF, ALT) %in% c("AT", "TA", "CG", "GC")),  # Not strand-ambiguous
        !grepl("^[0-9]+_[0-9]+_[ACGT]+_[ACGT]+,[ACGT]+$", proxy_VAR_ID),  # Not multi-allelic
        frac_nonmissing >= frac_nonmissing_num,  # Sufficient fraction of traits non-missing
        r2 >= r2_num  # Sufficient LD with the proxied variant
      ) %>%
      group_by(rsID) %>%
      arrange(desc(frac_nonmissing),
              desc(r2),
              CHR) %>%  # Arbitrary sort for reproducibility in case of missingness + r2 ties
      dplyr::slice(1) %>%
      ungroup() %>%
      inner_join(need_proxies, by="rsID") %>%  # added to include orig VAR_ID in output 
      data.frame()
  } else {
    final_proxy_df <- NULL
  }
  proxies_found <- final_proxy_df$rsID

  no_proxies_found <- setdiff(need_proxies$rsID, proxies_found)
  print(paste0("No proxies needed for ", 
               length(setdiff(pruned_variants$VAR_ID, need_proxies$VAR_ID)), 
               " variants."))
  print(paste0("Proxies found for ", length(proxies_found), " variants."))
  print(paste0("No adequate proxies found for ", length(no_proxies_found), 
               " variants."))

  return(final_proxy_df)

}
414 | 
415 | 


--------------------------------------------------------------------------------
/scripts/generate_varid_to_rsid_map_file.R:
--------------------------------------------------------------------------------
 1 | 
 2 | CURRENT <- paste(getwd(),"/",sep="")
 3 | 
 4 | # download all variant information for homo sapiens GRCh37 from Ensembl
 5 | url <- "http://ftp.ensembl.org/pub/grch37/current/variation/vcf/homo_sapiens/1000GENOMES-phase_3.vcf.gz"
 6 | destfile <- paste(CURRENT,"1000GENOMES-phase_3.vcf.gz",sep="")
 7 | download.file(url, destfile)
 8 | 
 9 | library(vcfR)
10 | vcf <- read.vcfR(destfile)
11 | vars = vcf@fix
12 | 
13 | # format output file
14 | vars = as.data.frame(vars[,c("CHROM","POS","ID","REF","ALT")])
15 | vars$VAR_ID = paste(vars$CHROM,vars$POS,vars$REF,vars$ALT,sep="_")
16 | vars = vars[,c("VAR_ID","ID")]
17 | names(vars) = c("VAR_ID","rsID")
18 | 
19 | write.table(vars,file=paste(CURRENT,"VARID_rsID_map_file.txt",sep=""), append = F, quote = F, sep = "\t",
20 |             eol = "\n", na = "NA", dec = ".", row.names = F,
21 |             col.names = F, qmethod = c("escape", "double"))
22 | 


--------------------------------------------------------------------------------
/scripts/run_bNMF.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | 
  3 | 
  4 | ##########################################################################
  5 | # Copyright (c) 2017, Broad Institute
  6 | # Redistribution and use in source and binary forms, with or without
  7 | # modification, are permitted provided that the following conditions are
  8 | # met:
  9 | #     Redistributions of source code must retain the above copyright
 10 | #     notice, this list of conditions and the following disclaimer.
 11 | #     Redistributions in binary form must reproduce the above copyright
 12 | #     notice, this list of conditions and the following disclaimer in
 13 | #     the documentation and/or other materials provided with the
 14 | #     distribution.
 15 | #     Neither the name of the Broad Institute nor the names of its
 16 | #     contributors may be used to endorse or promote products derived
 17 | #     from this software without specific prior written permission.
 18 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 19 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 20 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 21 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 22 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 23 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 24 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 25 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 26 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 27 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 28 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 29 | #########################################################################
 30 | 
 31 | ######################################################################
 32 | # Bayesian NMF algorithms for clustering
 33 | ######################################################################
 34 | # For implementation details see the paper
 35 | # Udler MS, Kim J, von Grotthuss M,
 36 | # Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018)
 37 | # Type 2 diabetes genetic loci informed by multi-trait
 38 | # associations point to disease mechanisms and
 39 | # subtypes: A soft clustering analysis. PLoS Med 15
 40 | # (9): e1002654.
 41 | ###########################
 42 | # For details on the original algorithms 
 43 | # see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence.
 44 | # IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013).
 45 | ######################################################################
 46 | 
 47 | BayesNMF.L2EU <- function(
 48 |   V0, n.iter=10000, a0=10, tol=1e-7, K=15, K0=15, phi=1.0 #20, 10
 49 | ) {
 50 |   
 51 |   # Bayesian NMF with half-normal priors for W and H
 52 |   # V0: input z-score matrix (variants x traits)
 53 |   # n.iter: Number of iterations for parameter optimization
 54 |   # a0: Hyper-parameter for inverse gamma prior on ARD relevance weights
 55 |   # tol: Tolerance for convergence of fitting procedure
 56 |   # K: Number of clusters to be initialized (algorithm may drive some to zero)
 57 |   # K0: Used for setting b0 (lambda prior hyper-parameter) -- should be equal to K
 58 |   # phi: Scaling parameter
 59 |   
 60 |   eps <- 1.e-50
 61 |   del <- 1.0
 62 |   active_nodes <- colSums(V0) != 0
 63 |   V0 <- V0[, active_nodes]
 64 |   V <- V0 - min(V0)
 65 |   Vmin <- min(V)
 66 |   Vmax <- max(V)
 67 |   N <- dim(V)[1]
 68 |   M <- dim(V)[2]
 69 |   
 70 |   W <- matrix(runif(N * K) * Vmax, ncol=K)
 71 |   H <- matrix(runif(M * K) * Vmax, ncol=M)
 72 |   
 73 |   I <- array(1, dim=c(N, M))
 74 |   V.ap <- W %*% H + eps
 75 |   
 76 |   phi <- sd(V)^2 * phi
 77 |   C <- (N + M) / 2 + a0 + 1
 78 |   b0 <- 3.14 * (a0 - 1) * mean(V) / (2 * K0)
 79 |   lambda.bound <- b0 / C
 80 |   lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
 81 |   lambda.cut <- lambda.bound * 1.5
 82 |   
 83 |   n.like <- list()
 84 |   n.evid <- list()
 85 |   n.error <- list()
 86 |   n.lambda <- list()
 87 |   n.lambda[[1]] <- lambda
 88 |   iter <- 2
 89 |   count <- 1
 90 |   while (del >= tol & iter < n.iter) {
 91 |     H <- H * (t(W) %*% V) / 
 92 |       (t(W) %*% V.ap + phi * H * matrix(rep(1 / lambda, M), ncol=M) + eps)
 93 |     V.ap <- W %*% H + eps
 94 |     W <- W * (V %*% t(H)) / 
 95 |       (V.ap %*% t(H) + phi * W * t(matrix(rep(1 / lambda, N), ncol=N)) + eps)
 96 |     V.ap <- W %*% H + eps
 97 |     lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
 98 |     del <- max(abs(lambda - n.lambda[[iter - 1]]) / n.lambda[[iter - 1]])
 99 |     like <- sum((V - V.ap)^2) / 2
100 |     n.like[[iter]] <- like
101 |     n.evid[[iter]] <- like + phi * sum((0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / 
102 |                                          lambda + C * log(lambda))
103 |     n.lambda[[iter]] <- lambda
104 |     n.error[[iter]] <- sum((V - V.ap)^2)
105 |     if (iter %% 100 == 0) {
106 |       cat(iter, n.evid[[iter]], n.like[[iter]], n.error[[iter]], del, 
107 |           sum(colSums(W) != 0), sum(lambda >= lambda.cut), '\n')
108 |     }
109 |     iter <- iter + 1
110 |   }
111 |   return(list(
112 |     W,  # Variant weight matrix (N x K)
113 |     H,  # Trait weight matrix (K x M)
114 |     n.like,  # List of reconstruction errors (sum of squared errors / 2) per iteration
115 |     n.evid,  # List of negative log-likelihoods per iteration
116 |     n.lambda,  # List of lambda vectors (shared weights for each of K clusters, some ~0) per iteration
117 |     n.error  # List of reconstruction errors (sum of squared errors) per iteration
118 |   ))
119 | }
120 | 
121 | 
run_bNMF <- function(z_mat, n_reps=10, random_seed=1, K=20, K0=10, tolerance=1e-7) {
  
  # Given an input matrix as created by prep_z_matrix(), run the bNMF procedure
  # a series of times to generate results and evaluate cluster stability
  #
  # z_mat: input matrix passed to BayesNMF.L2EU() as V0
  # n_reps: number of independent bNMF runs (0 returns an empty list)
  # random_seed: seed set once before the first run
  # K, K0: forwarded to BayesNMF.L2EU()
  # tolerance: forwarded to BayesNMF.L2EU() as tol
  #
  # Returns a list of length n_reps; each element is one run's result with the
  # names W, H, n.like, n.evid, n.lambda, n.error.
  
  print(paste0("Running bNMF clustering procedure (", n_reps, " iterations)..."))
  print(sprintf("Using tolerance of %.2e!",tolerance))
  
  set.seed(random_seed)
  
  component_names <- c("W", "H", "n.like", "n.evid", "n.lambda", "n.error")
  
  # seq_len() rather than 1:n_reps, so that n_reps = 0 performs no runs
  # instead of iterating over c(1, 0).
  lapply(seq_len(n_reps), function(r) {
    print(paste("ITERATION",r))
    res <- BayesNMF.L2EU(V0 = z_mat, K = K, K0 = K0, tol = tolerance)
    names(res) <- component_names
    res
  })
}
140 | 
141 | 
summarize_bNMF <- function(bnmf_reps, dir_save=NULL) {
  
  # Given output from bNMF (list of length N_iterations),
  # generate summary tables and plots
  #
  # bnmf_reps: list of per-run results; each element must provide W, H,
  #            n.lambda and n.evid (as returned by run_bNMF())
  # dir_save: output directory (created if missing); NULL writes to "./"
  #
  # Files written:
  #   run_summary.txt                   run index, K and final evidence per run
  #   L2EU.W.mat.<K>.txt                W matrix of the best run for each K
  #   L2EU.H.mat.<K>.txt                H matrix of the best run for each K
  #   W_plot_K<K>.pdf, H_plot_K<K>.pdf  heatmaps of those matrices
  #
  # Returns (invisibly) the per-K results of the final ggsave() calls.
  
  if (length(bnmf_reps) == 0) {
    stop("bnmf_reps is empty: nothing to summarize", call. = FALSE)
  }
  
  make_run_summary <- function(reps) {
    
    # Given a list of bNMF iteration outputs, summarize the K choices and associated likelihoods across runs
    
    run_summary <- map_dfr(seq_along(reps), function(i) {
      res <- reps[[i]]
      final_lambdas <- res$n.lambda[[length(res$n.lambda)]]
      tibble(
        run=i,
        K=sum(final_lambdas > min(final_lambdas)),  # Assume that lambdas equal to the minimum lambda are ~ 0
        evid=res$n.evid[[length(res$n.evid)]]  # Evidence = -log_likelihood
      )
    }) %>%
      arrange(evid)
    
    unique.K <- table(run_summary$K)
    # bNMF run index with the maximum posterior (lowest evidence) for given K
    MAP.K.run <- vapply(names(unique.K), function(k) {
      tmp <- run_summary[run_summary$K == k, ]
      tmp$run[which.min(tmp$evid)]
    }, integer(1))
    
    list(run_tbl=run_summary, unique.K=unique.K, MAP.K.run=MAP.K.run)
  }
  
  if (!is.null(dir_save)) {
    # Quiet when the directory already exists; also creates nested paths.
    dir.create(file.path(dir_save), showWarnings = FALSE, recursive = TRUE)
    dir_save <- paste0(dir_save, "/")
  } else {
    dir_save <- "./"
  }
  
  print("Summarizing bNMF results...")
  
  print("Writing table of chosen K across iterations...")
  run_summary <- make_run_summary(bnmf_reps)
  write_tsv(run_summary$run_tbl, paste0(dir_save,"run_summary.txt"))
  
  # Shared plot styling
  scale0 <- 0.8
  scale <- 1
  color.axis <- "black"
  .theme_ss <- theme_bw(base_size=12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8 * scale, 
                                     family="mono", face='bold', color=color.axis),
          axis.text.y = element_text(hjust = 0.5,size=12 * scale, family="mono",face='bold',color=color.axis),
          axis.text = element_text(size = 12 * scale, family = "mono",color=color.axis),
          axis.title=element_text(face="bold", size=12 * scale,color="black"),
          plot.title=element_text(face="bold", size=12 * scale))
  
  # Layers common to the W and H heatmaps; only title and x label differ.
  heatmap_layers <- function(title, x_label) {
    list(
      geom_tile(),
      scale_fill_gradient2(low="white", high ="black", name="Activity"),
      .theme_ss,
      ggtitle(title),
      ylab("Cluster"),
      xlab(x_label),
      theme(axis.title.x = element_text(face="bold", colour="black", size=12 * scale0)),
      theme(axis.title.y = element_text(face="bold", colour="black", size=12 * scale0)),
      theme(legend.position="right"),
      theme(legend.key.size = unit(0.5, "cm"))
    )
  }
  
  print("Plotting variant and trait contributions...")
  # sapply() is kept so the (invisible) return value matches earlier versions.
  invisible(sapply(names(run_summary$unique.K), function(k) {  # Create heatmaps for MAP iteration for each K
    res <- bnmf_reps[[run_summary$MAP.K.run[[k]]]]
    
    # Keep only clusters with non-zero weights. drop = FALSE keeps both
    # objects as matrices when a single cluster remains.
    W <- res$W[, colSums(res$W) != 0, drop = FALSE]  # feature-cluster association matrix
    H <- res$H[rowSums(res$H) != 0, , drop = FALSE]  # cluster-gene association matrix
    W[W < 1.e-10] <- 0
    H[H < 1.e-10] <- 0
    
    # Carry the matrix row names (if any) along as an explicit column.
    W0 <- data.frame(W)
    W0[, "variant"] <- rownames(W)
    H0 <- data.frame(H)
    H0[, "cluster"] <- rownames(H)
    
    write_tsv(W0, file=paste0(dir_save,"L2EU.W.mat.", k, ".txt"))
    write_tsv(H0, file=paste0(dir_save,"L2EU.H.mat.", k, ".txt"))
    
    # Plot W matrix (feature activities); variants ordered by hierarchical clustering
    W_hc <- hclust(dist(W, method="euclidean"), method="ward.D")
    W_variant.ordering <- W_hc$labels[W_hc$order]
    W_plt_df <- W %>%
      as.data.frame() %>%
      rownames_to_column(var="variant") %>%
      gather(key="cluster", value="activity", -variant) %>%
      mutate(variant=factor(variant, levels=W_variant.ordering),
             cluster=factor(cluster, 
                            levels=paste0("V", seq_len(ncol(W)))))
    W_plt <- ggplot(W_plt_df, aes(x=variant, y=cluster, fill=activity)) + 
      heatmap_layers(paste0("Variant Association to Clusters (k=", k, ")"), "Variant")
    ggsave(paste0(dir_save,"W_plot_K", k, ".pdf"), plot=W_plt)
    
    # Plot H matrix (trait activities); traits ordered by hierarchical clustering
    H_hc <- hclust(dist(t(H), method="euclidean"), method="ward.D")
    H_trait.ordering <- H_hc$labels[H_hc$order]
    H_plt_df <- t(H) %>%
      as.data.frame() %>%
      rownames_to_column(var="trait") %>%
      gather(key="cluster", value="activity", -trait) %>%
      mutate(cluster=factor(cluster, levels=paste0("V", seq_len(nrow(H)))),
             trait=factor(trait, levels=H_trait.ordering))
    # This plot shows traits, so it is titled as such (it previously reused
    # the variant title).
    H_plt <- ggplot(H_plt_df, aes(x=trait, y=cluster, fill=activity)) + 
      heatmap_layers(paste0("Trait Association to Clusters (k=", k, ")"), "Trait")
    ggsave(paste0(dir_save,"H_plot_K", k, ".pdf"), plot=H_plt)
  }))
}
270 | 


--------------------------------------------------------------------------------
/scripts/run_bNMF_2025.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(furrr)
  3 | library(progressr)
  4 | library(rtracklayer)
  5 | library(vroom)
  6 | ##########################################################################
  7 | # Copyright (c) 2017, Broad Institute
  8 | # Redistribution and use in source and binary forms, with or without
  9 | # modification, are permitted provided that the following conditions are
 10 | # met:
 11 | #     Redistributions of source code must retain the above copyright
 12 | #     notice, this list of conditions and the following disclaimer.
 13 | #     Redistributions in binary form must reproduce the above copyright
 14 | #     notice, this list of conditions and the following disclaimer in
 15 | #     the documentation and/or other materials provided with the
 16 | #     distribution.
 17 | #     Neither the name of the Broad Institute nor the names of its
 18 | #     contributors may be used to endorse or promote products derived
 19 | #     from this software without specific prior written permission.
 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 21 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 22 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 23 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 24 | # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 25 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 26 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 27 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 28 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 29 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 30 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 31 | #########################################################################
 32 | 
 33 | ######################################################################
 34 | # Bayesian NMF algorithms for clustering
 35 | ######################################################################
 36 | # For implementation details see the paper
 37 | # Udler MS, Kim J, von Grotthuss M,
 38 | # Bonàs-Guarch S, Cole JB, Chiou J, et al. (2018)
 39 | # Type 2 diabetes genetic loci informed by multi-trait
 40 | # associations point to disease mechanisms and
 41 | # subtypes: A soft clustering analysis. PLoS Med 15
 42 | # (9): e1002654.
 43 | ###########################
 44 | # For details on the original algorithms 
 45 | # see Tan, V.Y. & Févotte, C. Automatic relevance determination in nonnegative matrix factorization with the beta-divergence.
 46 | # IEEE Trans. Pattern Anal. Mach. Intell. 35, 1592–1605 (2013).
 47 | ######################################################################
 48 | 
 49 | 
 50 | BayesNMF.L2EU <- function(
 51 |     V0, n.iter=10000, a0=10, tol=1e-7, K=15, K0=15, phi=1.0,
 52 |     window_size=25, min_iter=100  # New parameters for convergence monitoring
 53 | ) {
 54 |   # ... existing initial comments ...
 55 |   
 56 |   eps <- 1.e-50
 57 |   del <- 1.0
 58 |   active_nodes <- colSums(V0) != 0
 59 |   V0 <- V0[, active_nodes]
 60 |   V <- V0 - min(V0)
 61 |   Vmin <- min(V)
 62 |   Vmax <- max(V)
 63 |   N <- dim(V)[1]
 64 |   M <- dim(V)[2]
 65 |   
 66 |   # Pre-allocate matrices
 67 |   W <- matrix(runif(N * K) * Vmax, ncol=K)
 68 |   H <- matrix(runif(M * K) * Vmax, ncol=M)
 69 |   V.ap <- matrix(0, nrow=N, ncol=M)
 70 |   lambda_matrix_M <- matrix(0, nrow=K, ncol=M)
 71 |   lambda_matrix_N <- matrix(0, nrow=K, ncol=N)
 72 |   
 73 |   I <- array(1, dim=c(N, M))
 74 |   V.ap <- W %*% H + eps
 75 |   
 76 |   phi <- sd(V)^2 * phi
 77 |   C <- (N + M) / 2 + a0 + 1
 78 |   b0 <- 3.14 * (a0 - 1) * mean(V) / (2 * K0)
 79 |   lambda.bound <- b0 / C
 80 |   lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
 81 |   lambda.cut <- lambda.bound * 1.5
 82 |   
 83 |   n.like <- list()
 84 |   n.evid <- list()
 85 |   n.error <- list()
 86 |   n.lambda <- list()
 87 |   n.active <- list()
 88 |   n.lambda[[1]] <- lambda
 89 |   
 90 |   # Function to check convergence stability
 91 |   check_convergence <- function(errors, window_size) {
 92 |     if (length(errors) < window_size) return(FALSE)
 93 |     recent <- tail(unlist(errors), window_size)
 94 |     # rel_change <- abs((recent[1] - recent[window_size]) / recent[1])
 95 |     rel_change <- abs(mean(diff(recent)) / mean(recent))
 96 |     return(rel_change < tol)
 97 |   }
 98 |   
 99 |   iter <- 2
100 |   
101 | 
102 |   while (del >= tol & iter < n.iter) {
103 |     # Update matrices efficiently
104 |     lambda_matrix_M[] <- rep(1/lambda, M)
105 |     lambda_matrix_N[] <- rep(1/lambda, N)
106 |     
107 |     # Update H
108 |     H <- H * (t(W) %*% V) / (t(W) %*% V.ap + phi * H * lambda_matrix_M + eps)
109 |     V.ap <- W %*% H + eps
110 |     
111 |     # Update W
112 |     W <- W * (V %*% t(H)) / (V.ap %*% t(H) + phi * W * t(lambda_matrix_N) + eps)
113 |     V.ap <- W %*% H + eps
114 |     
115 |     # Update lambda and calculate metrics
116 |     lambda <- (0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / C
117 |     lambda[is.na(lambda)] <- 1e-6
118 |     lambda[is.nan(lambda)] <- 1e-6
119 |     lambda[lambda == Inf] <- 1e-6
120 |     
121 |     # Clean del calculation with finite checks
122 |     # del <- max(abs(lambda - n.lambda[[iter - 1]]) / n.lambda[[iter - 1]])
123 |     del <- if (!is.null(n.lambda[[iter - 1]]) && all(is.finite(n.lambda[[iter - 1]]))) {
124 |       max(abs(lambda - n.lambda[[iter - 1]]) / (n.lambda[[iter - 1]] + 1e-10))
125 |     } else {
126 |       Inf
127 |     }
128 |     
129 |     like <- sum((V - V.ap)^2) / 2
130 |     n.like[[iter]] <- like
131 |     n.evid[[iter]] <- like + phi * sum((0.5 * colSums(W^2) + 0.5 * rowSums(H^2) + b0) / 
132 |                                          lambda + C * log(lambda))
133 |     n.error[[iter]] <- sum((V - V.ap)^2)
134 |     n.lambda[[iter]] <- lambda
135 |     n.active[[iter]] <- sum(lambda >= lambda.cut)
136 |     
137 |     # Progress monitoring
138 |     if (iter %% 100 == 0) {
139 |       cat(sprintf("Iter %d: Error=%g, Delta=%g, Active=%d, Evidence=%g\n", 
140 |                   iter, n.error[[iter]], del, n.active[[iter]], n.evid[[iter]]))
141 |     }
142 |     
143 |     # Early stopping check (but only after minimum iterations)
144 |     if (iter > min_iter && iter %% 20 == 0) {
145 |       if (check_convergence(n.error, window_size)) {
146 |         cat("Converged based on error stability\n")
147 |         break
148 |       }
149 |     }
150 |     
151 |     iter <- iter + 1
152 |   }
153 |   
154 |   # Return results with convergence metrics
155 |   return(list(
156 |     W = W,
157 |     H = H,
158 |     n.like = n.like,
159 |     n.evid = n.evid,
160 |     n.lambda = n.lambda,
161 |     n.error = n.error,
162 |     n.active = n.active,
163 |     iterations = iter - 1,
164 |     converged = del < tol
165 |   ))
166 | }
167 | 
# Function to test multiple phi values
test_phi_values <- function(V0, phi_values = c(1.0, 2.0, 5.0, 10.0), n_reps = 10, ...) {
  
  # For each phi, run BayesNMF.L2EU() n_reps times and keep the run with the
  # lowest final reconstruction error.
  #
  # V0: input matrix passed to BayesNMF.L2EU()
  # phi_values: phi settings to compare
  # n_reps: number of runs per phi value
  # ...: further arguments forwarded to BayesNMF.L2EU()
  #
  # Returns a list keyed by as.character(phi) holding the best run for that
  # phi. A phi with no run that produced a finite error has no entry.
  
  results <- list()
  
  for (phi in phi_values) {
    cat(sprintf("\nTesting phi = %g\n", phi))
    
    best_solution <- NULL
    best_error <- Inf  # Initialize with a high error value
    
    for (i in seq_len(n_reps)) {
      result <- BayesNMF.L2EU(V0, phi = phi, ...)
      
      # n.error is a per-iteration list, so comparing it directly against a
      # scalar is invalid in if(); rank runs by the last recorded error.
      # unlist() skips the never-filled first slot.
      final_error <- tail(unlist(result[["n.error"]]), 1)
      
      if (length(final_error) == 1 && is.finite(final_error) &&
          final_error < best_error) {
        best_solution <- result
        best_error <- final_error
      }
    }
    
    results[[as.character(phi)]] <- best_solution
  }
  
  results
}
# Example usage:
# phi_test_results <- test_phi_values(V0, phi_values=c(1.0, 2.0, 5.0, 10.0))
# names(phi_test_results)  # "1" "2" "5" "10": the best run for each phi
196 | 
197 | 
198 | 
run_bNMF <- function(z_mat, n_reps=10, random_seed=1, K=20, K0=10, tolerance=1e-7, phi=1) {
  
  # Given an input matrix as created by prep_z_matrix(), run the bNMF procedure
  # a series of times to generate results and evaluate cluster stability
  #
  # z_mat: input matrix passed to BayesNMF.L2EU() as V0
  # n_reps: number of independent bNMF runs (0 returns an empty list)
  # random_seed: seed set once before the first run
  # K, K0, phi: forwarded to BayesNMF.L2EU()
  # tolerance: forwarded to BayesNMF.L2EU() as tol
  #
  # Returns a list of length n_reps holding the result of each run as
  # returned by BayesNMF.L2EU().
  
  print(paste0("Running bNMF clustering procedure (", n_reps, " iterations)..."))
  print(sprintf("Using tolerance of %.2e!",tolerance))
  
  set.seed(random_seed)
  
  # seq_len() rather than 1:n_reps, so that n_reps = 0 performs no runs
  # instead of iterating over c(1, 0).
  lapply(seq_len(n_reps), function(r) {
    print(paste("ITERATION",r))
    BayesNMF.L2EU(V0 = z_mat, K = K, K0 = K0, tol = tolerance, phi = phi)
  })
}
217 | 
218 | 
219 | # Activate a global handler for progress bars
220 | 
run_bNMF_parallel <- function(z_mat, n_reps = 10, random_seed = 1, K = 20, K0 = 10, tolerance = 1e-7, phi=1) {
  
  # Same interface as run_bNMF(), but the repetitions are dispatched through
  # furrr::future_map().
  #
  # z_mat: input matrix passed to BayesNMF.L2EU() as V0
  # n_reps: number of independent bNMF runs (0 returns an empty list)
  # random_seed: handed to furrr_options(seed = ...) for the workers' RNG
  # K, K0, phi: forwarded to BayesNMF.L2EU()
  # tolerance: forwarded to BayesNMF.L2EU() as tol
  #
  # Returns a list of length n_reps holding the result of each run.
  #
  # NOTE(review): no future plan is set here; presumably the caller selects
  # one (e.g. future::plan(multisession)) before calling -- confirm.
  
  print(paste0("Running bNMF clustering procedure in parallel! (", n_reps, " iterations)..."))
  print(sprintf("Using tolerance of %.2e!", tolerance))
  
  # Run each repetition through future_map. seq_len() rather than 1:n_reps,
  # so that n_reps = 0 performs no runs instead of iterating over c(1, 0).
  future_map(
    seq_len(n_reps),
    function(rep_idx) {
      # Logging progress
      cat(paste("ITERATION", rep_idx), "\n")
      
      BayesNMF.L2EU(V0 = z_mat, K = K, K0 = K0, tol = tolerance, phi = phi)
    },
    .options = furrr_options(seed = random_seed)  # Set the seed to a specific number
  )
}
244 | 
summarize_bNMF <- function(bnmf_reps, dir_save=NULL) {
  
  # Given output from bNMF (list of length N_iterations),
  # generate summary tables and plots
  #
  # bnmf_reps: list of per-run results; each element must provide W, H,
  #            n.lambda and n.evid (as returned by run_bNMF())
  # dir_save: output directory (created if missing); NULL writes to "./"
  #
  # Files written:
  #   run_summary.txt                   run index, K and final evidence per run
  #   L2EU.W.mat.<K>.txt                W matrix of the best run for each K
  #   L2EU.H.mat.<K>.txt                H matrix of the best run for each K
  #   W_plot_K<K>.pdf, H_plot_K<K>.pdf  heatmaps of those matrices
  #
  # Returns (invisibly) the per-K results of the final ggsave() calls.
  
  if (length(bnmf_reps) == 0) {
    stop("bnmf_reps is empty: nothing to summarize", call. = FALSE)
  }
  
  make_run_summary <- function(reps) {
    
    # Given a list of bNMF iteration outputs, summarize the K choices and associated likelihoods across runs
    
    run_summary <- map_dfr(seq_along(reps), function(i) {
      res <- reps[[i]]
      final_lambdas <- res$n.lambda[[length(res$n.lambda)]]
      tibble(
        run=i,
        K=sum(final_lambdas > min(final_lambdas)),  # Assume that lambdas equal to the minimum lambda are ~ 0
        evid=res$n.evid[[length(res$n.evid)]]  # Evidence = -log_likelihood
      )
    }) %>%
      arrange(evid)
    
    unique.K <- table(run_summary$K)
    # bNMF run index with the maximum posterior (lowest evidence) for given K
    MAP.K.run <- vapply(names(unique.K), function(k) {
      tmp <- run_summary[run_summary$K == k, ]
      tmp$run[which.min(tmp$evid)]
    }, integer(1))
    
    list(run_tbl=run_summary, unique.K=unique.K, MAP.K.run=MAP.K.run)
  }
  
  if (!is.null(dir_save)) {
    # Quiet when the directory already exists; also creates nested paths.
    dir.create(file.path(dir_save), showWarnings = FALSE, recursive = TRUE)
    dir_save <- paste0(dir_save, "/")
  } else {
    dir_save <- "./"
  }
  
  print("Summarizing bNMF results...")
  
  print("Writing table of chosen K across iterations...")
  run_summary <- make_run_summary(bnmf_reps)
  write_tsv(run_summary$run_tbl, paste0(dir_save,"run_summary.txt"))
  
  # Shared plot styling
  scale0 <- 0.8
  scale <- 1
  color.axis <- "black"
  .theme_ss <- theme_bw(base_size=12) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size=8 * scale, 
                                     family="mono", face='bold', color=color.axis),
          axis.text.y = element_text(hjust = 0.5,size=12 * scale, family="mono",face='bold',color=color.axis),
          axis.text = element_text(size = 12 * scale, family = "mono",color=color.axis),
          axis.title=element_text(face="bold", size=12 * scale,color="black"),
          plot.title=element_text(face="bold", size=12 * scale))
  
  # Layers common to the W and H heatmaps; only title and x label differ.
  heatmap_layers <- function(title, x_label) {
    list(
      geom_tile(),
      scale_fill_gradient2(low="white", high ="black", name="Activity"),
      .theme_ss,
      ggtitle(title),
      ylab("Cluster"),
      xlab(x_label),
      theme(axis.title.x = element_text(face="bold", colour="black", size=12 * scale0)),
      theme(axis.title.y = element_text(face="bold", colour="black", size=12 * scale0)),
      theme(legend.position="right"),
      theme(legend.key.size = unit(0.5, "cm"))
    )
  }
  
  print("Plotting variant and trait contributions...")
  # sapply() is kept so the (invisible) return value matches earlier versions.
  invisible(sapply(names(run_summary$unique.K), function(k) {  # Create heatmaps for MAP iteration for each K
    res <- bnmf_reps[[run_summary$MAP.K.run[[k]]]]
    
    # Keep only clusters with non-zero weights. drop = FALSE keeps both
    # objects as matrices when a single cluster remains.
    W <- res$W[, colSums(res$W) != 0, drop = FALSE]  # feature-cluster association matrix
    H <- res$H[rowSums(res$H) != 0, , drop = FALSE]  # cluster-gene association matrix
    W[W < 1.e-10] <- 0
    H[H < 1.e-10] <- 0
    
    # Carry the matrix row names (if any) along as an explicit column.
    W0 <- data.frame(W)
    W0[, "variant"] <- rownames(W)
    H0 <- data.frame(H)
    H0[, "cluster"] <- rownames(H)
    
    write_tsv(W0, file=paste0(dir_save,"L2EU.W.mat.", k, ".txt"))
    write_tsv(H0, file=paste0(dir_save,"L2EU.H.mat.", k, ".txt"))
    
    # Plot W matrix (feature activities); variants ordered by hierarchical clustering
    W_hc <- hclust(dist(W, method="euclidean"), method="ward.D")
    W_variant.ordering <- W_hc$labels[W_hc$order]
    W_plt_df <- W %>%
      as.data.frame() %>%
      rownames_to_column(var="variant") %>%
      gather(key="cluster", value="activity", -variant) %>%
      mutate(variant=factor(variant, levels=W_variant.ordering),
             cluster=factor(cluster, 
                            levels=paste0("V", seq_len(ncol(W)))))
    W_plt <- ggplot(W_plt_df, aes(x=variant, y=cluster, fill=activity)) + 
      heatmap_layers(paste0("Variant Association to Clusters (k=", k, ")"), "Variant")
    ggsave(paste0(dir_save,"W_plot_K", k, ".pdf"), plot=W_plt)
    
    # Plot H matrix (trait activities); traits ordered by hierarchical clustering
    H_hc <- hclust(dist(t(H), method="euclidean"), method="ward.D")
    H_trait.ordering <- H_hc$labels[H_hc$order]
    H_plt_df <- t(H) %>%
      as.data.frame() %>%
      rownames_to_column(var="trait") %>%
      gather(key="cluster", value="activity", -trait) %>%
      mutate(cluster=factor(cluster, levels=paste0("V", seq_len(nrow(H)))),
             trait=factor(trait, levels=H_trait.ordering))
    # This plot shows traits, so it is titled as such (it previously reused
    # the variant title).
    H_plt <- ggplot(H_plt_df, aes(x=trait, y=cluster, fill=activity)) + 
      heatmap_layers(paste0("Trait Association to Clusters (k=", k, ")"), "Trait")
    ggsave(paste0(dir_save,"H_plot_K", k, ".pdf"), plot=H_plt)
  }))
}
373 | 
374 | 


--------------------------------------------------------------------------------