├── 06_all_omics_UKB_phecode_assoc_test ├── 01_all_omics_PGS_UKB_disease_assoc_one_platform.sh ├── 01_all_omics_PGS_UKB_disease_assoc.sh └── 01_all_omics_PGS_UKB_disease_assoc.py ├── GCTB ├── 04a_merge_shrunk_all_chrs │ ├── gctb_merge_shrunk_bins_all_chrs.sh │ ├── gctb_merge_shrunk_sparse_bins_all_chrs.sh │ └── gctb_gen_mldmlist_shrunk_all_chrs.sh ├── 04_merge_shrunk_sparse_all_chrs │ ├── gctb_merge_shrunk_sparse_bins_all_chrs.sh │ └── gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh ├── 02a_gen_shrunk_corr_data_mcpu │ ├── gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh │ ├── gctb_merge_shrunk_bins_by_chr.sh │ ├── gctb_gen_mldmlist_per_chr.sh │ └── gctb_gen_shrunk_corr_matrix_mcpu.sh ├── logs │ └── run_SBayesS_shrunk_M48434_6216077_4294967294.e ├── 03_convert_to_sparse │ └── gctb_convert_to_sparse_shrunk_corr_matrix.sh ├── README.md ├── 02_gen_shrunk_corr_data │ └── gctb_gen_shrunk_corr_matrix.sh ├── 06_run_SbayesS.sh ├── 06_run_SbayesS │ └── run_SbayesS.sh ├── 04a_merge_shrunk_all_chrs_job.sh ├── 06a_run_SbayesS_with_shrunk_corr │ └── run_SbayesS_with_shrunk.sh ├── 04_merge_shrunk_sparse_all_chrs_job.sh ├── 06a_run_SbayesS_with_shrunk_corr.sh ├── 02a_merge_shrunk_bins_job.sh ├── 03_to_shrunk_sparse_job.sh ├── 02a_gen_shrunk_corr_data_mcpu_job.sh ├── 01_gen_genetic_data_hapmap3_variants │ └── gen_INTERVAL_hapmap3_vars_genetic_data.sh ├── 02_gen_shrunk_corr_data_job.sh └── 01_gen_genetic_data_hapmap3_variants_job.sh ├── 01_convert_bgen ├── 04_fix_var_ids.R ├── 01_convert_bgen.sh ├── 03_filter_duplicates.sh └── 02_flag_duplicates.R ├── 04_extract_QTLs ├── 02_helpers │ └── reformat_dosages.R ├── 02_extract_QTL_dosages.sh └── 01_extract_QTLs.R ├── 05_genetic_score_training ├── Traditional_GRS.py ├── BayesianRidge.py └── 01_run_omics_pgs_training with_br.py ├── 03_collate_QTLs.job ├── 02_ldthin ├── 02_ldthin.sh └── 01_identify_snps.R ├── 06_all_omics_ukb_phecode_disease_assoc_test.job ├── 05_genetic_score_training.job ├── 02_ldthin.job ├── 04_extract_QTLs.job ├── 01_convert_bgen.job ├── README.md ├── 03_collate_QTLs └── 01_collate_QTLs.R └── LDpred2 └── LDpred2_auto.R /06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc_one_platform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | module load miniconda3 4 | source activate ml 5 | 6 | python /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/scripts_gene_expressions/08_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.py ${1} ${2} 7 | -------------------------------------------------------------------------------- /06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | module load miniconda3 4 | source activate ml 5 | 6 | i=$SLURM_ARRAY_TASK_ID 7 | 8 | python /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/scripts_gene_expressions/08_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.py ${1} ${i} 9 | -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs/gctb_merge_shrunk_bins_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --mldm 
${PWD}/interval_shrunk_only_chr_all.mldmlist --make-shrunk-ldm --out ${PWD}/interval_shrunk_chr_all -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs/gctb_merge_shrunk_sparse_bins_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --mldm ${PWD}/interval_shrunk_chr_all.mldmlist --make-sparse-ldm --chisq 0 --out ${PWD}/interval_shrunk_chr_all -------------------------------------------------------------------------------- /GCTB/04_merge_shrunk_sparse_all_chrs/gctb_merge_shrunk_sparse_bins_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --mldm ${PWD}/interval_shrunk_chr_all.mldmlist --make-sparse-ldm --chisq 0 --out ${PWD}/interval_shrunk_chr_all -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs/gctb_gen_mldmlist_shrunk_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 5 | 6 | out=interval_shrunk_chr 7 | 8 | 9 | for i in $( seq 1 22 ) 10 | do 11 | 12 | echo "${PWD}/${out}${i}.ldm.shrunk" >> "${PWD}/interval_shrunk_only_chr_all.mldmlist" 13 | 14 | done -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 5 | 6 | out=interval_shrunk_chr 7 | 8 | 9 | for i in $( seq 1 22 ) 10 | do 11 | 12 | echo "${PWD}/${out}${i}.ldm.sparse" >> "${PWD}/interval_shrunk_chr_all.mldmlist" 13 | 14 | done -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_merge_shrunk_bins_by_chr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | out=interval_shrunk_chr${SLURM_ARRAY_TASK_ID} 8 | 9 | 10 | 11 | $gctb --mldm ${PWD}/${out}.mldmlist --make-shrunk-ldm --out ${PWD}/${out} -------------------------------------------------------------------------------- /GCTB/04_merge_shrunk_sparse_all_chrs/gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 5 | 6 | out=interval_shrunk_chr 7 | 8 | 9 | for i in $( seq 1 22 ) 10 | do 11 | 12 | echo "${PWD}/${out}${i}.ldm.sparse" >> "${PWD}/interval_shrunk_chr_all.mldmlist" 13 | 14 | done 
-------------------------------------------------------------------------------- /01_convert_bgen/04_fix_var_ids.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | # Remove extra row identifier tacked onto the variant IDs 4 | for (chr_id in 1:22) { 5 | pvar = fread(sprintf("geno_files/genotype_data/impute_%s_interval_dedup.pvar", chr_id)) 6 | pvar[, ID := gsub(":[0-9]+?$", "", ID)] 7 | fwrite(pvar, sep="\t", quote=FALSE, file=sprintf("geno_files/genotype_data/impute_%s_interval_dedup.pvar", chr_id)) 8 | } 9 | 10 | -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_gen_mldmlist_per_chr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | k=5000 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | out=interval_shrunk_chr${1} 9 | 10 | 11 | for i in $( seq 1 ${2} ) 12 | do 13 | 14 | echo "${PWD}/${out}.snp$((k*(i-1)+1))-$((k*i)).ldm.shrunk" >> "${PWD}/${out}.mldmlist" 15 | 16 | done -------------------------------------------------------------------------------- /01_convert_bgen/01_convert_bgen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ref_dir=$HOME/rds/rds-jmmh2-post_qc_data/interval/imputed/uk10k_1000g_b37/imputed 4 | out_dir=geno_files/genotype_data/ 5 | mkdir -p $out_dir 6 | 7 | chr=$SLURM_ARRAY_TASK_ID 8 | 9 | plink2 --bgen $ref_dir/impute_${chr}_interval.bgen \ 10 | --sample $ref_dir/interval.samples \ 11 | --threads $SLURM_CPUS_ON_NODE \ 12 | --memory $SLURM_MEM_PER_NODE \ 13 | --silent \ 14 | --make-pgen \ 15 | --out $out_dir/impute_${chr}_interval 16 | 17 | -------------------------------------------------------------------------------- /GCTB/logs/run_SBayesS_shrunk_M48434_6216077_4294967294.e: -------------------------------------------------------------------------------- 1 | /var/spool/slurm/slurmd/job6216077/slurm_script: line 20: 77468 Segmentation fault $gctb --sbayes S --ldm ${PWD}/interval_shrunk_chr_all.ldm.shrunk --gwas-summary /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_GWAS_summ_stats/Metabolon/gwas_filteredCleaned_${1}.txt --pi 0.01 --hsq 0.5 --num-chains 4 --chain-length 25000 --burn-in 2000 --seed 12345 --thread 18 --no-mcmc-bin --out-freq 10 --out /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/results/GCTB/${1} 2 | -------------------------------------------------------------------------------- /GCTB/03_convert_to_sparse/gctb_convert_to_sparse_shrunk_corr_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 5 | 6 | shrunk_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${SLURM_ARRAY_TASK_ID}.ldm.shrunk 7 | 8 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${SLURM_ARRAY_TASK_ID} 9 | 10 | $gctb --ldm $shrunk_file \ 11 | --make-sparse-ldm \ 12 | --chisq 0 \ 13 | --out $output_file 14 | 15 | 16 | -------------------------------------------------------------------------------- /GCTB/README.md:
-------------------------------------------------------------------------------- 1 | # Code for using SBayesS to estimate heritability of omics traits 2 | 3 | - Software Version 4 | - GCTB 2.02 [![Website shields.io](https://img.shields.io/website-up-down-green-red/http/shields.io.svg)](https://cnsgenomics.com/software/gctb/#Download) 5 | 6 | - Variants used to construct the correlation matrix 7 | - HapMap3 variant set 8 | - excluding variants with HWE test P value < 1 × 10−6 9 | - excluding variants with missing genotype rate > 0.05 10 | - excluding variants with imputation INFO score < 0.3 11 | - retaining variants with MAF > 0.01 12 | 13 | - Samples used to construct the correlation matrix 14 | - All QCed INTERVAL samples, excluding those used in the withheld-set validation 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /04_extract_QTLs/02_helpers/reformat_dosages.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | args = commandArgs(trailingOnly=TRUE) 4 | 5 | dt = fread(sprintf("%s/%s_dosages.txt", args[1], args[2])) 6 | varID = dt[,.(varID=IID)] # should take first IID column encountered 7 | # drop these columns, will occur 1 time per each chromosome file pasted 8 | while ("FID" %in% names(dt)) { 9 | dt[, c('FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE') := NULL]; 10 | } 11 | # add back the varID column as the first column 12 | dt = cbind(varID, dt) 13 | # drop extracted allele from variant name in header 14 | setnames(dt, gsub('_.*', '', names(dt))); 15 | # write out 16 | fwrite(dt, sep='\t', quote=FALSE, compress="gzip", file=sprintf("%s/%s_dosages.txt.gz", args[1], args[2])) 17 | -------------------------------------------------------------------------------- /05_genetic_score_training/Traditional_GRS.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from sklearn.metrics import r2_score,explained_variance_score 4 | from scipy.stats import pearsonr 5 | import pandas as pd 6 | from scipy import stats 7 | from scipy.stats import spearmanr 8 | 9 | 10 | def get_beta_vec_by_vars_ids(beta_file,vars): 11 | df = pd.read_csv(beta_file,sep='\t',index_col=0) 12 | betas = df.loc[vars]['effect'] 13 | return np.array(betas) 14 | 15 | 16 | def traditional_GRS_selected_vars(beta_file,X,y,vars): 17 | beta_vec = get_beta_vec_by_vars_ids(beta_file,vars) 18 | y_pred = X.dot(beta_vec) 19 | y_pred = stats.zscore(y_pred) 20 | return pearsonr(y, y_pred)[0],r2_score(y, y_pred),explained_variance_score(y,y_pred),spearmanr(y, y_pred)[0] 21 | 22 | -------------------------------------------------------------------------------- /GCTB/02_gen_shrunk_corr_data/gctb_gen_shrunk_corr_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 5 | 6 | plink_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_INTERVAL_genetics/filtered_interval_chr${SLURM_ARRAY_TASK_ID} 7 | 8 | genetic_map_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_genetic_map/genetic_map_chr${SLURM_ARRAY_TASK_ID}.txt 9 | 10 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${SLURM_ARRAY_TASK_ID} 11 | 12 | $gctb --bfile $plink_file \ 13 | --make-shrunk-ldm \ 14 | --gen-map $genetic_map_file \ 15 | --out $output_file
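# Added note: each array task writes a per-chromosome shrunk LD matrix
# (interval_shrunk_chr${SLURM_ARRAY_TASK_ID}.ldm.shrunk plus its companion files);
# these are converted to sparse format in 03_convert_to_sparse and merged across
# chromosomes in 04_merge_shrunk_sparse_all_chrs before SBayesS is run in step 06.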
-------------------------------------------------------------------------------- /GCTB/06_run_SbayesS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/06_run_SbayesS/ 12 | mkdir -p $log_dir 13 | 14 | 15 | run_SBayesS=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-SL2-CPU \ 18 | --job-name "SbayesS" \ 19 | --time 8:0:0 \ 20 | --mem 60000 \ 21 | --output $log_dir/run_SBayesS_${1}_%A_%a.o \ 22 | --error $log_dir/run_SBayesS_${1}_%A_%a.e \ 23 | --partition skylake \ 24 | 06_run_SbayesS/run_SbayesS.sh ${1}) 25 | 26 | echo "Submitted jobs $run_SBayesS" 27 | 28 | -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_gen_shrunk_corr_matrix_mcpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | k=5000 4 | i=${SLURM_ARRAY_TASK_ID} 5 | 6 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 7 | 8 | plink_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_INTERVAL_genetics/filtered_interval_chr${1} 9 | 10 | genetic_map_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_genetic_map/genetic_map_chr${1}.txt 11 | 12 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${1} 13 | 14 | $gctb --bfile $plink_file \ 15 | --make-shrunk-ldm \ 16 | --gen-map $genetic_map_file \ 17 | --snp $((k*(i-1)+1))-$((k*i)) \ 18 | --out ${output_file} -------------------------------------------------------------------------------- /01_convert_bgen/03_filter_duplicates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | out_dir=geno_files/genotype_data/ 4 | chr=$SLURM_ARRAY_TASK_ID 5 | 6 | # Identify variants to remove 7 | grep 'remove' $out_dir/impute_${chr}_interval.pvar | cut -f 3 > $out_dir/chr${chr}_duplicates.txt 8 | 9 | # Exclude these and create new pgen files 10 | plink2 --pfile $out_dir/impute_${chr}_interval \ 11 | --exclude $out_dir/chr${chr}_duplicates.txt \ 12 | --threads $SLURM_CPUS_ON_NODE \ 13 | --memory $SLURM_MEM_PER_NODE \ 14 | --silent \ 15 | --make-pgen \ 16 | --out $out_dir/impute_${chr}_interval_dedup 17 | 18 | # Remove old pgen files. 
19 | rm $out_dir/impute_${chr}_interval.pgen 20 | rm $out_dir/impute_${chr}_interval.psam 21 | rm $out_dir/impute_${chr}_interval.pvar 22 | 23 | # Remove temporary exclusion file 24 | rm $out_dir/chr${chr}_duplicates.txt 25 | 26 | -------------------------------------------------------------------------------- /GCTB/06_run_SbayesS/run_SbayesS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.03beta_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | 9 | $gctb --sbayes S \ 10 | --ldm ${PWD}/interval_shrunk_chr_all.ldm.sparse \ 11 | --gwas-summary /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_GWAS_summ_stats/Metabolon/gwas_filteredCleaned_${1}.txt \ 12 | --pi 0.01 \ 13 | --hsq 0.5 \ 14 | --num-chains 4 \ 15 | --chain-length 25000 \ 16 | --burn-in 100 \ 17 | --seed 12345 \ 18 | --thread 9 \ 19 | --no-mcmc-bin \ 20 | --out-freq 10 \ 21 | --out /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/results/GCTB/${1}_1 -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # Create logging directory 10 | log_dir=logs/04_merge_all_chrs/ 11 | mkdir -p $log_dir 12 | 13 | 14 | merge_all_job=$(sbatch \ 15 | --parsable \ 16 | --account INOUYE-COVID19-SL2-CPU \ 17 | --job-name "all_merge" \ 18 | --time 2:0:0 \ 19 | --mem 240000 \ 20 | --output $log_dir/MS_all_chrs_%A_%a.o \ 21 | --error $log_dir/MS_all_chrs_%A_%a.e \ 22 | --partition skylake-himem \ 23 | 04a_merge_shrunk_all_chrs/gctb_merge_shrunk_bins_all_chrs.sh) 24 | 25 | echo "Submitted jobs $merge_all_job" 26 | 27 | -------------------------------------------------------------------------------- /GCTB/06a_run_SbayesS_with_shrunk_corr/run_SbayesS_with_shrunk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.03beta_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --sbayes S \ 9 | --ldm ${PWD}/interval_shrunk_chr_all.ldm.shrunk \ 10 | --gwas-summary /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_GWAS_summ_stats/Metabolon/gwas_filteredCleaned_${1}.txt \ 11 | --pi 0.01 \ 12 | --hsq 0.5 \ 13 | --num-chains 4 \ 14 | --chain-length 25000 \ 15 | --burn-in 2000 \ 16 | --seed 12345 \ 17 | --thread 18 \ 18 | --no-mcmc-bin \ 19 | --out-freq 10 \ 20 | --out /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/results/GCTB/${1} -------------------------------------------------------------------------------- /GCTB/04_merge_shrunk_sparse_all_chrs_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/04_merge_all_chrs/ 12 | mkdir -p $log_dir 13 | 14 | 15 | merge_all_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "all_merge" \ 19 | --time 2:0:0 \ 20 | --mem 60000 \ 21 | --output $log_dir/M_all_chrs_%A_%a.o \ 22 | --error $log_dir/M_all_chrs_%A_%a.e \ 23 | --partition skylake-himem \ 24 | 04_merge_shrunk_sparse_all_chrs/gctb_merge_shrunk_sparse_bins_all_chrs.sh) 25 | 26 | echo "Submitted jobs $merge_all_job" 27 | 28 | -------------------------------------------------------------------------------- /GCTB/06a_run_SbayesS_with_shrunk_corr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/06_run_SbayesS/ 12 | mkdir -p $log_dir 13 | 14 | 15 | run_SBayesS=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-SL2-CPU \ 18 | --job-name "SS${1}" \ 19 | --time 12:0:0 \ 20 | --mem 240000 \ 21 | --output $log_dir/run_SBayesS_shrunk_${1}_%A_%a.o \ 22 | --error $log_dir/run_SBayesS_shrunk_${1}_%A_%a.e \ 23 | --partition skylake-himem \ 24 | 06a_run_SbayesS_with_shrunk_corr/run_SbayesS_with_shrunk.sh ${1}) 25 | 26 | echo "Submitted jobs $run_SBayesS" 27 | 28 | -------------------------------------------------------------------------------- /GCTB/02a_merge_shrunk_bins_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/02_gen_shrunk_corr/ 12 | mkdir -p $log_dir 13 | 14 | 15 | merge_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "merge_bin" \ 19 | --array 1-6 \ 20 | --time 2:0:0 \ 21 | --mem 60000 \ 22 | --output $log_dir/merge_shrunk_%A_%a.o \ 23 | --error $log_dir/merge_shrunk_%A_%a.e \ 24 | --partition skylake,skylake-himem \ 25 | 02a_gen_shrunk_corr_data_mcpu/gctb_merge_shrunk_bins_by_chr.sh) 26 | 27 | echo "Submitted jobs $merge_job" 28 | 29 | -------------------------------------------------------------------------------- /GCTB/03_to_shrunk_sparse_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/03_shrunk_sparse_corr/ 12 | mkdir -p $log_dir 13 | 14 | 15 | sparse_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "sparse" \ 19 | --array 1-22 \ 20 | --time 2:0:0 \ 21 | --mem 60000 \ 22 | --output $log_dir/shrunk_sparse_%A_%a.o \ 23 | --error $log_dir/shrunk_sparse_%A_%a.e \ 24 | --partition skylake,skylake-himem \ 25 | 03_convert_to_sparse/gctb_convert_to_sparse_shrunk_corr_matrix.sh) 26 | 27 | echo "Submitted jobs $sparse_job" 28 | 29 | -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/02_gen_shrunk_corr/ 12 | mkdir -p $log_dir 13 | 14 | 15 | gen_scorr_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "gen_scorr" \ 19 | --array 1-${2} \ 20 | --time 2:0:0 \ 21 | --mem 11000 \ 22 | --output $log_dir/gen_scorr_chr${1}_%A_%a.o \ 23 | --error $log_dir/gen_scorr_chr${1}_%A_%a.e \ 24 | --partition skylake,skylake-himem \ 25 | 02a_gen_shrunk_corr_data_mcpu/gctb_gen_shrunk_corr_matrix_mcpu.sh ${1}) 26 | 27 | echo "Submitted jobs $gen_scorr_job" 28 | 29 | -------------------------------------------------------------------------------- /03_collate_QTLs.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! 
-z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/collate_QTLs 18 | mkdir -p $log_dir 19 | 20 | # Step 1: curate list of pQTLs to extract from the genotype data 21 | mkdir -p geno_files/ 22 | sbatch --dependency afterany:$previous_job \ 23 | --account INOUYE-COVID19-SL2-CPU \ 24 | --job-name "Collate QTLs" \ 25 | --time 36:0:0 \ 26 | --output $log_dir/collate_QTLs_%j.o \ 27 | --error $log_dir/collate_QTLs_%j.e \ 28 | --partition skylake,skylake-himem \ 29 | --wrap "Rscript scripts/03_collate_QTLs/01_collate_QTLs.R" 30 | -------------------------------------------------------------------------------- /02_ldthin/02_ldthin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ref_dir=geno_files/genotype_data 4 | out_dir=$ref_dir/ldthinned 5 | chr=$SLURM_ARRAY_TASK_ID 6 | 7 | # Get LD thinned set of variants with MAF > 0.5% 8 | plink2 --pfile $ref_dir/impute_${chr}_interval_dedup \ 9 | --extract $out_dir/chr${chr}_keep.txt \ 10 | --maf 0.005 \ 11 | --indep-pairwise 1000kb 0.8 \ 12 | --threads $SLURM_CPUS_ON_NODE \ 13 | --memory $SLURM_MEM_PER_NODE \ 14 | --silent \ 15 | --out $out_dir/chr${chr}_ldthinned 16 | 17 | # Extract those variants 18 | plink2 --pfile $ref_dir/impute_${chr}_interval_dedup \ 19 | --extract $out_dir/chr${chr}_ldthinned.prune.in \ 20 | --threads $SLURM_CPUS_ON_NODE \ 21 | --memory $SLURM_MEM_PER_NODE \ 22 | --silent \ 23 | --make-pgen \ 24 | --out $out_dir/impute_${chr}_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8 25 | 26 | # Remove temporary file 27 | rm $out_dir/chr${chr}_keep.txt 28 | rm $out_dir/chr${chr}_ldthinned.prune.in 29 | rm $out_dir/chr${chr}_ldthinned.prune.out 30 | 31 | -------------------------------------------------------------------------------- /GCTB/01_gen_genetic_data_hapmap3_variants/gen_INTERVAL_hapmap3_vars_genetic_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | plink=/home/yx322/plink_2.0/plink2 4 | 5 | bed_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/yx322/interval_genetics/interval_impute_chr${SLURM_ARRAY_TASK_ID} 6 | 7 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_INTERVAL_genetics/filtered_interval_chr${SLURM_ARRAY_TASK_ID} 8 | 9 | variant_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/OmicsPred_LDpred_benchmarks/data/HapMap3_transformed/HapMap1kg_variants_matched2INTERVAL_rsid.txt 10 | 11 | sample_id_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/OmicsPred_LDpred_benchmarks/data/INTERVAL_genotypes/all_externalIDs.txt 12 | 13 | 14 | $plink --bfile $bed_file \ 15 | --extract $variant_file \ 16 | --geno 0.05 \ 17 | --hwe 1e-6 \ 18 | --mach-r2-filter 0.3 \ 19 | --maf 0.01 \ 20 | --remove $sample_id_file \ 21 | --make-bed \ 22 | --out $output_file 23 | 24 | -------------------------------------------------------------------------------- /06_all_omics_ukb_phecode_disease_assoc_test.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/06_omics_ukb_phecode_disease_assoc_test/ 12 | mkdir -p $log_dir 13 | 14 | 15 | #submit an array job for association scan of a given phecode in UKB across all omics traits of the 5 platforms 16 | #the first argument is the phecode file 17 | asso_test_job=$(sbatch --parsable \ 18 | --account INOUYE-SL3-CPU \ 19 | --job-name "ass_test" \ 20 | --array 1-5 \ 21 | --time 12:0:0 \ 22 | --mem 36000 \ 23 | --output $log_dir/omics_${1}_assoc_tests_%A_%a.o \ 24 | --error $log_dir/omics_${1}_assoc_tests_%A_%a.e \ 25 | --partition cclake-himem,skylake-himem,skylake,cclake \ 26 | scripts/06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.sh ${1}) 27 | 28 | echo "Submitted jobs ${asso_test_job}" 29 | 30 | -------------------------------------------------------------------------------- /GCTB/02_gen_shrunk_corr_data_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/02_gen_shrunk_corr/ 18 | mkdir -p $log_dir 19 | 20 | 21 | gen_scorr_job=$(sbatch --dependency afterany:$previous_job \ 22 | --parsable \ 23 | --account INOUYE-COVID19-SL2-CPU \ 24 | --job-name "gen_scorr" \ 25 | --array 1-22 \ 26 | --time 12:0:0 \ 27 | --mem 60000 \ 28 | --output $log_dir/gen_scorr_%A_%a.o \ 29 | --error $log_dir/gen_scorr_%A_%a.e \ 30 | --partition skylake-himem \ 31 | 02_gen_shrunk_corr_data/gctb_gen_shrunk_corr_matrix.sh) 32 | 33 | echo "Submitted jobs $gen_scorr_job" 34 | 35 | -------------------------------------------------------------------------------- /GCTB/01_gen_genetic_data_hapmap3_variants_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/01_gen_genetic_data/ 18 | mkdir -p $log_dir 19 | 20 | 21 | gen_genetics_job=$(sbatch --dependency afterany:$previous_job \ 22 | --parsable \ 23 | --account INOUYE-COVID19-SL2-CPU \ 24 | --job-name "gen_genetics" \ 25 | --array 1-22 \ 26 | --time 12:0:0 \ 27 | --mem 60000 \ 28 | --output $log_dir/gen_genetics_%A_%a.o \ 29 | --error $log_dir/gen_genetics_%A_%a.e \ 30 | --partition skylake \ 31 | 01_gen_genetic_data_hapmap3_variants/gen_INTERVAL_hapmap3_vars_genetic_data.sh) 32 | 33 | echo "Submitted jobs $gen_genetics_job" 34 | 35 | -------------------------------------------------------------------------------- /05_genetic_score_training.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/genetic_score_training/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: train the genetic score model for one omic trait using Bayesian ridge 21 | # First argument in python script is the omics platform name (i.e. SomaScan, Olink, Metabolon, Nightingale and RNAseq) 22 | # and the second argument is the index of a trait for training in the platform 23 | # the 3rd to 6th arguments are the priors for the BR method 24 | 25 | mkdir -p geno_files/ml_inputs 26 | 27 | score_training=$(sbatch --dependency afterany:$previous_job \ 28 | --parsable \ 29 | --account INOUYE-COVID19-SL2-CPU \ 30 | --job-name "score train" \ 31 | --time 36:0:0 \ 32 | -c 32 -N 1 \ 33 | --output $log_dir/score_training_%j.o \ 34 | --error $log_dir/score_training_%j.e \ 35 | --partition skylake-himem \ 36 | --wrap "python 'scripts/05_genetic_score_training/01_run_omics_pgs_training with_br.py' SomaScan 1 0.000001 0.000001 0.000001 0.000001") 37 | 38 | 39 | 40 | echo "Submitted jobs $score_training" 41 | 42 | -------------------------------------------------------------------------------- /02_ldthin/01_identify_snps.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | chr_id = as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID")) 4 | 5 | ref_dir = "geno_files/genotype_data" 6 | out_dir = sprintf("%s/ldthinned", ref_dir) 7 | 8 | pvar = fread(sprintf("%s/impute_%s_interval_dedup.pvar", ref_dir, chr_id)) 9 | 10 | # Remove multi-allelic sites: 11 | multi = pvar[grepl("^rs", ID), .N, by=ID][N > 1] 12 | pvar = pvar[!multi, on = .(ID)] 13 | multi = pvar[!grepl("^rs", ID), .N, by=.(`#CHROM`, POS)][N > 1] 14 | pvar = pvar[!multi, on = .(`#CHROM`, POS)] 15 | 16 | # Function for flipping the strand of an allele. 17 | # Uses a series of gsub calls to replace A's with T's, 18 | # G's with C's, and vice-versa. Also works for alleles 19 | # with more than one nucleotide (e.g. indels). 20 | flip_strand <- function(x) { 21 | # Swap each letter for a dummy, we need this intermediate 22 | # step so we can distinguish between alleles when swapping. 23 | # E.g. if we did A -> T then T -> A we'd end up with all A's 24 | # and no T's. Instead we do A -> V -> T and T -> X -> A. 25 | x <- gsub("A", "V", x) 26 | x <- gsub("T", "X", x) 27 | x <- gsub("C", "Y", x) 28 | x <- gsub("G", "Z", x) 29 | x <- gsub("V", "T", x) 30 | x <- gsub("X", "A", x) 31 | x <- gsub("Y", "G", x) 32 | x <- gsub("Z", "C", x) 33 | return(x) 34 | } 35 | 36 | # Remove strand ambiguous alleles: 37 | pvar = pvar[REF != flip_strand(ALT)] 38 | 39 | # Filter to SNPs 40 | pvar = pvar[nchar(REF) == 1 & nchar(ALT) == 1] 41 | 42 | # Write out list of variants to extract prior to LD-thinning 43 | fwrite(pvar[,.(ID)], col.names=FALSE, quote=FALSE, file=sprintf("%s/chr%s_keep.txt", out_dir, chr_id)) 44 | 45 | -------------------------------------------------------------------------------- /02_ldthin.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ !
-z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/02_ldthin/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: identify SNPs to keep 21 | mkdir -p geno_files/genotype_data/ldthinned 22 | keep_job=$(sbatch --dependency afterany:$previous_job \ 23 | --parsable \ 24 | --account INOUYE-COVID19-SL2-CPU \ 25 | --job-name "Identify SNPs" \ 26 | --time 1:0:0 \ 27 | --array 1-22 \ 28 | --output $log_dir/identify_snps_%A_%a.o \ 29 | --error $log_dir/identify_snps_%A_%a.e \ 30 | --partition skylake \ 31 | --wrap "Rscript scripts/02_ldthin/01_identify_snps.R") 32 | 33 | # Step 2: LD thin remaining SNPs at R2=0.8 34 | thin_job=$(sbatch --dependency afterok:$keep_job \ 35 | --parsable \ 36 | --account INOUYE-COVID19-SL2-CPU \ 37 | --job-name "LDthin" \ 38 | --array 1-22 \ 39 | --time 3:0:0 \ 40 | --mem 10000 \ 41 | --output $log_dir/ldthin_%A_%a.o \ 42 | --error $log_dir/ldthin_%A_%a.e \ 43 | --partition skylake \ 44 | scripts/02_ldthin/02_ldthin.sh) 45 | 46 | echo "Submitted jobs $keep_job, $thin_job" 47 | 48 | -------------------------------------------------------------------------------- /04_extract_QTLs.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/extract_QTLs/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: curate list of QTLs to extract from the genotype data 21 | # First argument in R script is trans/genome-wide P-value threshold, (must be < 0.001) 22 | # and the second argument is the cis P-value threshold (use 1 if you want to include all cis SNPs) 23 | mkdir -p geno_files/ml_inputs 24 | qc_job=$(sbatch --dependency afterany:$previous_job \ 25 | --parsable \ 26 | --account INOUYE-COVID19-SL2-CPU \ 27 | --job-name "Extract QTLs" \ 28 | --time 36:0:0 \ 29 | -c 32 -N 1 \ 30 | --output $log_dir/curate_QTLs_%j.o \ 31 | --error $log_dir/curate_QTLs_%j.e \ 32 | --partition skylake-himem \ 33 | --wrap "Rscript scripts/04_extract_QTLs/01_extract_QTLs.R '5e-8' '5e-8'") 34 | 35 | # Step 2: extract the dosages of the effect alleles 36 | ex_job=$(sbatch --dependency afterok:$qc_job \ 37 | --parsable \ 38 | --account INOUYE-COVID19-SL2-CPU \ 39 | --job-name "Extract dosages" \ 40 | --partition skylake,skylake-himem \ 41 | --time 2:0:0 \ 42 | --array 1-200 \ 43 | --mem 10000 \ 44 | --output $log_dir/extract_dosages_%A_%a.o \ 45 | --error $log_dir/extract_dosages_%A_%a.e \ 46 | scripts/04_extract_QTLs/02_extract_QTL_dosages.sh) 47 | 48 | echo "Submitted jobs $qc_job, $ex_job" 49 | 50 | -------------------------------------------------------------------------------- /05_genetic_score_training/BayesianRidge.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | import random 3 | from sklearn.linear_model import SGDRegressor 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.metrics import r2_score,explained_variance_score 7 | from scipy.stats import pearsonr 8 | from sklearn.linear_model import BayesianRidge 9 | from 
sklearn.model_selection import KFold 10 | from scipy.stats import spearmanr 11 | 12 | 13 | 14 | 15 | def get_BayesianRidge_prediction(x_train, y_train, x_val, y_val, alpha_1, alpha_2, lambda_1, lambda_2): 16 | model = BayesianRidge(alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2) 17 | model.fit(x_train, y_train) 18 | y_pred = model.predict(x_val) 19 | return model,pearsonr(y_val, y_pred)[0] 20 | 21 | 22 | 23 | def full_fit_BayesianRidge(x_train, x_test, y_train, y_test, alpha_1, alpha_2, lambda_1, lambda_2): 24 | # Bayesian Ridge Regression Method 25 | # 26 | # Note the prior Gamma distributions are set as (alpha_1, alpha_2) and (lambda_1, lambda_2), which were selected via a cross-validation step using training data (see code below: full_fit_BayesianRidge_para_tuning; 27 | # all traits shared the same priors) 28 | # x_train: training genotype data (Numpy matrix - samples X variants) 29 | # x_test: testing genotype data 30 | # y_train: training trait value data (Numpy vector - 1 X N) 31 | # y_test: testing trait value data 32 | # return the learned model, r, r2, explained variance score and Spearman r performance 33 | model, r = get_BayesianRidge_prediction(x_train, y_train, x_test, y_test, alpha_1, alpha_2, lambda_1, lambda_2) 34 | y_pred = model.predict(x_test) 35 | return model,pearsonr(y_test, y_pred)[0],r2_score(y_test, y_pred),explained_variance_score(y_test,y_pred),spearmanr(y_test,y_pred)[0] 36 | 37 | 38 | 39 | #hyper-parameter tuning - finding the best prior Gamma distributions on the training set of a trait 40 | #Grid search over (1e10, 1e5, 1e3, 1e1, 0, 1e-1, 1e-3, 1e-5, 1e-10) 41 | #return the model fitted with the best 'alpha_1', 'alpha_2', 'lambda_1', 'lambda_2' 42 | def full_fit_BayesianRidge_para_tuning(x_train, x_val, y_train, y_val,para_file): 43 | nums = (1e10, 1e5, 1e3, 1e1, 0, 1e-1, 1e-3,1e-5,1e-10) 44 | f=open(para_file,'w') 45 | alpha_1 = nums 46 | alpha_2 = nums 47 | lambda_1 = nums 48 | lambda_2 = nums 49 | best_model = None 50 | best_r = 0 51 | for a1 in alpha_1: 52 | for a2 in alpha_2: 53 | for l1 in lambda_1: 54 | for l2 in lambda_2: 55 | model,r = get_BayesianRidge_prediction(x_train,y_train,x_val,y_val,a1,a2,l1,l2) 56 | text = "Training BayesianRidge with alpha_1: {}, alpha_2: {}, lambda_1: {}, lambda_2:{} - r score {}\n".format(a1,a2,l1,l2,r) 57 | print(text) 58 | f.write(text) 59 | if best_model is None or r > best_r: 60 | best_model = model 61 | best_r = r 62 | best_params = {'alpha_1':a1, 'alpha_2': a2, 'lambda_1': l1, 'lambda_2': l2} 63 | f.close() 64 | print("Best Para: {}".format(best_params)) 65 | return best_model 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /01_convert_bgen.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/01_convert_bgen/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: convert the BGEN files to plink pgen/pvar/psam files.
21 | # These maintain the probabilistic dosage information, while also 22 | # separating the variant information into a separate file, allowing 23 | # us to fix the non-unique variant identifiers (all variants without 24 | # an rsID are given identifier ".") 25 | mkdir -p geno_files/genotype_data/ 26 | conv_job=$(sbatch --dependency afterany:$previous_job \ 27 | --parsable \ 28 | --account INOUYE-COVID19-SL2-CPU \ 29 | --job-name "Convert bgen" \ 30 | --time 1:0:0 \ 31 | --array 1-22 \ 32 | --mem 36000 \ 33 | --output $log_dir/convert_bgen_%A_%a.o \ 34 | --error $log_dir/convert_bgen_%A_%a.e \ 35 | --partition skylake \ 36 | scripts/01_convert_bgen/01_convert_bgen.sh) 37 | 38 | # Step 2: Give variants unique identifiers and flag duplicates 39 | # for removal 40 | flag_job=$(sbatch --dependency afterok:$conv_job \ 41 | --parsable \ 42 | --account INOUYE-COVID19-SL2-CPU \ 43 | --job-name "Flag duplicates" \ 44 | --time 3:0:0 \ 45 | --mem 12000 \ 46 | --output $log_dir/flag_duplicates_%j.o \ 47 | --error $log_dir/flag_duplicates_%j.e \ 48 | --partition skylake \ 49 | --wrap "Rscript scripts/01_convert_bgen/02_flag_duplicates.R") 50 | 51 | # Step 3: Remove the variants flagged for removal 52 | rmdp_job=$(sbatch --dependency afterany:$flag_job \ 53 | --parsable \ 54 | --account INOUYE-COVID19-SL2-CPU \ 55 | --job-name "Remove duplicates" \ 56 | --time 1:0:0 \ 57 | --array 1-22 \ 58 | --mem 6000 \ 59 | --output $log_dir/remove_duplicates_%A_%a.o \ 60 | --error $log_dir/remove_duplicates_%A_%a.e \ 61 | --partition skylake \ 62 | scripts/01_convert_bgen/03_filter_duplicates.sh) 63 | 64 | # Step 4: Remove the extra crud in the variant identifiers now that 65 | # the deduplication process has happened. 66 | fvid_job=$(sbatch --dependency afterok:$rmdp_job \ 67 | --parsable \ 68 | --account INOUYE-COVID19-SL2-CPU \ 69 | --job-name "Fix variant IDs" \ 70 | --time 1:0:0 \ 71 | --mem 8192 \ 72 | --output $log_dir/fix_var_ids_%j.o \ 73 | --error $log_dir/fix_var_ids_%j.e \ 74 | --partition skylake \ 75 | --wrap "Rscript scripts/01_convert_bgen/04_fix_var_ids.R") 76 | 77 | echo "Submitted jobs $conv_job, $flag_job, $rmdp_job, $fvid_job" 78 | 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI:10.1038/s41586-023-05844-9](https://img.shields.io/badge/DOI%3A-10.1038%2Fs41586--023--05844--9-orange)](https://www.nature.com/articles/s41586-023-05844-9) 2 | 3 | [![Website shields.io](https://img.shields.io/website-up-down-green-red/http/shields.io.svg)](http://www.omicspred.org/) 4 | 5 | # An atlas of genetic scores to predict multi-omic traits 6 | This repository houses and documents the code used to train genetic scores of omic traits using INTERVAL data and to internally validate these scores in the study: Xu Y et al. An atlas of genetic scores to predict multi-omic traits. Nature (2023) https://doi.org/10.1038/s41586-023-05844-9. 7 | 8 | All genetic score models trained in the study, together with their internal and external validation results, were deposited in a cloud service (boxing.com) and are publicly accessible through our online portal (www.omicspred.org).
9 | 10 | 11 | ## The following software and versions were used to perform the analyses: 12 | 13 | - Scientific Linux release 7.7 (Nitrogen) (HPC operating system) 14 | - slurm version 19.05.5 (HPC queue manager and job submission system) 15 | - GNU bash version 4.2.46(2) (shell environment used to run bash scripts) 16 | - PLINK v1.90b6.11 64-bit (24 Oct 2019) (www.cog-genomics.org/plink/1.9/) 17 | - PLINK v2.00a2.3LM 64-bit Intel (24 Jan 2020) (www.cog-genomics.org/plink/2.0/) 18 | - STAR v2.7.3.a (https://github.com/alexdobin/STAR) 19 | - featureCounts v2.0.0 (http://subread.sourceforge.net/) 20 | - QTLtools v1.3.1 (https://qtltools.github.io/qtltools/) 21 | 22 | - Python version 3.6.8 with the following Python packages: 23 | - numpy version 1.19.5 24 | - pandas version 1.1.5 25 | - scikit-learn version 0.21.2 26 | - scipy version 1.5.4 27 | - statsmodels version 0.12.2 28 | - lifelines version 0.26.0 29 | 30 | - R version 3.6.1 with the following R packages: 31 | - cowplot version 1.0.0 32 | - data.table version 1.13.6 33 | - dplyr version 1.0.8 34 | - foreach version 1.5.1 35 | - ggplot2 version 3.3.5 36 | - ggpubr version 0.2.5 37 | - grid version 3.6.1 38 | - plyr version 1.8.6 39 | - reshape2 version 1.4.4 40 | - RColorBrewer version 1.1-2 41 | - stringr version 1.4.0 42 | - tibble version 3.1.0 43 | - bigsnpr version 1.10.8 44 | 45 | ## Description of scripts in each sub-folder: 46 | 47 | - Genetic score development for multi-omic traits: 48 | - **01_convert_bgen**: convert genotype data from bgen to Plink pgen format and remove duplicate variants; 49 | - **02_ldthin**: remove multi-allelic variants, strand-ambiguous (A/T, C/G) variants and variants with a MAF < 0.5%, and LD-thin the remaining variants at r2=0.8 (i.e. indep-pairwise 1000kb 0.8 in plink2); 50 | - **03_collate_QTLs**: curate the QTL information needed for variant selection from GWAS summary statistics; 51 | - **04_extract_QTLs**: select the list of QTLs at given p-value thresholds, and extract the dosages of their effect alleles as input data for Bayesian ridge; 52 | - **05_genetic_score_training**: train genetic score models using Bayesian ridge; 53 | 54 | 55 | - Others: 56 | - **06_all_omics_UKB_phecode_assoc_test**: perform PheWAS with the genetic scores of omics traits in UK Biobank 57 | - **GCTB**: scripts from our attempt to use SBayesS to estimate heritability of omics traits 58 | - **LDpred2**: scripts used to develop genetic scores of omic traits using LDpred2-auto. 59 | -------------------------------------------------------------------------------- /01_convert_bgen/02_flag_duplicates.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | # Fix missing rsIDs and flag duplicate variants for removal. 4 | for (chr_id in 1:22) { 5 | # Load variant information and add common identifier to help identify duplicates 6 | # and fix missing rsIDs.
7 | pvar = fread(sprintf("geno_files/genotype_data/impute_%s_interval.pvar", chr_id)) 8 | pvar[, row := .I] 9 | pvar[, sorted_alleles := paste(sort(c(REF, ALT)), collapse=":"), by=row] 10 | pvar[, var_id := paste(`#CHROM`, POS, sorted_alleles, sep=":")] 11 | pvar[ID == ".", ID := var_id] 12 | 13 | # Load in variant statistics so we can use INFO scores to flag 14 | # which of each pair of duplicates to remove 15 | snpstats = fread(sprintf("/rds/project/jmmh2/rds-jmmh2-projects/polygenic/internal/interval_grs_scan/data/INTERVAL/reference_files/imputed_genotypes/impute_%s_interval.snpstats", chr_id)) 16 | pvar[snpstats, on = .(`#CHROM`=chromosome, POS=position, REF=A_allele, ALT=B_allele), INFO := i.information] 17 | pvar[snpstats, on = .(`#CHROM`=chromosome, POS=position, REF=B_allele, ALT=A_allele), INFO := i.information] 18 | 19 | # Identify and flag duplicates for removal, keeping the entry with 20 | # the highest INFO score in each case. There are two cases to deal with: 21 | # 22 | # (1) variants that are duplicates by position and alleles 23 | # (2) variants that are duplicates by rsid (and alleles), but which may 24 | # have different positions. 25 | # 26 | # The reason we match by alleles as well is that it appears that multi-allelic 27 | # variants are split into multiple entries even in the BGEN files, so we don't 28 | # want to incorrectly remove these. 29 | 30 | # First, identify any variant that is duplicate by position, or duplicate by 31 | # id, then filter the pvar table to all remaining variants ('ok'). 32 | dups_by_pos = pvar[,.N, by=.(var_id)][N > 1] 33 | dups_by_id = pvar[,.N,by=.(ID, sorted_alleles)][N > 1] 34 | ok = pvar[!dups_by_pos, on = .(var_id)][!dups_by_id, on=.(ID, sorted_alleles)] 35 | 36 | # Extract the remaining variants, which are all duplicates in either sense 37 | dups = pvar[!ok, on = .(row)] 38 | 39 | # First, considering all duplicates by position, take the variant with the max 40 | # INFO score (first one if there are multiple with the same INFO). 41 | max_info_by_pos = dups[,.SD[which.max(INFO)], by=.(var_id)] 42 | 43 | # Then, from these remaining variants, take the max INFO score by rsID to handle 44 | # cases where > 1 variant may have the same rsID, but differing positions. 45 | max_info_by_id = max_info_by_pos[,.SD[which.max(INFO)], by=.(ID, sorted_alleles)] 46 | 47 | # Add these back to the "ok" table 48 | ok = rbind(ok, max_info_by_id) 49 | 50 | # Flag in the pvar table the variants to remove - important to preserve order 51 | # of variants in the output table as the row number corresponds to row in the 52 | # genotype data. 53 | pvar[, remove := FALSE] 54 | pvar[!ok, on = .(row), remove := TRUE] 55 | 56 | # Make sure every variant has a unique identifier so we can accurately flag 57 | # variants for removal with plink.
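# Illustration (added for clarity): an ID "rs123" on row 57 becomes "rs123:57" below,
# and a flagged duplicate becomes "rs123:57:remove"; 03_filter_duplicates.sh greps for
# "remove" to exclude those variants, and 04_fix_var_ids.R strips the ":57" suffix
# from the retained variants afterwards.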
58 | pvar[, ID := paste(ID, row, sep=":")] 59 | 60 | # Flag variants for removal in the identifier 61 | pvar[(remove), ID := paste(ID, "remove", sep=":")] 62 | 63 | # Overwrite pvar file: 64 | fwrite(pvar[, .(`#CHROM`, POS, ID, REF, ALT)], sep="\t", quote=FALSE, 65 | file=sprintf("geno_files/genotype_data/impute_%s_interval.pvar", chr_id)) 66 | 67 | # Remove objects and garbage collect before going to next loop 68 | rm(list=ls()) 69 | gc() 70 | } 71 | 72 | -------------------------------------------------------------------------------- /04_extract_QTLs/02_extract_QTL_dosages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $1 = "" ]]; then 4 | out_dir=geno_files/ml_inputs 5 | else 6 | out_dir=$1 7 | fi 8 | 9 | # Get array of phenotypes 10 | phenotypes=($(ls $out_dir/*_variant_effects.txt | sed "s#$out_dir/##" | sed 's/_variant_effects.txt//')) 11 | nphen=${#phenotypes[@]} 12 | 13 | # Determine which phenotypes we're working with for this task: 14 | task=$SLURM_ARRAY_TASK_ID 15 | ntasks=$SLURM_ARRAY_TASK_MAX 16 | 17 | nphen_per_task=$(echo "a=$nphen; b=$ntasks; if ( a%b ) a/b+1 else a/b" | bc) 18 | task_start=$(echo "a=$task; b=$nphen_per_task; (a-1)*b" | bc) 19 | task_end=$(echo "a=$task_start; b=$nphen_per_task; c=$nphen; if ( (a+b)>c ) c - 1 else a+b-1" | bc) 20 | 21 | if [ $task_start -gt $task_end ]; then 22 | echo "No remaining phenotypes for task $task of $ntasks." 23 | exit 0 24 | else 25 | echo "Task $task of $ntasks extracting dosages of phenotypes $task_start - $task_end of $nphen." 26 | fi 27 | 28 | # Iterate through each protein for this task to extract the genotype dosages 29 | for phenIdx in $(seq $task_start $task_end); do 30 | # What phenotype are we working with? 31 | phen=${phenotypes[$phenIdx]} 32 | 33 | # Determine the chromosomes we need to extract for this phenotype 34 | chrs=( $(tail -n +2 $out_dir/${phen}_variant_effects.txt | cut -f 2 | sort | uniq) ) 35 | 36 | # Determine the samples we need to keep for this phenotype 37 | grep -P "^${phen}\t" $out_dir/phenotypes.txt | cut -f 3 > $out_dir/${phen}_IID.txt 38 | echo "#FID"$'\t'"IID" > $out_dir/${phen}.samples 39 | paste $out_dir/${phen}_IID.txt $out_dir/${phen}_IID.txt >> $out_dir/${phen}.samples 40 | rm $out_dir/${phen}_IID.txt 41 | 42 | # For each chromosome, extract the dosages of the pQTLs 43 | for chr in ${chrs[@]}; do 44 | # Get the list of pQTLs on this chromosome to extract 45 | cut -f 1,2 $out_dir/${phen}_variant_effects.txt | grep -w "${chr}"'$' | cut -f 1 > $out_dir/${phen}_chr${chr}_variant_ids.txt 46 | 47 | # Get their effect alleles 48 | grep -f $out_dir/${phen}_chr${chr}_variant_ids.txt -w $out_dir/${phen}_variant_effects.txt | cut -f 1,4 > $out_dir/${phen}_chr${chr}_variant_effect_alleles.txt 49 | 50 | # Extract the dosages of the effect alleles 51 | plink2 --pfile geno_files/genotype_data/ldthinned/impute_${chr}_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8 \ 52 | --out $out_dir/${phen}_chr${chr}_dosages \ 53 | --keep $out_dir/${phen}.samples \ 54 | --extract $out_dir/${phen}_chr${chr}_variant_ids.txt \ 55 | --export A --export-allele $out_dir/${phen}_chr${chr}_variant_effect_alleles.txt \ 56 | --memory $SLURM_MEM_PER_NODE \ 57 | --threads $SLURM_CPUS_ON_NODE \ 58 | --silent 59 | 60 | # remove temporary files 61 | rm $out_dir/${phen}_chr${chr}_variant_ids.txt 62 | rm $out_dir/${phen}_chr${chr}_variant_effect_alleles.txt 63 | rm $out_dir/${phen}_chr${chr}_dosages.log 64 | done 65 | 66 | # Combine chromosome data 67 | for chr in 
${chrs[@]}; do 68 | paste $out_dir/${phen}_dosages.txt $out_dir/${phen}_chr${chr}_dosages.raw > $out_dir/${phen}_dosage_tmpfile 69 | mv $out_dir/${phen}_dosage_tmpfile $out_dir/${phen}_dosages.txt 70 | rm $out_dir/${phen}_chr${chr}_dosages.raw 71 | done 72 | 73 | # get the varID column and remove extra sample identifier information from each chromosome 74 | Rscript scripts/04_extract_QTLs/02_helpers/reformat_dosages.R $out_dir $phen 75 | rm $out_dir/${phen}_dosages.txt 76 | 77 | # sample file no longer needed 78 | rm $out_dir/${phen}.samples 79 | done 80 | 81 | -------------------------------------------------------------------------------- /05_genetic_score_training/01_run_omics_pgs_training with_br.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import datetime 5 | from sklearn.model_selection import KFold 6 | from Methods.BayesianRidge import full_fit_BayesianRidge 7 | from Methods.Traditional_GRS import traditional_GRS_selected_vars 8 | from sklearn.externals import joblib 9 | import sys 10 | import os 11 | import time 12 | 13 | 14 | def read_proteins_list(proteinomics_list_file): 15 | df = pd.read_csv(proteinomics_list_file,sep='\t') 16 | return list(df['PhenotypeCompName']) 17 | 18 | 19 | def read_protein_phenos(proteinomics_phenos_file, protein_name, sample_ids): 20 | df_pheno = pd.read_csv(proteinomics_phenos_file, delimiter='\t') 21 | df_pheno = df_pheno.loc[df_pheno['PhenotypeCompName'] == protein_name] 22 | df_pheno = df_pheno.set_index('IID') 23 | return np.array(df_pheno.loc[sample_ids, 'value']) 24 | 25 | 26 | def read_protein_genotypes(geno_file): 27 | df = pd.read_csv(geno_file,compression='gzip', sep='\t') 28 | sample_ids = list(df['varID']) 29 | df = df.set_index('varID') 30 | var_ids = list(df.columns) 31 | X = np.array(df.loc[:,:]) 32 | return sample_ids,var_ids,X 33 | 34 | 35 | def run_experiments_5_folders_one_protein(platform,proteinomics_list_file,proteinomics_genotype_path, proteinomics_phenos_file,results_path,models_path,beta_path,protein_index, alpha_1, alpha_2, lambda_1, lambda_2): 36 | 37 | #read the full list of protein (or other type of omic trait) unique ids & read the current protein (or other type of trait) id 38 | proteins_list = read_proteins_list(proteinomics_list_file) 39 | protein_name = proteins_list[protein_index] 40 | 41 | print("Start processing {}-{}-{}".format(platform,protein_index,protein_name)) 42 | 43 | #read genotype matrix X and all sample ids and variants ids 44 | geno_file = proteinomics_genotype_path + protein_name + "_dosages.txt.gz" 45 | sample_ids,var_ids,X = read_protein_genotypes(geno_file) 46 | 47 | print("Number of Variants {}".format(len(var_ids))) 48 | print("Number of Samples {}".format(len(sample_ids))) 49 | 50 | #read protein levels of the given protein for all samples 51 | y = read_protein_phenos(proteinomics_phenos_file, protein_name, sample_ids) 52 | 53 | results_file = results_path + platform + "_" +protein_name + "_BR_UNI_prs.txt" 54 | 55 | f = open(results_file,'w') 56 | f.write('Time\tProtein\tN_Vars\tFolder\tBR_r2\tBR_sr\tUNI_r2\tUNI_sr\n') 57 | 58 | folder_count = 0 59 | kf = KFold(n_splits=5, shuffle=True, random_state=21) 60 | for train_index, test_index in kf.split(y): 61 | folder_count += 1 62 | 63 | print("folder-{}".format(folder_count)) 64 | 65 | x_train, x_test = X[train_index], X[test_index] 66 | y_train, y_test = y[train_index], y[test_index] 67 | 68 | print("{}-folder-{} Running
BayesianRidge...".format(datetime.datetime.now(), folder_count)) 69 | BR_model,br_r,br_r2,br_envs,br_sr = full_fit_BayesianRidge(x_train, x_test, y_train, y_test, alpha_1, alpha_2, lambda_1, lambda_2) 70 | print("r: {}, r2: {}, env: {}, sr: {}".format(br_r, br_r2,br_envs,br_sr)) 71 | model_file = models_path + platform + "_" + protein_name + "_BR_model_" + str(folder_count) + ".pkl" 72 | joblib.dump(BR_model, model_file) 73 | 74 | print("{}-folder-{} Running Univariant method...".format(datetime.datetime.now(), folder_count)) 75 | beta_file = beta_path + protein_name + "_variant_effects.txt" 76 | grs_r, grs_r2,grs_env,grs_sr = traditional_GRS_selected_vars(beta_file, x_test, y_test, var_ids) 77 | print("r: {}, r2: {}, env: {}, sr: {}".format(grs_r, grs_r2,grs_env, grs_sr)) 78 | 79 | write_text = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(datetime.datetime.now(),protein_name,X.shape[1],folder_count,br_r**2,br_sr,grs_r**2, grs_sr) 80 | f.write(write_text) 81 | f.flush() 82 | f.close() 83 | 84 | 85 | if __name__ == "__main__": 86 | 87 | # platforms: "SomaScan", "Olink", "Metabolon", "Nightingale", "RNAseq" 88 | platform= str(sys.argv[1]) 89 | 90 | # the omic trait index in a platform 91 | protein_index = int(sys.argv[2]) 92 | 93 | # BR priors for traits in the platform 94 | alpha_1 = float(sys.argv[3]) 95 | alpha_2 = float(sys.argv[4]) 96 | lambda_1= float(sys.argv[5]) 97 | lambda_2 = float(sys.argv[6]) 98 | 99 | 100 | # trait level file 101 | proteinomics_phenos_file = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform + "_5e8/phenotypes.txt" 102 | 103 | # trait list on the platform 104 | proteinomics_list_file = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform + "_5e8/" + platform + "_phenotype_info.txt" 105 | 106 | #folder store all genotype files at each folder 107 | proteinomics_genotype_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform + "_5e8/" 108 | 109 | # results path 110 | results_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/results/" + platform + '_5e8/' 111 | 112 | if os.path.isdir(results_path) == False: 113 | os.mkdir(results_path) 114 | 115 | # model path 116 | models_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/ml_models/" + platform + '_5e8/' 117 | 118 | if os.path.isdir(models_path) == False: 119 | os.mkdir(models_path) 120 | 121 | # path stores the the betas of selected QLT variants from GWAS 122 | beta_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform+ '_5e8/' 123 | 124 | 125 | # model training for one omic trait 126 | run_experiments_5_folders_one_protein(platform,proteinomics_list_file,proteinomics_genotype_path, proteinomics_phenos_file,results_path,models_path,beta_path,protein_index, alpha_1, alpha_2, lambda_1, lambda_2) 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import StandardScaler 3 | from lifelines import CoxPHFitter 4 | import sys 5 | import statsmodels.formula.api as smf 6 | import os.path 7 | 8 | if __name__ == "__main__": 9 | 10 | phe_code_file = 
str(sys.argv[1])
11 |     phe_code = phe_code_file.replace('Phecode_', '').replace('.csv.gz', '')
12 | 
13 |     pgs_index = int(sys.argv[2]) - 1
14 | 
15 |     # QC file from UKB
16 |     ukb_qc_file = "/home/yx322/rds/rds-jmmh2-post_qc_data/uk_biobank/reference_files/genetic/reference_files/full_release/QC_documents/sampleQC_fromUKB_withHeaders.txt"
17 |     # read UKB sample QC data to get genotyping array and PC info
18 |     df_ukb_qc = pd.read_csv(ukb_qc_file, skiprows=[i for i in range(0, 31)], sep=' ')
19 | 
20 | 
21 |     #### select the white British ancestry subset only ####
22 |     df_ukb_qc = df_ukb_qc.loc[df_ukb_qc['in.white.British.ancestry.subset'] == 1]
23 |     df_ukb_qc = df_ukb_qc[['#UKB_ID1', 'genotyping.array', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']]
24 | 
25 | 
26 |     pheno_file = "/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/PGSCatalog/PheWAS/_phenotyped/" + phe_code_file
27 |     df_pheno = pd.read_csv(pheno_file, compression='gzip')
28 |     df_pheno = df_pheno[['eid', 'genid', 'sex', 'PHECODE_AgeAsTimescale', 'PHECODE_AgeAsTimescale_Years']]
29 |     df_pheno = df_pheno.rename(columns={'eid': 'idno', 'PHECODE_AgeAsTimescale': 'PHENOTYPE', 'PHECODE_AgeAsTimescale_Years': 'CENSOR_AGE'})
30 | 
31 |     # Incorporate PC and genotyping array info
32 |     df_pheno_all = pd.merge(df_pheno, df_ukb_qc, left_on='genid', right_on='#UKB_ID1')
33 | 
34 |     omics_pgs_files = ['/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/yx322/UKB_omics_PGS/Metabolon_full/UKB_Metabolon.sscore.gz',
35 |                        "/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/yx322/UKB_omics_PGS/Olink_full/UKB_Olink.sscore.gz",
36 |                        '/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/yx322/UKB_omics_PGS/Somalogic_full/UKB_Somalogic.sscore.gz',
37 |                        '/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/omics_PGS_scores/UKB_Nightingale_5e8/UKB_Nightingale.sscore.gz']
38 | 
39 |     platforms = ['Metabolon', 'Olink', 'Somalogic', 'Nightingale']
40 | 
41 |     # add in gene expression PGS by chromosome
42 |     for chr in range(1, 23):
43 |         GE_pgs_file = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/omics_PGS_scores/UKB_GE_5e-8/chr" + str(chr) + "/UKB_GE.sscore.gz"
44 |         omics_pgs_files.append(GE_pgs_file)
45 |         platform_name = "GE_chr" + str(chr)
46 |         platforms.append(platform_name)
47 | 
48 |     i = pgs_index
49 |     platform = platforms[i]
50 |     SOMA_PGS_file = omics_pgs_files[i]
51 |     df_soma_pgs = pd.read_csv(SOMA_PGS_file, sep='\t', compression='gzip')
52 |     score_cols = list(df_soma_pgs.columns[1:])
53 | 
54 |     write_file = '/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/UKB_phecode_association/raw_assocs/' + platform + '_PGS_UKB_' + phe_code + '_assoc_full_EU.txt'
55 | 
56 |     df_save = pd.DataFrame(columns=['Trait', 'HR', 'HR_low', 'HR_high', 'pvalue'])
57 | 
58 |     for col_name in score_cols:
59 | 
60 |         df_one_pgs = df_soma_pgs[['IID', col_name]]
61 | 
62 |         df_pheno_test = pd.merge(df_one_pgs, df_pheno_all, left_on='IID', right_on='idno')
63 |         df_pheno_test = df_pheno_test[[col_name, 'sex', 'PHENOTYPE', 'CENSOR_AGE', 'genotyping.array', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']]
64 | 
65 |         df_pheno_test = pd.get_dummies(df_pheno_test, drop_first=True)
66 |         df_pheno_test[[col_name, 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']] = StandardScaler().fit_transform(df_pheno_test[[col_name, 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']])
67 | 
68 |         # adjust PGS for PCs
69 |         #reg_trait = col_name
+ ' ~ PC1+ PC2+ PC3+ PC4+ PC5+ PC6+ PC7+ PC8+ PC9+ PC10'
70 |         #adj_result = smf.ols(reg_trait, data=df_pheno_test, missing='drop').fit()
71 |         #df_pheno_test[col_name] = adj_result.resid
72 |         #df_pheno_test[[col_name]] = StandardScaler().fit_transform(df_pheno_test[[col_name]])
73 | 
74 |         df_pheno_test['temp_col'] = df_pheno_test[col_name]  # regress via a temporary column, since score column names are not always valid in a formula
75 |         reg_trait = 'temp_col ~ PC1+ PC2+ PC3+ PC4+ PC5+ PC6+ PC7+ PC8+ PC9+ PC10'
76 |         adj_result = smf.ols(reg_trait, data=df_pheno_test, missing='drop').fit()
77 |         df_pheno_test[col_name] = adj_result.resid
78 |         del df_pheno_test['temp_col']
79 |         # remove NaN rows
80 |         index_nan = list(df_pheno_test[df_pheno_test.isnull().any(axis=1)].index)
81 |         df_pheno_test.drop(index_nan, inplace=True)
82 | 
83 |         df_pheno_test[[col_name]] = StandardScaler().fit_transform(df_pheno_test[[col_name]])
84 | 
85 |         if 'sex_Male' in df_pheno_test.columns:
86 |             try:
87 |                 cph1 = CoxPHFitter()
88 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', strata=['sex_Male'])
89 |             except Exception:  # retry with a smaller step size if the fit fails to converge
90 |                 cph1 = CoxPHFitter()
91 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', strata=['sex_Male'], step_size=0.1)
92 |         else:
93 |             try:
94 |                 cph1 = CoxPHFitter()
95 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE')
96 |             except Exception:  # retry with a smaller step size if the fit fails to converge
97 |                 cph1 = CoxPHFitter()
98 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', step_size=0.1)
99 | 
100 |         #cph1 = CoxPHFitter()
101 |         # cph1.fit(df_testing_analysis_dummies, duration_col='duration', event_col='chd_case',strata=['sex_Male'])
102 |         #cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', strata=['sex_Male'])
103 |         results = [col_name, cph1.hazard_ratios_[col_name], cph1.summary.loc[col_name, 'exp(coef) lower 95%'], cph1.summary.loc[col_name, 'exp(coef) upper 95%'], cph1.summary.loc[col_name, 'p']]
104 | 
105 |         #print(results)
106 | 
107 |         df_save = pd.concat([df_save, pd.DataFrame([{'Trait': results[0], 'HR': results[1], 'HR_low': results[2], 'HR_high': results[3], 'pvalue': results[4]}])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
108 | 
109 |     df_save = df_save.sort_values('pvalue')
110 |     df_save.to_csv(write_file, sep='\t', index=False)
111 | 
112 | 
--------------------------------------------------------------------------------
/03_collate_QTLs/01_collate_QTLs.R:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Load libraries/dependencies
3 | # --------------------------------------------------------------------------------------
4 | library(data.table)
5 | library(openxlsx)
6 | library(foreach)
7 | library(doMC)
8 | 
9 | # --------------------------------------------------------------------------------------
10 | # Set global script options
11 | # --------------------------------------------------------------------------------------
12 | 
13 | out_dir = "geno_files"
14 | ldthinned = "geno_files/genotype_data/ldthinned" # variant set to consider
15 | 
16 | # Data comes from other projects:
17 | soma_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/somalogic_proteomics/interval/gwas/BAKEOFF151001/gwas_output/imputed/somalogic/meta"
18 | nmr_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/nightingale_metabolomics/interval/gwas/nmr/results/HPC_results"
19 | metabo_GWAS = "/rds/project/jmmh2/rds-jmmh2-results/private/metabolomics/metabolon_hd4/interval_gwas/raw_results"
20 | olink_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/scallop/jp549/olink-merged-output"
21 |
olink_neu_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/interval_gwas_discovery/neu/interval_subset_olink/neuro/full_set/output/formatted_assoc_results"
22 | 
23 | ## ======================================================================================
24 | ## First, we want to load the summary statistics for all platforms, filter to the
25 | ## LD-thinned variant set, and add a basic filter of P < 0.01 - we want to find a
26 | ## reasonable P-value threshold across all platforms but need to load the summary stats
27 | ## for all measurements
28 | ## ======================================================================================
29 | 
30 | varset = foreach(chr_id = 1:22, .combine=rbind) %do% {
31 |   fread(sprintf("%s/impute_%s_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8.pvar", ldthinned, chr_id))
32 | }
33 | setnames(varset, c("chr", "pos", "id", "ref", "alt"))
34 | 
35 | ##' # --------------------------------------------------------------------------------------
36 | ##' # Load in the NMR GWAS summary stats
37 | ##' # ---------------------------------------------------------------------------------------
38 | ##' 
39 | ##' nmr_files = list.files(path=nmr_GWAS, pattern="*.gz$")
40 | ##' nmr_ss = foreach(ff = nmr_files, .combine=rbind) %do% {
41 | ##'   # Load from the summary stats variants with P < 0.01 (filtered using awk).
42 | ##'   # The header row is discarded because it is malformed.
43 | ##'   ss = fread(cmd = sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $11 < 0.01 ) { print $2,$3,$5,$6,$9,$11 } }\'', nmr_GWAS, ff))
44 | ##'   setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "beta", "pval"))
45 | ##'   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
46 | ##'   ss[, phenotype := gsub(".tar.gz", "", ff)]
47 | ##'   return(ss)
48 | ##' }
49 | ##' fwrite(nmr_ss, file=sprintf("%s/nightingale_p_less_0.1.txt", out_dir), sep="\t", quote=F)
50 | ##' rm(nmr_ss)
51 | ##' gc()
52 | ##' 
53 | ##' # --------------------------------------------------------------------------------------
54 | ##' # Load in the Metabolon HD4 GWAS summary stats
55 | ##' # ---------------------------------------------------------------------------------------
56 | ##' 
57 | ##' metabo_files = list.files(path=metabo_GWAS, pattern="*.gz$")
58 | ##' metabo_ss = foreach(ff = metabo_files, .combine=rbind) %do% {
59 | ##'   ss = fread(cmd = sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $9 < 0.01 ) { print $1,$2,$3,$7,$9 } }\'', metabo_GWAS, ff))
60 | ##'   setnames(ss, c("markername", "effect_allele", "other_allele", "beta", "pval"))
61 | ##'   ss[, chr := as.integer(gsub("chr", "", gsub(":.*", "", markername)))]
62 | ##'   ss[, pos := as.integer(gsub(":.*", "", gsub("chr.*?:", "", markername)))]
63 | ##'   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
64 | ##'   ss = ss[, .(chr, pos, effect_allele, other_allele, beta, pval)]
65 | ##'   ss[, phenotype := gsub("_.*", "", gsub("INTERVAL_", "", ff))]
66 | ##'   return(ss)
67 | ##' }
68 | ##' fwrite(metabo_ss, file=sprintf("%s/metabolon_p_less_0.1.txt", out_dir), sep="\t", quote=F)
69 | ##' rm(metabo_ss)
70 | ##' gc()
71 | 
72 | # --------------------------------------------------------------------------------------
73 | # Load in the Olink data
74 | # ---------------------------------------------------------------------------------------
75 | 
76 | olink_files = list.files(path=olink_GWAS, pattern="*.gz$")
77 | olink_ss = foreach(ff = olink_files, .combine=rbind) %do% {
78 |   ss = fread(cmd =
sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $22 < 0.01 ) { print $3,$4,$5,$6,$22 } }\'', olink_GWAS, ff))
79 |   setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "pval"))
80 |   ss[, chr := as.integer(chr)]
81 |   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
82 |   ss[, phenotype := gsub("_chr_merged.gz", "", gsub("INTERVAL_", "", ff))]
83 |   return(ss)
84 | }
85 | 
86 | olink_neu_files = list.files(path=olink_neu_GWAS, pattern="*.gz$")
87 | olink_neu_ss = foreach(ff = olink_neu_files, .combine=rbind) %do% {
88 |   ss = fread(cmd = sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $22 < 0.01 ) { print $3,$4,$5,$6,$22 } }\'', olink_neu_GWAS, ff))
89 |   setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "pval"))
90 |   ss[, chr := as.integer(chr)]
91 |   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
92 |   ss[, phenotype := paste0("neu_", gsub("_olink.*", "", ff))]
93 |   return(ss)
94 | }
95 | 
96 | olink_ss = rbind(olink_ss, olink_neu_ss)
97 | fwrite(olink_ss, file=sprintf("%s/olink_p_less_0.1.txt", out_dir), sep="\t", quote=F)
98 | rm(olink_ss, olink_neu_ss)
99 | gc()
100 | 
101 | # --------------------------------------------------------------------------------------
102 | # Load in the SomaLogic data
103 | # ---------------------------------------------------------------------------------------
104 | 
105 | soma_dirs = list.files(path=soma_GWAS)
106 | soma_ss = foreach(dd = soma_dirs, .combine=rbind) %do% {
107 |   soma_files = list.files(path=sprintf("%s/%s", soma_GWAS, dd), pattern="*.gz$")
108 |   foreach(ff = soma_files, .combine=rbind) %do% {
109 |     ss = fread(cmd = sprintf('zcat %s/%s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $8 < -2 ) { print $1,$2,$4,$5,$8 } }\'', soma_GWAS, dd, ff))
110 |     setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "pval"))
111 |     ss[, pval := 10^pval] # column 8 is log10(P), so the -2 filter above corresponds to P < 0.01
112 |     ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
113 |     ss[, phenotype := dd]
114 |     return(ss)
115 |   }
116 | }
117 | 
118 | fwrite(soma_ss, file=sprintf("%s/somalogic_p_less_0.1.txt", out_dir), sep="\t", quote=F)
119 | 
120 | 
121 | 
122 | 
--------------------------------------------------------------------------------
/LDpred2/LDpred2_auto.R:
--------------------------------------------------------------------------------
1 | library(bigsnpr)
2 | library(data.table)
3 | library(foreach)
4 | library(tictoc)
5 | library(ggplot2)
6 | library(bit64)
7 | 
8 | 
9 | setwd("/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/OmicsPred_LDpred_benchmarks")
10 | args = commandArgs(trailingOnly=TRUE)
11 | 
12 | # the trait name from the GWAS
13 | gwas <- args[1]
14 | 
15 | # Make sure we're running on an appropriate compute node to make use of the /ramdisks partition.
16 | # File-backed shared memory objects can only be used effectively when using the /ramdisks/ partition
17 | # as a temporary working directory, which is only available here on the skylake partitions.
18 | # Trying to use the lustre filesystem means LDpred2 grinds to a halt unless run on a single core
19 | # (and only one instance across the whole cluster) due to the consistency checks made by the lustre
20 | # filesystem on shared memory objects.
21 | 
22 | if (!(Sys.getenv("SLURM_JOB_PARTITION") %like% "skylake")) {
23 |   stop("Script must be run on a compute node on the skylake or skylake-himem partitions")
24 | }
25 | 
26 | 
27 | ## create ramdir
28 | starttime = Sys.time()
29 | ramdir <- "/ramdisks/ldpred2/"
30 | if (dir.exists(ramdir)) system(sprintf("rm -rf %s", ramdir), wait=TRUE)
31 | system(sprintf("mkdir -p %s", ramdir), wait=TRUE)
32 | 
33 | 
34 | # Copy genotype data to ramdisks
35 | system(sprintf("cp data/INTERVAL_genotypes/filtered_interval_chr*{rds,bk} %s", ramdir), wait=TRUE)
36 | 
37 | 
38 | gwas_file <- sprintf("data/GWAS_SumStats/gwas_filteredCleaned_%s.txt", gwas)
39 | stopifnot(file.exists(gwas_file))
40 | 
41 | # Setup output directory
42 | outdir <- sprintf("output/ldpred2/train/%s", gwas)
43 | system(sprintf("mkdir -p %s", outdir), wait=TRUE)
44 | 
45 | # Set up temporary directories - clean up if already exists
46 | tmpdir <- sprintf("tmp/ldpred2/%s", gwas)
47 | if (dir.exists(tmpdir)) system(sprintf("rm -rf %s", tmpdir), wait=TRUE)
48 | system(sprintf("mkdir -p %s", tmpdir), wait=TRUE)
49 | 
50 | ### Do per-SNP QC of GWAS summary statistics
51 | # Load the gwas_ss, match to the genotype data, and obtain per-SNP standard deviations and
52 | # allele frequencies for downstream SNP QC.
53 | gwas_ss <- fread(gwas_file)
54 | endtime = Sys.time()
55 | data_prep_time = as.numeric(difftime(endtime, starttime, units="secs"))
56 | 
57 | 
58 | # This loop needs at least 80GB of memory and takes ~16min to run
59 | tic("#1")
60 | starttime = Sys.time()
61 | gwas_ss <- foreach(this_chr = 1:22, .combine=rbind) %dopar% { # NB: no parallel backend is registered in this script, so %dopar% runs sequentially (with a warning)
62 |   cat("\nChromosome:", this_chr, "\n")
63 |   # Attach file-backed genotype data
64 |   geno <- snp_attach(sprintf("%s/filtered_interval_chr%s.rds", ramdir, this_chr))
65 | 
66 |   ## Match summary stats to genotype data. A few notes:
67 |   # - Summary stats for all GWAS have already been filtered to the HapMap3 variant
68 |   #   set that intersects with the variants in INTERVAL
69 |   # - Strand orientation of alleles has also already been harmonized to INTERVAL for
70 |   #   all GWAS
71 | 
72 |   cat("- snp_match \n")
73 |   map <- geno$map[-2]
74 |   names(map) <- c("chr", "rsid", "pos", "a0", "a1")
75 | 
76 |   # snp_match not needed as it's been done before
77 |   matched_snps <- snp_match(as.data.frame(gwas_ss[chr == this_chr]), map, strand_flip=FALSE)
78 |   setDT(matched_snps)
79 |   # matched_snps <- gwas_ss[chr == this_chr]
80 | 
81 |   cat("- allele frequency \n")
82 |   # Obtain allele frequencies in the training data - dosages count 'a0'
83 |   # this is the time consuming step
84 |   matched_snps[, a1freq := Matrix::colSums(geno$genotypes[, `_NUM_ID_`], na.rm=TRUE) / (Matrix::colSums(!is.na(geno$genotypes[, `_NUM_ID_`]))*2)]
85 |   matched_snps[, test := Matrix::colSums(geno$genotypes[, `_NUM_ID_`], na.rm=TRUE) ] # raw allele counts (kept for debugging)
86 | 
87 |   cat("- sd of allele frequency \n")
88 |   # Compute the expected standard deviation of the dosages under Hardy-Weinberg equilibrium
89 |   # See https://privefl.github.io/bigsnpr-extdoc/polygenic-scores-pgs.html
90 |   matched_snps[, sd_val := sqrt(2 * a1freq * (1 - a1freq))]
91 | 
92 |   # Return
93 |   return(matched_snps)
94 | }
95 | toc()
96 | endtime = Sys.time()
97 | geno_load_time = as.numeric(difftime(endtime, starttime, units="secs"))
98 | # about 20 mins
99 | 
100 | 
101 | # Do per-SNP QC of the summary stats.
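# A sketch of the logic behind the check implemented below, following the bigsnpr
# docs cited on the next lines: for a linear GWAS of trait y on dosage G_j,
# se(beta_j) ~= sd(y) / (sqrt(n) * sd(G_j)), so the dosage SD implied by the GWAS
# can be recovered as sd_ss = sd(y) / (beta_se * sqrt(n_eff)); sd(y) itself is
# estimated as median(sd_val * beta_se * sqrt(n_eff)) across SNPs. SNPs whose
# GWAS-implied SD disagrees with the training-data SD (sd_val = sqrt(2*f*(1-f)))
# are flagged in fail_qc; e.g. a SNP with sd_val = 0.5 but sd_ss = 0.2 would fail
# the sd_ss < 0.5 * sd_val rule. (Illustrative numbers only.)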
102 | # See https://privefl.github.io/bigsnpr-extdoc/polygenic-scores-pgs.html and
103 | # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8016455/
104 | 
105 | gwas_ss[sd_val > 0, sd_y_est := median(sd_val * beta_se * sqrt(n_eff))]
106 | gwas_ss[sd_val > 0, sd_ss := sd_y_est / (beta_se * sqrt(n_eff))]
107 | gwas_ss[, fail_qc := sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05]
108 | 
109 | gwas_qc <- gwas_ss[, .(rsid=rsid.ss, chr, pos, effect_allele=a1, other_allele=a0, gwas_beta=beta, gwas_se=beta_se,
110 |                        n_eff, trainingset_EAF=a1freq, sd_val, sd_y_est, sd_ss, fail_qc)]
111 | 
112 | fwrite(gwas_qc, sep="\t", quote=FALSE, compress="gzip", file=sprintf("%s/ldpred2_gwasqc.txt.gz", outdir))
113 | 
114 | 
115 | # Diagnostic plot
116 | g <- ggplot(gwas_qc, aes(x=sd_val, y=sd_ss, color=fail_qc)) +
117 |   theme_bigstatsr() +
118 |   geom_point(shape=19, size=0.5, alpha=0.5) +
119 |   geom_abline(intercept=0, slope=1, linetype=2, colour="red") +
120 |   scale_colour_manual(name="SNP failed ldpred2 QC", values=c("TRUE"="purple", "FALSE"="yellow")) +
121 |   xlab("SNP dosage SD in training dataset") +
122 |   ylab("SNP dosage SD in GWAS") +
123 |   theme(legend.position="bottom")
124 | ggsave(g, width=7.2, height=6, file=sprintf("%s/ldpred2_gwasqc_%s.png", outdir, gwas))
125 | 
126 | 
127 | tic("#2") # ~10min (or 40min sometimes...)
128 | starttime = Sys.time()
129 | for (this_chr in 1:22) {
130 |   cat("\nLoading correlation matrix for chromosome ", this_chr, "\n")
131 | 
132 |   # Load correlation matrix precomputed for all candidate variants
133 |   corr_file <- sprintf("data/ldpred2/filtered_interval_ldcorr_chr%s.rds", this_chr)
134 |   stopifnot(file.exists(corr_file))
135 |   corr0 <- readRDS(corr_file)
136 | 
137 |   # Filter to those passing QC for this GWAS
138 |   cat("- Filter out failed QC \n")
139 |   corr0 <- gwas_ss[chr == this_chr & !(fail_qc), corr0[`_NUM_ID_`, `_NUM_ID_`]]
140 | 
141 |   # Compute LD score
142 |   cat("- Compute LD score \n")
143 |   gwas_ss[chr == this_chr & !(fail_qc), LDsum := Matrix::colSums(corr0^2)]
144 | 
145 |   foo <- sum(is.na(gwas_ss[chr == this_chr & !(fail_qc),]$LDsum))
146 |   if (foo > 0) {
147 |     cat("(!)
LDsum missing:", foo) 148 | } 149 | 150 | # Aggregate into a single sparse big matrix 151 | if (this_chr == 1) { 152 | cat("Initialized SFBM\n") 153 | genocorr <- as_SFBM(corr0, backingfile=sprintf("%s/ldcorr_passqc", tmpdir), compact = TRUE) 154 | } else { 155 | cat("Adding matrix to SFBM\n") 156 | genocorr$add_columns(corr0, nrow(genocorr)) 157 | } 158 | } 159 | toc() 160 | endtime= Sys.time() 161 | corr_load_time = as.numeric(difftime(endtime, starttime, units="secs")) 162 | 163 | 164 | cat("Moving SFBM backing file to /ramdisks\n") 165 | system(sprintf("cp %s/ldcorr_passqc.sbk %s/", tmpdir, ramdir), wait=TRUE) 166 | system(sprintf("rm %s/ldcorr_passqc.sbk", tmpdir), wait=TRUE) 167 | system(sprintf("ln -s %s/ldcorr_passqc.sbk %s/", ramdir, tmpdir), wait=TRUE) 168 | 169 | ## Calculate LDSC results (~3min) 170 | tic("LDSC") 171 | starttime = Sys.time() 172 | ldsc <- gwas_ss[!(fail_qc), snp_ldsc( 173 | ld_score = LDsum, ld_size = .N, 174 | chi2 = (beta / beta_se)^2, 175 | sample_size = n_eff, 176 | ncores = 1 177 | )] 178 | toc() 179 | endtime= Sys.time() 180 | ldsc_cal_time = as.numeric(difftime(endtime, starttime, units="secs")) 181 | 182 | #save ldsc estimates 183 | write.csv(ldsc,sprintf("%s/ldsc_results.csv", outdir),quote=FALSE) 184 | 185 | # Extract estimated heritability 186 | h2_est <- ldsc[["h2"]] 187 | 188 | # assgin a small heritability estimate when a negative value is returned in ldsc 189 | if (h2_est<0){ 190 | h2_est=0.001 191 | } 192 | 193 | 194 | ### Run auto model 195 | cat("Running auto model\n") 196 | 197 | tic("Auto model") 198 | starttime = Sys.time() 199 | multi_auto <- snp_ldpred2_auto( 200 | genocorr, gwas_ss[!(fail_qc)], h2_init = h2_est, allow_jump_sign=FALSE, 201 | vec_p_init = seq_log(1e-4, 0.2, length.out = 30), 202 | ncores = nb_cores() 203 | ) 204 | toc() 205 | endtime= Sys.time() 206 | ldpred_train_time = as.numeric(difftime(endtime, starttime, units="secs")) 207 | 208 | 209 | 210 | 211 | # check for "chain" convergence 212 | auto_params <- rbindlist(lapply(multi_auto, function(x) { 213 | data.table(p_init = x$p_init, h2_init = x$h2_init, p_est = x$p_est, h2_est = x$h2_est) 214 | })) 215 | auto_params[, paramset := .I] 216 | 217 | auto_path <- foreach(pIdx = seq_along(multi_auto), .combine=rbind) %do% { 218 | auto = multi_auto[[pIdx]] 219 | data.table(paramset = pIdx, path_iter = seq_along(auto$path_p_est), 220 | p_est = auto$path_p_est, h2_est = auto$path_h2_est) 221 | } 222 | 223 | g1 <- ggplot(auto_path) + aes(x = path_iter, y=p_est) + 224 | theme_bigstatsr() + 225 | geom_hline(data = auto_params, aes(yintercept=p_est), col="blue") + 226 | geom_point(shape=19, size=0.5) + 227 | scale_y_log10(name="p") + xlab("") + 228 | facet_wrap(~ paramset, ncol=10, labeller = label_both) + 229 | theme(strip.background=element_blank(), strip.text=element_text(size=6), 230 | axis.text=element_text(size=6), axis.title=element_text(size=10)) 231 | 232 | g2 <- ggplot(auto_path) + aes(x = path_iter, y=h2_est) + 233 | theme_bigstatsr() + 234 | geom_hline(data = auto_params, aes(yintercept=h2_est), col="blue") + 235 | geom_point(shape=19, size=0.5) + 236 | ylab("h2") + xlab("") + 237 | facet_wrap(~ paramset, ncol=10, labeller = label_both) + 238 | theme(strip.background=element_blank(), strip.text=element_text(size=6), 239 | axis.text=element_text(size=6), axis.title=element_text(size=10)) 240 | 241 | g <- plot_grid(g1, g2, nrow=2) 242 | ggsave(g, width=20, height=10, units="in", file=sprintf("%s/ldpred2_auto_chain_convergence.png", outdir)) 243 | 244 | 245 | ## select 
genetic score models to keep
246 | # and use the mean of the betas of these selected models as the betas of the final genetic score model with LDpred2-auto
247 | # see https://privefl.github.io/bigsnpr/articles/LDpred2.html
248 | (range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est))))
249 | (keep <- (range > (0.95 * quantile(range, 0.95))))
250 | beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est))
251 | 
252 | 
253 | pgs_auto <- foreach(this_chr = 1:22, .combine=`+`) %do% {
254 |   geno <- snp_attach(sprintf("%s/filtered_interval_chr%s.rds", ramdir, this_chr))
255 |   geno <- snp_fastImputeSimple(geno$genotypes, ncores=nb_cores()) # big_prodVec() doesn't work with missing genotypes, so impute them first
256 |   big_prodVec(
257 |     X = geno,
258 |     y.col = beta_auto[gwas_ss[!(fail_qc), which(chr == this_chr)]],
259 |     ind.col = gwas_ss[!(fail_qc) & chr == this_chr, `_NUM_ID_`],
260 |     ncores = nb_cores()
261 |   )
262 | }
263 | 
264 | # get all sample IDs of the INTERVAL data
265 | geno <- snp_attach(sprintf("%s/filtered_interval_chr%s.rds", ramdir, 1))
266 | sample_IDs = geno$fam$sample.ID
267 | 
268 | # save the calculated genetic scores of the LDpred2-auto model for INTERVAL individuals
269 | auto_pgs_data = data.table(sample_IDs, pgs_auto)
270 | write.table(auto_pgs_data, sprintf("%s/ldpred2_auto_sample_pgs.csv", outdir), quote=FALSE, sep="\t", row.names=FALSE)
271 | 
272 | # save the genetic score model developed using LDpred2-auto
273 | gwas_ss_sub = gwas_ss[!(fail_qc)]
274 | gwas_ss_sub$auto_beta = beta_auto
275 | auto_pgs_model = gwas_ss_sub[,c('rsid.ss','chr','pos','a1','a0',"auto_beta")]
276 | colnames(auto_pgs_model) = c("rsid",'chr','pos','effect_allele','other_allele','effect')
277 | write_file = sprintf("%s/ldpred2_auto_pgs_model.txt", outdir)
278 | write.table(auto_pgs_model, write_file, quote=FALSE, sep="\t", row.names=FALSE)
279 | 
280 | # save running times at each stage of the genetic score development with LDpred2-auto
281 | values = c(data_prep_time, geno_load_time, corr_load_time, ldsc_cal_time, ldpred_train_time)
282 | time_name = c('data_prep_time','geno_load_time','corr_load_time','ldsc_cal_time','ldpred_train_time')
283 | df_time <- data.frame(time_name, values)
284 | write_file = sprintf("%s/running_time.txt", outdir)
285 | write.table(df_time, write_file, quote=FALSE, sep="\t", row.names=FALSE)
286 | 
287 | # save the estimated heritability of all the genetic score models trained with LDpred2-auto above
288 | Kept = keep
289 | Heritability = sapply(multi_auto, `[[`, "h2_est")
290 | df_heri <- data.frame(Kept, Heritability)
291 | write_file = sprintf("%s/auto_model_heritability.txt", outdir)
292 | write.table(df_heri, write_file, quote=FALSE, sep="\t", row.names=FALSE)
293 | 
294 | 
295 | 
296 | ### Clean up
297 | 
298 | system(sprintf("rm -rf %s", tmpdir), wait=TRUE)
299 | system(sprintf("rm -rf %s", ramdir), wait=TRUE)
--------------------------------------------------------------------------------
/04_extract_QTLs/01_extract_QTLs.R:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Load libraries/dependencies
3 | # --------------------------------------------------------------------------------------
4 | library(data.table)
5 | library(foreach)
6 | library(doMC)
7 | library(AnnotationHub) # Bioconductor package
8 | library(annotables)    # remotes::install_github("stephenturner/annotables")
9 | 
10 | args = commandArgs(trailingOnly=TRUE)
11 |
trans_pthresh = as.numeric(args[1])
12 | cis_pthresh = as.numeric(args[2])
13 | 
14 | if (is.na(trans_pthresh) || trans_pthresh > 0.001 || trans_pthresh < 0) {
15 |   stop("Trans/genome-wide P-value threshold must be <= 0.001")
16 | }
17 | 
18 | if (is.na(cis_pthresh) || cis_pthresh > 1 || cis_pthresh < 0) {
19 |   stop("Cis P-value threshold must be between 0 and 1")
20 | }
21 | 
22 | 
23 | ncores = 25
24 | 
25 | parallelise_fread = function() {
26 |   setDTthreads(ncores)
27 |   registerDoMC(1)
28 | }
29 | 
30 | parallelise_foreach = function() {
31 |   setDTthreads(1)
32 |   registerDoMC(ncores)
33 | }
34 | 
35 | # --------------------------------------------------------------------------------------
36 | # Set paths
37 | # --------------------------------------------------------------------------------------
38 | 
39 | out_dir = "geno_files/ml_inputs"
40 | geno_dir = "geno_files/genotype_data/ldthinned" # variant set to consider
41 | trait_dir = "/rds/project/jmmh2/rds-jmmh2-projects/polygenic/internal/interval_grs_scan/analyses/processed_traits"
42 | trait_dir2 = "/rds/project/jmmh2/rds-jmmh2-projects/polygenic/internal/INTERVAL_gwasqc_technical_covariates_only/qced"
43 | 
44 | olink_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/scallop/jp549/olink-merged-output"
45 | olink_neu_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/interval_gwas_discovery/neu/interval_subset_olink/neuro/full_set/output/formatted_assoc_results"
46 | soma_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/somalogic_proteomics/interval/gwas/BAKEOFF151001/gwas_output/imputed/somalogic/meta"
47 | 
48 | processed_GWAS = "geno_files"
49 | 
50 | tmpdir = sprintf("%s/tmpdir", out_dir)
51 | dir.create(tmpdir, showWarnings=FALSE)
52 | 
53 | # Define high-complexity regions used to extend 1MB cis windows when handling protein cis-QTLs
54 | # See flashpca exclusion regions: https://github.com/gabraham/flashpca
55 | # Coordinates are HG19
56 | complex_ld <- data.table(
57 |   region_chr=c(5, 6, 8, 11),
58 |   region_start=c(44000000, 25000000, 8000000, 45000000),
59 |   region_end=c(51500000, 33500000, 12000000, 57000000),
60 |   region_name=c("r1", "MHC", "r3", "r4")
61 | )
62 | 
63 | 
64 | ## ======================================================================================
65 | ## First, we want to load the summary statistics for all platforms, filter to the
66 | ## LD-thinned variant set, and add a basic filter of P < 0.01 - we want to find a
67 | ## reasonable P-value threshold across all platforms but need to load the summary stats
68 | ## for all measurements
69 | ## ======================================================================================
70 | 
71 | parallelise_fread()
72 | varset = foreach(chr_id = 1:22, .combine=rbind) %do% {
73 |   fread(sprintf("%s/impute_%s_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8.pvar", geno_dir, chr_id))
74 | }
75 | setnames(varset, c("chr", "pos", "id", "ref", "alt"))
76 | setkey(varset, chr, pos)
77 | 
78 | # --------------------------------------------------------------------------------------
79 | # Load in NMR GWAS results, filter to P < trans_pthresh and output variant effect files
80 | # --------------------------------------------------------------------------------------
81 | 
82 | if (!file.exists(sprintf("%s/Nightingale_phenotype_info.txt", out_dir))) {
83 |   nmr_SS = fread(sprintf("%s/nightingale_p_less_0.1.txt", processed_GWAS))
84 |   nmr_SS = nmr_SS[pval < trans_pthresh]
85 | 
86 |   # Make sure we've accurately filtered to varset snps
87 |   nmr_SS = rbind(
88 |
nmr_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
89 |     nmr_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
90 |   )
91 |   nmr_SS = nmr_SS[order(pos)][order(chr)][order(phenotype)]
92 | 
93 |   nmr_SS[varset, on = .(chr, pos), rsid := id] # add rsid
94 |   nmr_SS = nmr_SS[, .SD[which.min(pval)], by=.(chr, pos, phenotype)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
95 | 
96 |   # Load phenotype data and info
97 |   nmr_info = fread(sprintf("%s/nmr_metabolomics/trait_info.tsv", trait_dir))
98 |   nmr_pheno = fread(sprintf("%s/nmr_metabolomics/traits.tsv", trait_dir))
99 | 
100 |   # Some of the trait names have "_" at the end (measurements with %). Since we use "_" as
101 |   # a file name separator, we'll just replace these
102 |   nmr_info[, variable := gsub("_", ".pct", variable)]
103 |   nmr_pheno[, variable := gsub("_", ".pct", variable)]
104 |   nmr_SS[, phenotype := gsub("_", ".pct", phenotype)]
105 | 
106 |   # Filter phenotype data
107 |   nmr_pheno = nmr_pheno[!is.na(value)]
108 | 
109 |   # Fix column names:
110 |   setnames(nmr_info, "variable", "PhenotypeCompName")
111 |   setnames(nmr_pheno, "variable", "PhenotypeCompName")
112 |   setnames(nmr_SS, "phenotype", "PhenotypeCompName")
113 | 
114 |   # Filter info sheet and phenotype data to measurements
115 |   # with at least 1 variant passing the P-value threshold
116 |   nmr_info = nmr_info[PhenotypeCompName %chin% nmr_SS$PhenotypeCompName]
117 |   nmr_pheno = nmr_pheno[PhenotypeCompName %chin% nmr_SS$PhenotypeCompName]
118 | 
119 |   # write out variant effects file for each phenotype
120 |   foreach(phen_id = unique(nmr_SS$PhenotypeCompName)) %do% {
121 |     fwrite(nmr_SS[PhenotypeCompName == phen_id, .(rsid, chr, pos, effect_allele, other_allele, effect=beta, pval)],
122 |            sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))
123 |   }
124 | 
125 |   # Write out info sheet for NMR variables:
126 |   if (nrow(nmr_info) > 0) {
127 |     fwrite(nmr_info[, .(PhenotypeCompName, Name, Description, Units, Group, Sub.Group)],
128 |            sep="\t", quote=FALSE, file=sprintf("%s/Nightingale_phenotype_info.txt", out_dir))
129 |   }
130 | 
131 |   # Free objects to free memory
132 |   rm(nmr_info, nmr_SS)
133 |   gc()
134 | } else {
135 |   cat("Nightingale NMR GWAS summary statistics filtered by previous run.
Loading and filtering phenotype data...\n")
136 |   nmr_info = fread(sprintf("%s/Nightingale_phenotype_info.txt", out_dir))
137 |   nmr_pheno = fread(sprintf("%s/nmr_metabolomics/traits.tsv", trait_dir))
138 |   nmr_pheno[, variable := gsub("_", ".pct", variable)]
139 |   nmr_pheno = nmr_pheno[!is.na(value)]
140 |   setnames(nmr_pheno, "variable", "PhenotypeCompName")
141 |   nmr_pheno = nmr_pheno[PhenotypeCompName %chin% nmr_info$PhenotypeCompName]
142 |   rm(nmr_info); gc()
143 | }
144 | 
145 | # --------------------------------------------------------------------------------------
146 | # Do the same for the Metabolon data
147 | # ---------------------------------------------------------------------------------------
148 | 
149 | if (!file.exists(sprintf("%s/Metabolon_phenotype_info.txt", out_dir))) {
150 |   metabo_SS = fread(sprintf("%s/metabolon_p_less_0.1.txt", processed_GWAS))
151 |   metabo_SS = metabo_SS[pval < trans_pthresh]
152 | 
153 |   # Make sure we've accurately filtered to varset snps
154 |   metabo_SS = rbind(
155 |     metabo_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
156 |     metabo_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
157 |   )
158 |   metabo_SS = metabo_SS[order(pos)][order(chr)][order(phenotype)]
159 | 
160 |   metabo_SS[varset, on = .(chr, pos), rsid := id] # add rsid
161 |   metabo_SS = metabo_SS[, .SD[which.min(pval)], by=.(chr, pos, phenotype)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
162 | 
163 |   # Load phenotype data and info
164 |   metabo_info = fread(sprintf("%s/metabolon_metabolomics/trait_info.tsv", trait_dir))
165 |   metabo_pheno = fread(sprintf("%s/metabolon_metabolomics/traits.tsv", trait_dir))
166 | 
167 |   # Make the variable name in the phenotype data table match the GWAS
168 |   metabo_pheno[, variable := gsub("^m", "M", variable)]
169 | 
170 |   # Filter phenotype data
171 |   metabo_pheno = metabo_pheno[!is.na(value)]
172 | 
173 |   # Filter info table to analysed metabolites:
174 |   metabo_info = metabo_info[comp_id %in% metabo_pheno$variable]
175 | 
176 |   # Fix column names:
177 |   setnames(metabo_info, "comp_id", "PhenotypeCompName")
178 |   setnames(metabo_pheno, "variable", "PhenotypeCompName")
179 |   setnames(metabo_SS, "phenotype", "PhenotypeCompName")
180 | 
181 |   # Filter info sheet and phenotype data to measurements
182 |   # with at least 1 variant passing the P-value threshold
183 |   metabo_info = metabo_info[PhenotypeCompName %chin% metabo_SS$PhenotypeCompName]
184 |   metabo_pheno = metabo_pheno[PhenotypeCompName %chin% metabo_SS$PhenotypeCompName]
185 | 
186 |   # write out variant effects file for each phenotype
187 |   foreach(phen_id = unique(metabo_SS$PhenotypeCompName)) %do% {
188 |     fwrite(metabo_SS[PhenotypeCompName == phen_id, .(rsid, chr, pos, effect_allele, other_allele, effect=beta, pval)],
189 |            sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))
190 |   }
191 | 
192 |   # Write out info sheet
193 |   if (nrow(metabo_info) > 0) {
194 |     fwrite(metabo_info[, .(PhenotypeCompName, MASS, RI, BIOCHEMICAL, SUPER_PATHWAY, SUB_PATHWAY)],
195 |            sep="\t", quote=FALSE, file=sprintf("%s/Metabolon_phenotype_info.txt", out_dir))
196 |   }
197 | 
198 |   # Free objects to free memory
199 |   rm(metabo_info, metabo_SS)
200 |   gc()
201 | } else {
202 |   cat("Metabolon HD4 GWAS summary statistics filtered by previous run.
Loading and filtering phenotype data...\n")
203 |   metabo_info = fread(sprintf("%s/Metabolon_phenotype_info.txt", out_dir))
204 |   metabo_pheno = fread(sprintf("%s/metabolon_metabolomics/traits.tsv", trait_dir))
205 |   metabo_pheno[, variable := gsub("^m", "M", variable)]
206 |   metabo_pheno = metabo_pheno[!is.na(value)]
207 |   setnames(metabo_pheno, "variable", "PhenotypeCompName")
208 |   metabo_pheno = metabo_pheno[PhenotypeCompName %chin% metabo_info$PhenotypeCompName]
209 |   rm(metabo_info); gc()
210 | }
211 | 
212 | # --------------------------------------------------------------------------------------
213 | # Olink data is slightly more complicated: we also need to load the cis-region for each
214 | # protein, and to average proteins measured on multiple platforms
215 | # ---------------------------------------------------------------------------------------
216 | 
217 | if (!file.exists(sprintf("%s/Olink_phenotype_info.txt", out_dir))) {
218 |   olink_SS = fread(sprintf("%s/olink_p_less_0.1.txt", processed_GWAS))
219 | 
220 |   # Load phenotype data and info
221 |   olink_info = fread(sprintf("%s/olink_proteins/trait_info.tsv", trait_dir))
222 |   olink_pheno = fread(sprintf("%s/olink_proteins/traits.tsv", trait_dir))
223 | 
224 |   # Make variable names match across data.tables
225 |   olink_SS[, varmatch := tolower(phenotype)]
226 |   olink_SS[, varmatch := gsub("\\.", "", varmatch)]
227 |   olink_SS[, varmatch := gsub("--", "", varmatch)]
228 |   olink_SS[, varmatch := gsub("^inf1", "inf", varmatch)]
229 |   olink_SS[varmatch %like% "inf_dner___q8nft8", varmatch := "inf_dner___q8nft8"] # trailing whitespace
230 |   olink_SS[varmatch == "inf_4ebp1___q13541", varmatch := "inf_ebp1___q13541"]
231 | 
232 |   # Olink proteins are unique by UniProt identifier, so we will use this as the unique
233 |   # phenotype ID. Some fixes are needed.
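  # For illustration (hypothetical values): a multi-protein UniProt entry such as
  # "P11111;P22222" becomes the identifier "P11111.P22222" after the gsub below,
  # while entries with an empty UniProt field fall back to their Olink_id
  # (e.g. "OID00195", which is also annotated manually further down).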
234 |   olink_info[, PhenotypeCompName := UniProt]
235 |   olink_info[, PhenotypeCompName := gsub(";", ".", PhenotypeCompName)]
236 |   olink_info[PhenotypeCompName == "", PhenotypeCompName := Olink_id]
237 | 
238 |   # Add unique identifier to olink_SS table so we have the full list of genome-wide P < trans_pthresh SNPs for
239 |   # each protein
240 |   olink_SS[olink_info, on = .(varmatch=variable), PhenotypeCompName := i.PhenotypeCompName]
241 | 
242 |   # Obtain the genomic location of each protein
243 |   loc = olink_info[, .(UniProt = strsplit(UniProt, "\\.")[[1]]), by=.(PhenotypeCompName)]
244 | 
245 |   system("mkdir -p $HOME/.cache/AnnotationHub", wait=TRUE) # so we don't get an interactive prompt below
246 |   ah = AnnotationHub()
247 |   orgdb = query(ah, c("OrgDb", "org.Hs.eg.db"))[[1]]
248 |   txdb <- query(ah, c("TxDB", "TxDb.Hsapiens.UCSC.hg19.knownGene"))[[1]]
249 | 
250 |   up2gene = select(orgdb, unique(loc$UniProt), c("UNIPROT", "GENENAME", "SYMBOL", "ENTREZID"), "UNIPROT")
251 |   setDT(up2gene)
252 |   up2gene[, ENTREZID := as.integer(ENTREZID)]
253 |   up2gene = up2gene[!is.na(ENTREZID)]
254 |   gene2loc = as.data.table(grch37)
255 |   gene2loc = gene2loc[, .(entrez, chr, start)]
256 |   up2loc = up2gene[gene2loc, on = .(ENTREZID=entrez), nomatch=0]
257 |   up2loc = up2loc[!grepl("_", chr)]
258 |   loc[up2loc, on = .(UniProt=UNIPROT), c("chr", "start") := .(chr, start)]
259 | 
260 |   # Manually annotate a few missing ones by looking up the UniProt entry
261 |   # and cross-referencing with NCBI Gene
262 |   loc[UniProt == "Q8WWJ7", c("chr", "start") := .(11, 60739113)]
263 |   loc[UniProt == "Q8NF90", c("chr", "start") := .(4, 81187742)]
264 |   loc[UniProt == "P16284", c("chr", "start") := .(17, 62396775)]
265 |   loc = rbind(loc, data.table(PhenotypeCompName = "OID00195", UniProt=NA, chr=1, start=11917521))
266 | 
267 |   # Add to olink info table
268 |   olink_info[loc, on = .(PhenotypeCompName), c("chr", "TSS") := .(paste(i.chr, collapse="|"), paste(i.start, collapse="|")), by=.EACHI]
269 | 
270 |   # Now load summary stats for all SNPs passing P < trans_pthresh genome-wide and
271 |   # cis-SNPs at P < cis_pthresh for each protein
272 |   parallelise_foreach()
273 |   foreach(phen_id = unique(olink_SS$PhenotypeCompName), .combine=c) %dopar% {
274 |     if (file.exists(sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))) {
275 |       cat("Olink protein", phen_id, "already filtered at P <", trans_pthresh, "(trans),", cis_pthresh, "(cis) by previous run, skipping.\n")
276 |       return(NULL)
277 |     }
278 |     gc()
279 |     phen_ss = foreach(var_id = olink_SS[PhenotypeCompName == phen_id, unique(phenotype)], .combine=rbind) %do% {
280 |       # Load full summary stats
281 |       varmatch = olink_SS[PhenotypeCompName == phen_id & phenotype == var_id, unique(varmatch)]
282 |       panel = olink_info[variable == varmatch, panel]
283 |       if (panel == "neu") {
284 |         this_ss = fread(sprintf("%s/%s_olink_neuro_full_set_autosomal_imputed_all_chrs_combined.snptest.out.gz", olink_neu_GWAS, gsub("^neu_", "", var_id)), tmpdir=tmpdir)
285 |       } else {
286 |         this_ss = fread(sprintf("%s/INTERVAL_%s_chr_merged.gz", olink_GWAS, var_id), tmpdir=tmpdir)
287 |       }
288 |       this_ss = this_ss[, .(chr=chromosome, pos=position, effect_allele=alleleB, other_allele=alleleA, effect=frequentist_add_beta_1, pval=frequentist_add_pvalue)]
289 |       this_ss = this_ss[pval < pmax(trans_pthresh, cis_pthresh)]
290 | 
291 |       # Filter to varset snps
292 |       if (nrow(this_ss) > 0) {
293 |         this_ss = rbind(
294 |
this_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
295 |           this_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
296 |         )
297 |         this_ss = this_ss[!is.na(effect)]
298 |       }
299 |       return(this_ss)
300 |     }
301 | 
302 |     # R, do you even garbage collect? WTF.
303 |     if(exists("this_ss")) { rm(this_ss) }
304 |     gc()
305 | 
306 |     # Remove variants that did not pass the p-value threshold for all panels for each protein
307 |     npan = olink_info[PhenotypeCompName == phen_id, length(unique(phenotype))]
308 |     varn = phen_ss[,.N, by=.(chr, pos, effect_allele, other_allele)]
309 |     pass = varn[N == npan]
310 |     pass[,N := NULL]
311 |     phen_ss = phen_ss[pass, on = .(chr, pos, effect_allele, other_allele)]
312 | 
313 |     # average across measurements
314 |     phen_ss = phen_ss[, .(effect=mean(effect), pval=mean(pval)), by = .(chr, pos, effect_allele, other_allele)]
315 | 
316 |     # Get SNPs at genome-wide P < trans_pthresh
317 |     gw = phen_ss[pval < trans_pthresh]
318 | 
319 |     # Get cis-SNPs with P < cis_pthresh
320 |     chrs = strsplit(olink_info[PhenotypeCompName == phen_id, unique(chr)], "\\|")[[1]]
321 |     starts = strsplit(olink_info[PhenotypeCompName == phen_id, unique(TSS)], "\\|")[[1]]
322 |     cis = foreach(idx = seq_along(chrs), .combine=rbind) %do% {
323 |       window = data.table(chr=as.integer(chrs[idx]), TSS=as.integer(starts[idx]))
324 |       window[, start := pmax(0, TSS - 1e6)]
325 |       window[, end := TSS + 1e6]
326 |       window[complex_ld, on = .(chr=region_chr, start<=region_end, start>=region_start), start := region_start] # pull the window start down to cover an overlapping high-complexity region
327 |       window[complex_ld, on = .(chr=region_chr, end<=region_end, end>=region_start), end := region_end] # likewise push the window end up (e.g. a TSS at chr6:30Mb gives [29Mb, 31Mb], extended to [25Mb, 33.5Mb] across the MHC)
328 |       phen_ss[window, on = .(chr, pos>=start, pos<=end), .(chr, pos=x.pos, effect_allele, other_allele, effect, pval)]
329 |     }
330 |     cis = cis[pval < cis_pthresh]
331 | 
332 |     # remove full phen_ss object to free up memory
333 |     rm(phen_ss)
334 |     gc()
335 | 
336 |     phen_ss = unique(rbind(gw, cis))
337 | 
338 |     # If any QTLs, proceed
339 |     if (nrow(phen_ss) > 0) {
340 | 
341 |       phen_ss = phen_ss[, .SD[which.min(pval)], by=.(chr, pos)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
342 |       phen_ss[varset, on = .(chr, pos), rsid := id] # add rsid
343 |       phen_ss = phen_ss[, .(rsid, chr, pos, effect_allele, other_allele, effect, pval)][order(pos)][order(chr)]
344 | 
345 |       # write out
346 |       fwrite(phen_ss, sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))
347 | 
348 |       # Free up memory and garbage collect
349 |       rm(phen_ss)
350 |       gc()
351 | 
352 |       return(phen_id)
353 |     }
354 |   }
355 |   parallelise_fread()
356 |   gc()
357 | 
358 |   # Filter phenotype data
359 |   olink_pheno = olink_pheno[!is.na(value)]
360 | 
361 |   # Average phenotype data across different platform measures:
362 |   olink_pheno[olink_info, on = .(variable), PhenotypeCompName := PhenotypeCompName]
363 |   olink_pheno = olink_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]
364 | 
365 |   # Filter info sheet and phenotype data to measurements
366 |   # with at least 1 variant passing the P-value threshold
367 |   olink_info = olink_info[PhenotypeCompName %chin% unique(olink_SS$PhenotypeCompName)]
368 |   olink_pheno = olink_pheno[PhenotypeCompName %chin% unique(olink_SS$PhenotypeCompName)]
369 | 
370 |   # Make sure info table has one entry per phenotype and write out
371 |   if (nrow(olink_info) > 0) {
372 |     olink_info = olink_info[, .(panels = paste(panel, collapse=","), protein = paste(unique(protein), collapse="/")),
373 |                             by = .(PhenotypeCompName, UniProt, chr, TSS)]
374 |     fwrite(olink_info[, .(PhenotypeCompName, UniProt, protein, chr, TSS, panels)],
375 |            sep="\t", quote=FALSE, file=sprintf("%s/Olink_phenotype_info.txt", out_dir))
376 |   }
377 | 
378 |   # Free objects to free memory
379 |   rm(olink_info, olink_SS)
380 |   gc()
381 | } else {
382 |   cat("Olink protein GWAS summary statistics filtered by previous run. Loading and filtering phenotype data...\n")
383 |   olink_info = fread(sprintf("%s/Olink_phenotype_info.txt", out_dir))
384 |   olink_info_full = fread(sprintf("%s/olink_proteins/trait_info.tsv", trait_dir))
385 |   olink_pheno = fread(sprintf("%s/olink_proteins/traits.tsv", trait_dir))
386 |   olink_info = olink_info[, .(protein=strsplit(protein, "/")[[1]], panel = strsplit(panels, ",")[[1]]), by=.(PhenotypeCompName, UniProt)]
387 |   olink_info[olink_info_full, on = .(UniProt, protein), variable := i.variable]
388 |   olink_pheno = olink_pheno[!is.na(value)]
389 |   olink_pheno[olink_info, on = .(variable), PhenotypeCompName := PhenotypeCompName]
390 |   olink_pheno = olink_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]
391 |   olink_pheno = olink_pheno[PhenotypeCompName %chin% olink_info$PhenotypeCompName]
392 |   rm(olink_info); gc()
393 | }
394 | 
395 | # --------------------------------------------------------------------------------------
396 | # Do the same for SomaLogic data. Only this time, we don't have a list of P < 0.01
397 | # GWAS summary stats to work from.
398 | # --------------------------------------------------------------------------------------
399 | 
400 | if (!file.exists(sprintf("%s/Somalogic_phenotype_info.txt", out_dir))) {
401 |   # Load phenotype data and info
402 |   soma_info = fread(sprintf("%s/somalogic_proteins/trait_info.tsv", trait_dir))
403 |   soma_pheno = fread(sprintf("%s/somalogic_proteins/traits.tsv", trait_dir))
404 | 
405 |   # Remove bad aptamers and make unique protein names
406 |   soma_info = soma_info[Type == "Protein"]
407 |   soma_info = soma_info[, .(SeqId, SOMAMER_ID, variable, Target, TargetFullName, UniProt=UniProt.Id.Current.at.Uniprot, Gene=Gene.Name, chr, TSS=start)]
408 | 
409 |   soma_info[, PhenotypeCompName := gsub("\\..*", "", SOMAMER_ID)]
410 | 
411 |   # Some are not unique with respect to different Full Target Names, so we need to fix these:
412 |   soma_info[PhenotypeCompName == "VEGFA", PhenotypeCompName := Target]
413 |   soma_info[PhenotypeCompName == "PLG", PhenotypeCompName := Target]
414 |   soma_info[PhenotypeCompName == "C3", PhenotypeCompName := Target]
415 |   soma_info[PhenotypeCompName == "C4A", PhenotypeCompName := Target]
416 |   soma_info[Target == "C5b, 6 Complex", PhenotypeCompName := "C5b"]
417 |   soma_info[Target == "MPIF-1", PhenotypeCompName := "MPIF.1"]
418 |   soma_info[Target == "Ck-b-8-1", PhenotypeCompName := "Ck.b.8.1"]
419 |   soma_info[PhenotypeCompName == "CGA", PhenotypeCompName := Target]
420 |   soma_info[PhenotypeCompName == "Luteinizing hormone", PhenotypeCompName := "CGA.LHB"]
421 |   soma_info[PhenotypeCompName == "Glycoprotein hormones a-chain", PhenotypeCompName := "CGA"]
422 |   soma_info[Target == "SCGF-beta", PhenotypeCompName := "SCGFb"]
423 |   soma_info[Target == "SCGF-alpha", PhenotypeCompName := "SCGFa"]
424 |   soma_info[Target == "Coagulation Factor Xa", PhenotypeCompName := "F10a"]
425 |   soma_info[Target == "FN1.3", PhenotypeCompName := "FN1.3"]
426 |   soma_info[Target == "Haptoglobin, Mixed Type", PhenotypeCompName := "HPm"]
427 |   soma_info[PhenotypeCompName == "LRP1", PhenotypeCompName := Target]
428 |   soma_info[PhenotypeCompName == "LYN", PhenotypeCompName := Target]
429 |   soma_info[Target == "PILRA isoform FDF03-M14", PhenotypeCompName := "PILRA.M14"]
430 |   soma_info[Target == "PILRA isoform FDF03-deltaTM", PhenotypeCompName := "PILRA.dTM"]
431 |   soma_info[Target == "Ubiquitin+1", PhenotypeCompName := "RPS27Aplus1"]
432 |   soma_info[Target == "alpha-1-antichymotrypsin complex", PhenotypeCompName := "SERPINA3cmplx"]
433 |   soma_info[Target == "14-3-3 protein beta/alpha", PhenotypeCompName := "14.3.3.pba"]
434 |   soma_info[Target == "14-3-3", PhenotypeCompName := "14.3.3"]
435 |   soma_info[PhenotypeCompName == "C5", PhenotypeCompName := Target]
436 |   soma_info[PhenotypeCompName == "F2", PhenotypeCompName := Target]
437 |   soma_info[PhenotypeCompName == "EGFR", PhenotypeCompName := Target]
438 |   soma_info[PhenotypeCompName == "FN1", PhenotypeCompName := Target]
439 |   soma_info[PhenotypeCompName == "NRXN1", PhenotypeCompName := Target]
440 |   soma_info[PhenotypeCompName == "ADCYAP1", PhenotypeCompName := gsub("-", ".", Target)]
441 |   soma_info[PhenotypeCompName == "CKB", PhenotypeCompName := gsub("-", ".", Target)]
442 |   soma_info[PhenotypeCompName == "EGFR", PhenotypeCompName := gsub("-", ".", Target)]
443 |   soma_info[PhenotypeCompName == "FGA", PhenotypeCompName := gsub("-", ".", Target)]
444 |   soma_info[PhenotypeCompName == "FGF8", PhenotypeCompName := gsub("-", ".", Target)]
445 |   soma_info[PhenotypeCompName == "PPBP", PhenotypeCompName := gsub("-", ".", Target)]
446 |   soma_info[Target ==
"CLF-1/CLC Complex", PhenotypeCompName := "CLF1.CLC.complex"] 447 | soma_info[Target == "CK2-A1:B", PhenotypeCompName := "CK2.A1.B"] 448 | soma_info[Target == "Coagulation Factor IX", PhenotypeCompName := "CF.IX"] 449 | soma_info[Target == "Coagulation Factor IXab", PhenotypeCompName := "CF.IXab"] 450 | soma_info[Target == "GDF-11/8", PhenotypeCompName := "GDF11.8"] 451 | soma_info[Target == "IgG2, Kappa", PhenotypeCompName := "IgG2"] 452 | soma_info[Target == "IgG4, Kappa", PhenotypeCompName := "IgG4"] 453 | soma_info[Target == "N-terminal pro-BNP", PhenotypeCompName := "NPPB.Nt"] 454 | soma_info[Target == "Activated Protein C", PhenotypeCompName := "PROC.activated"] 455 | soma_info[Target == "TLR4:MD-2 complex", PhenotypeCompName := "TLR4.MD2.complex"] 456 | soma_info[Target == "Activin A", PhenotypeCompName := "INHBA.A"] 457 | soma_info[Target == "Activin AB", PhenotypeCompName := "INHBA.AB"] 458 | soma_info[Target == "Lymphotoxin a1/b2", PhenotypeCompName := "LTA.A1.B2"] 459 | soma_info[Target == "Lymphotoxin a2/b1", PhenotypeCompName := "LTA.A2.B1"] 460 | soma_info[PhenotypeCompName == "POMC", PhenotypeCompName := gsub("-", ".", TargetFullName)] 461 | soma_info[Target == "SEM6C", Target := "SEMA6C"] 462 | soma_info[PhenotypeCompName == "14.3.3", UniProt := "P61981|Q04917"] 463 | 464 | # Actually we want to train models for each SeqId apparently... 465 | setnames(soma_info, "PhenotypeCompName", "UniqueShortName") 466 | soma_info[, PhenotypeCompName := paste0("SeqId_", gsub("-", "_", SeqId))] 467 | 468 | # Now load summary stats for all SNPs passing P < trans_pthresh genome wide and 469 | # all cis-snps for each protein. Note this code handles averaging across multiple 470 | # aptamers if we want to return to predicting protein levels. 471 | parallelise_foreach() 472 | has_qtls = foreach(phen_id = unique(soma_info$PhenotypeCompName), .combine=c) %dopar% { 473 | if (file.exists(sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))) { 474 | cat("Somalogic protein", phen_id, "already filtered at P <", trans_pthresh, "(trans),", cis_pthresh, "(cis) by previous run, skipping.\n") 475 | return(phen_id) 476 | } 477 | gc() 478 | # Load all summary stats for all aptamers targetting this protein, filtering to SNPs 479 | # in the LD-thinned variant set. 
480 |     phen_ss = foreach(var_id = soma_info[PhenotypeCompName == phen_id, unique(SOMAMER_ID)], .combine=rbind) %do% {
481 |       this_ss = foreach(chr_id = 1:22, .combine=rbind) %do% {
482 |         chr_ss = fread(sprintf("%s/%s/%s_chrom_%s_meta_1.tbl.gz", soma_GWAS, var_id, var_id, chr_id), tmpdir=tmpdir)
483 |         chr_ss = chr_ss[, .(PhenotypeCompName=phen_id, SOMAMER_ID=var_id, chr=chromosome, pos=position,
484 |                             effect_allele=toupper(Allele1), other_allele=toupper(Allele2), effect=Effect,
485 |                             pval=10^`log(P)`)]
486 |         chr_ss = chr_ss[pval < pmax(trans_pthresh, cis_pthresh)]
487 |         if (nrow(chr_ss) > 0) {
488 |           chr_ss = rbind(
489 |             chr_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
490 |             chr_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
491 |           )
492 |         }
493 |         return(chr_ss)
494 |       }
495 |       this_ss[!is.na(effect)]
496 |     }
497 | 
498 |     # Remove variants that did not pass the p-value threshold for all measurements per protein
499 |     napt = soma_info[PhenotypeCompName == phen_id, length(unique(SOMAMER_ID))]
500 |     varn = phen_ss[,.N, by=.(chr, pos, effect_allele,other_allele)]
501 |     pass = varn[N == napt]
502 |     pass[,N := NULL]
503 |     phen_ss = phen_ss[pass, on = .(chr, pos, effect_allele,other_allele)]
504 | 
505 |     # average across measurements
506 |     phen_ss = phen_ss[, .(effect=mean(effect), pval=mean(pval)), by = .(chr, pos, effect_allele, other_allele)]
507 | 
508 |     # R, do you even garbage collect? WTF.
509 |     if(exists("this_ss")) { rm(this_ss) }
510 |     if(exists("chr_ss")) { rm(chr_ss) }
511 |     gc()
512 | 
513 |     # Identify and extract all SNPs with P < trans_pthresh for all aptamers
514 |     gw_snps = unique(phen_ss[pval < trans_pthresh, .(chr, pos)])
515 |     gw = phen_ss[gw_snps, on = .(chr, pos)]
516 | 
517 |     # Identify and extract all SNPs in cis with any gene with P < cis_pthresh
518 |     chrs = strsplit(soma_info[PhenotypeCompName == phen_id, unique(chr)], "\\|")[[1]]
519 |     starts = strsplit(soma_info[PhenotypeCompName == phen_id, unique(TSS)], "\\|")[[1]]
520 |     cis = foreach(idx = seq_along(chrs), .combine=rbind) %do% {
521 |       window = data.table(chr=as.integer(chrs[idx]), TSS=as.integer(starts[idx]))
522 |       window[, start := pmax(0, TSS - 1e6)]
523 |       window[, end := TSS + 1e6]
524 |       window[complex_ld, on = .(chr=region_chr, start<=region_end, start>=region_start), start := region_start] # pull the window start down to cover an overlapping high-complexity region
525 |       window[complex_ld, on = .(chr=region_chr, end<=region_end, end>=region_start), end := region_end] # likewise push the window end up to the region end
526 |       chr_ss = phen_ss[window, on = .(chr, pos>=start, pos<=end), .(chr, pos=x.pos, effect_allele, other_allele, effect, pval)]
527 |       chr_ss[pval < cis_pthresh]
528 |     }
529 |     rm(phen_ss)
530 |     gc()
531 | 
532 |     phen_ss = unique(rbind(gw, cis))
533 | 
534 |     if(nrow(phen_ss) > 0) {
535 |       phen_ss = phen_ss[, .SD[which.min(pval)], by=.(chr, pos)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
    phen_ss[varset, on = .(chr, pos), rsid := id]  # add rsid
    phen_ss = phen_ss[, .(rsid, chr, pos, effect_allele, other_allele, effect, pval)][order(pos)][order(chr)]

    # write out
    fwrite(phen_ss, sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))

    # Free up some memory
    rm(phen_ss)
    gc()

    return(phen_id)
  }
}
parallelise_fread()

# Filter phenotype data
soma_pheno = soma_pheno[variable %chin% unique(soma_info$variable)]
soma_pheno = soma_pheno[!is.na(value)]

# Average phenotype data across different platform measures:
soma_pheno[soma_info, on = .(variable), PhenotypeCompName := i.PhenotypeCompName]
soma_pheno = soma_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]

# Filter info sheet and phenotype data to measurements
# with at least 1 variant passing the P-value threshold
soma_info = soma_info[PhenotypeCompName %chin% has_qtls]
soma_pheno = soma_pheno[PhenotypeCompName %chin% has_qtls]

if (nrow(soma_info) > 0) {
  soma_info = soma_info[, .(SeqId=paste(SeqId, collapse=","), SOMAMER_ID=paste(SOMAMER_ID, collapse=",")),
                        by = .(PhenotypeCompName, UniqueShortName, Target, TargetFullName, UniProt, Gene, chr, TSS)]
  fwrite(soma_info, sep="\t", quote=FALSE, file=sprintf("%s/Somalogic_phenotype_info.txt", out_dir))
}

# Adjust phenotype levels for measurement batch:
if (nrow(soma_pheno) > 0) {
  batch = fread(sprintf("%s/somalogic_proteins/covariates.tsv", trait_dir))
  soma_pheno = soma_pheno[batch, on = .(IID), nomatch=0]
  soma_pheno[, value := lm(value ~ factor(batch))$residuals, by = .(PhenotypeCompName)]
  soma_pheno[, batch := NULL]
}
} else {
  cat("Somalogic protein GWAS summary statistics filtered by a previous run.\nLoading and filtering phenotype data...\n")
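  # Restart path: the per-protein variant-effect files were written by an earlier run,
  # so rebuild the phenotype table from the saved info sheet rather than re-filtering
  # the GWAS summary statistics.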
  soma_info = fread(sprintf("%s/Somalogic_phenotype_info.txt", out_dir))
  soma_pheno = fread(sprintf("%s/somalogic_proteins/traits.tsv", trait_dir))
  batch = fread(sprintf("%s/somalogic_proteins/covariates.tsv", trait_dir))
  soma_pheno = soma_pheno[variable %chin% unique(soma_info$variable)]
  soma_pheno = soma_pheno[!is.na(value)]
  soma_pheno = soma_pheno[soma_info[, .(variable, PhenotypeCompName)], on = .(variable), nomatch=0]
  soma_pheno = soma_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]
  gc()  # keep soma_info: it is needed below to map aptamers to proteins
  soma_pheno = soma_pheno[batch, on = .(IID), nomatch=0]
  soma_pheno[, value := lm(value ~ factor(batch))$residuals, by = .(PhenotypeCompName)]
  soma_pheno[, batch := NULL]
}

# --------------------------------------------------------------------------------------
# Combine phenotype data into one table and write out
# --------------------------------------------------------------------------------------

pcs = fread("/rds/project/jmmh2/rds-jmmh2-post_qc_data/interval/reference_files/genetic/reference_files/annot_INT_50PCs_pcs.txt")

pheno = rbind(
  Nightingale = nmr_pheno,
  Metabolon = metabo_pheno,
  Olink = olink_pheno,
  Somalogic = soma_pheno,
  idcol = "platform")

# Adjust for the first 10 genotype PCs
pheno = pheno[pcs, on = .(IID=ID), nomatch=0]
pheno[, value := lm(value ~ PC_1 + PC_2 + PC_3 + PC_4 + PC_5 +
                            PC_6 + PC_7 + PC_8 + PC_9 + PC_10)$residuals,
      by = .(PhenotypeCompName)]

if (nrow(pheno) > 0) {
  fwrite(pheno[, .(PhenotypeCompName, platform, IID, value)],
         sep="\t", quote=FALSE, file=sprintf("%s/phenotypes.txt", out_dir))
}

# Phenotype data adjusted for technical covariates only.
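# This builds a parallel output ('phenotypes_no_agesex.txt') from SomaLogic levels that
# were pre-adjusted for technical covariates but not age or sex: aptamer-level values
# are mapped to proteins via the collapsed SeqId/SOMAMER_ID columns, aliquot IDs are
# mapped to genotyping IDs, and values are residualised on batch and the first 10
# genotype PCs as above.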
soma_pheno_tech = fread(sprintf("%s/soma4000_gwasQC_adj_technical.txt", trait_dir2))
soma_pheno_tech = melt(soma_pheno_tech, id.vars="aliquot_id", variable.name="SOMAMER_ID")
apt2prot = soma_info[, .(SeqId=strsplit(SeqId, ",")[[1]], SOMAMER_ID=strsplit(SOMAMER_ID, ",")[[1]]), by=PhenotypeCompName]
soma_pheno_tech = soma_pheno_tech[apt2prot, on = .(SOMAMER_ID)]
idmap = fread("/rds/project/jmmh2/rds-jmmh2-projects/polygenic/general/INTERVAL_data/1074/omicsMap.csv")
soma_pheno_tech[idmap, on = .(aliquot_id=soma4000_gwasQC_bl), IID := i.Affymetrix_gwasQC_bl]
soma_pheno_tech[, platform := "Somalogic"]
soma_pheno_tech = soma_pheno_tech[, .(value=mean(value)), by = .(PhenotypeCompName, platform, IID)]
soma_pheno_tech = soma_pheno_tech[batch, on = .(IID), nomatch=0]
soma_pheno_tech[, value := lm(value ~ factor(batch))$residuals, by = .(PhenotypeCompName)]
soma_pheno_tech = soma_pheno_tech[pcs, on = .(IID=ID), nomatch=0]
soma_pheno_tech[, value := lm(value ~ PC_1 + PC_2 + PC_3 + PC_4 + PC_5 + PC_6 + PC_7 + PC_8 + PC_9 + PC_10)$residuals, by = .(PhenotypeCompName)]
soma_pheno_tech = soma_pheno_tech[PhenotypeCompName %chin% pheno[platform == "Somalogic", PhenotypeCompName]]

if (nrow(soma_pheno_tech) > 0) {
  fwrite(soma_pheno_tech[, .(PhenotypeCompName, platform, IID, value)], sep="\t", quote=FALSE, file=sprintf("%s/phenotypes_no_agesex.txt", out_dir))

  # write out age and sex information
  agesex = fread(sprintf("%s/phenotypes.tsv", trait_dir))
  agesex = agesex[, .(IID, sex=sexPulse, age=agePulse)]
  agesex = agesex[IID %in% unique(soma_pheno_tech$IID)]
  fwrite(agesex, sep="\t", quote=FALSE, file=sprintf("%s/agesex.txt", out_dir))
}


# remove leftover temporary directory
system(sprintf("rm -rf %s", tmpdir), wait=TRUE)

--------------------------------------------------------------------------------