├── 06_all_omics_UKB_phecode_assoc_test ├── 01_all_omics_PGS_UKB_disease_assoc_one_platform.sh ├── 01_all_omics_PGS_UKB_disease_assoc.sh └── 01_all_omics_PGS_UKB_disease_assoc.py ├── GCTB ├── 04a_merge_shrunk_all_chrs │ ├── gctb_merge_shrunk_bins_all_chrs.sh │ ├── gctb_merge_shrunk_sparse_bins_all_chrs.sh │ └── gctb_gen_mldmlist_shrunk_all_chrs.sh ├── 04_merge_shrunk_sparse_all_chrs │ ├── gctb_merge_shrunk_sparse_bins_all_chrs.sh │ └── gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh ├── 02a_gen_shrunk_corr_data_mcpu │ ├── gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh │ ├── gctb_merge_shrunk_bins_by_chr.sh │ ├── gctb_gen_mldmlist_per_chr.sh │ └── gctb_gen_shrunk_corr_matrix_mcpu.sh ├── logs │ └── run_SBayesS_shrunk_M48434_6216077_4294967294.e ├── 03_convert_to_sparse │ └── gctb_convert_to_sparse_shrunk_corr_matrix.sh ├── README.md ├── 02_gen_shrunk_corr_data │ └── gctb_gen_shrunk_corr_matrix.sh ├── 06_run_SbayesS.sh ├── 06_run_SbayesS │ └── run_SbayesS.sh ├── 04a_merge_shrunk_all_chrs_job.sh ├── 06a_run_SbayesS_with_shrunk_corr │ └── run_SbayesS_with_shrunk.sh ├── 04_merge_shrunk_sparse_all_chrs_job.sh ├── 06a_run_SbayesS_with_shrunk_corr.sh ├── 02a_merge_shrunk_bins_job.sh ├── 03_to_shrunk_sparse_job.sh ├── 02a_gen_shrunk_corr_data_mcpu_job.sh ├── 01_gen_genetic_data_hapmap3_variants │ └── gen_INTERVAL_hapmap3_vars_genetic_data.sh ├── 02_gen_shrunk_corr_data_job.sh └── 01_gen_genetic_data_hapmap3_variants_job.sh ├── 01_convert_bgen ├── 04_fix_var_ids.R ├── 01_convert_bgen.sh ├── 03_filter_duplicates.sh └── 02_flag_duplicates.R ├── 04_extract_QTLs ├── 02_helpers │ └── reformat_dosages.R ├── 02_extract_QTL_dosages.sh └── 01_extract_QTLs.R ├── 05_genetic_score_training ├── Traditional_GRS.py ├── BayesianRidge.py └── 01_run_omics_pgs_training with_br.py ├── 03_collate_QTLs.job ├── 02_ldthin ├── 02_ldthin.sh └── 01_identify_snps.R ├── 06_all_omics_ukb_phecode_disease_assoc_test.job ├── 05_genetic_score_training.job ├── 02_ldthin.job ├── 04_extract_QTLs.job ├── 01_convert_bgen.job ├── README.md ├── 03_collate_QTLs └── 01_collate_QTLs.R └── LDpred2 └── LDpred2_auto.R /06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc_one_platform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | module load miniconda3 4 | source activate ml 5 | 6 | python /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/scripts_gene_expressions/08_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.py ${1} ${2} 7 | -------------------------------------------------------------------------------- /06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | module load miniconda3 4 | source activate ml 5 | 6 | i=$SLURM_ARRAY_TASK_ID 7 | 8 | python /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/scripts_gene_expressions/08_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.py ${1} ${i} 9 | -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs/gctb_merge_shrunk_bins_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --mldm 
${PWD}/interval_shrunk_only_chr_all.mldmlist --make-shrunk-ldm --out ${PWD}/interval_shrunk_chr_all -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs/gctb_merge_shrunk_sparse_bins_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --mldm ${PWD}/interval_shrunk_chr_all.mldmlist --make-sparse-ldm --chisq 0 --out ${PWD}/interval_shrunk_chr_all -------------------------------------------------------------------------------- /GCTB/04_merge_shrunk_sparse_all_chrs/gctb_merge_shrunk_sparse_bins_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --mldm ${PWD}/interval_shrunk_chr_all.mldmlist --make-sparse-ldm --chisq 0 --out ${PWD}/interval_shrunk_chr_all -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs/gctb_gen_mldmlist_shrunk_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 5 | 6 | out=interval_shrunk_chr 7 | 8 | 9 | for i in $( seq 1 22 ) 10 | do 11 | 12 | echo "${PWD}/${out}${i}.ldm.shrunk" >> "${PWD}/interval_shrunk_only_chr_all.mldmlist" 13 | 14 | done -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 5 | 6 | out=interval_shrunk_chr 7 | 8 | 9 | for i in $( seq 1 22 ) 10 | do 11 | 12 | echo "${PWD}/${out}${i}.ldm.sparse" >> "${PWD}/interval_shrunk_chr_all.mldmlist" 13 | 14 | done -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_merge_shrunk_bins_by_chr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | out=interval_shrunk_chr${SLURM_ARRAY_TASK_ID} 8 | 9 | 10 | 11 | $gctb --mldm ${PWD}/${out}.mldmlist --make-shrunk-ldm --out ${PWD}/${out} -------------------------------------------------------------------------------- /GCTB/04_merge_shrunk_sparse_all_chrs/gctb_gen_mldmlist_shrunk_sparse_all_chrs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 5 | 6 | out=interval_shrunk_chr 7 | 8 | 9 | for i in $( seq 1 22 ) 10 | do 11 | 12 | echo "${PWD}/${out}${i}.ldm.sparse" >> "${PWD}/interval_shrunk_chr_all.mldmlist" 13 | 14 | done 
-------------------------------------------------------------------------------- /01_convert_bgen/04_fix_var_ids.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | # Remove extra row identifier tacked onto the variant IDs 4 | for (chr_id in 1:22) { 5 | pvar = fread(sprintf("geno_files/genotype_data/impute_%s_interval_dedup.pvar", chr_id)) 6 | pvar[, ID := gsub(":[0-9]+?$", "", ID)] 7 | fwrite(pvar, sep="\t", quote=FALSE, file=sprintf("geno_files/genotype_data/impute_%s_interval_dedup.pvar", chr_id)) 8 | } 9 | 10 | -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_gen_mldmlist_per_chr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | k=5000 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | out=interval_shrunk_chr${1} 9 | 10 | 11 | for i in $( seq 1 ${2} ) 12 | do 13 | 14 | echo "${PWD}/${out}.snp$((k*(i-1)+1))-$((k*i)).ldm.shrunk" >> "${PWD}/${out}.mldmlist" 15 | 16 | done -------------------------------------------------------------------------------- /01_convert_bgen/01_convert_bgen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ref_dir=$HOME/rds/rds-jmmh2-post_qc_data/interval/imputed/uk10k_1000g_b37/imputed 4 | out_dir=geno_files/genotype_data/ 5 | mkdir -p $out_dir 6 | 7 | chr=$SLURM_ARRAY_TASK_ID 8 | 9 | plink2 --bgen $ref_dir/impute_${chr}_interval.bgen \ 10 | --sample $ref_dir/interval.samples \ 11 | --threads $SLURM_CPUS_ON_NODE \ 12 | --memory $SLURM_MEM_PER_NODE \ 13 | --silent \ 14 | --make-pgen \ 15 | --out $out_dir/impute_${chr}_interval 16 | 17 | -------------------------------------------------------------------------------- /GCTB/logs/run_SBayesS_shrunk_M48434_6216077_4294967294.e: -------------------------------------------------------------------------------- 1 | /var/spool/slurm/slurmd/job6216077/slurm_script: line 20: 77468 Segmentation fault $gctb --sbayes S --ldm ${PWD}/interval_shrunk_chr_all.ldm.shrunk --gwas-summary /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_GWAS_summ_stats/Metabolon/gwas_filteredCleaned_${1}.txt --pi 0.01 --hsq 0.5 --num-chains 4 --chain-length 25000 --burn-in 2000 --seed 12345 --thread 18 --no-mcmc-bin --out-freq 10 --out /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/results/GCTB/${1} 2 | -------------------------------------------------------------------------------- /GCTB/03_convert_to_sparse/gctb_convert_to_sparse_shrunk_corr_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 5 | 6 | shrunk_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${SLURM_ARRAY_TASK_ID}.ldm.shrunk 7 | 8 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${SLURM_ARRAY_TASK_ID} 9 | 10 | $gctb --ldm $shrunk_file \ 11 | --make-sparse-ldm \ 12 | --chisq 0 \ 13 | --out $output_file 14 | 15 | 16 | -------------------------------------------------------------------------------- /GCTB/README.md:
-------------------------------------------------------------------------------- 1 | # Code for using SBayesS to estimate heritability of omics traits 2 | 3 | - Software Version 4 | - GCTB 2.02 [![Website shields.io](https://img.shields.io/website-up-down-green-red/http/shields.io.svg)](https://cnsgenomics.com/software/gctb/#Download) 5 | 6 | - Variants used to construct the correlation matrix 7 | - HapMap3 variant set 8 | - excluding variants with HWE test P value < 1 × 10−6 9 | - excluding variants with missing genotype rate > 0.05 10 | - excluding variants with imputation INFO score < 0.3 11 | - retaining variants with MAF > 0.01 12 | 13 | - Samples used to construct the correlation matrix 14 | - All QCed INTERVAL samples, excluding those used in the withheld-set validation 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /04_extract_QTLs/02_helpers/reformat_dosages.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | args = commandArgs(trailingOnly=TRUE) 4 | 5 | dt = fread(sprintf("%s/%s_dosages.txt", args[1], args[2])) 6 | varID = dt[,.(varID=IID)] # should take first IID column encountered 7 | # drop these columns, will occur 1 time per each chromosome file pasted 8 | while ("FID" %in% names(dt)) { 9 | dt[, c('FID', 'IID', 'PAT', 'MAT', 'SEX', 'PHENOTYPE') := NULL]; 10 | } 11 | # add back the varID column as the first column 12 | dt = cbind(varID, dt) 13 | # drop extracted allele from variant name in header 14 | setnames(dt, gsub('_.*', '', names(dt))); 15 | # write out 16 | fwrite(dt, sep='\t', quote=FALSE, compress="gzip", file=sprintf("%s/%s_dosages.txt.gz", args[1], args[2])) 17 | -------------------------------------------------------------------------------- /05_genetic_score_training/Traditional_GRS.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from sklearn.metrics import r2_score,explained_variance_score 4 | from scipy.stats import pearsonr 5 | import pandas as pd 6 | from scipy import stats 7 | from scipy.stats import spearmanr 8 | 9 | 10 | def get_beta_vec_by_vars_ids(beta_file,vars): 11 | df = pd.read_csv(beta_file,sep='\t',index_col=0) 12 | betas = df.loc[vars]['effect'] 13 | return np.array(betas) 14 | 15 | 16 | def traditional_GRS_selected_vars(beta_file,X,y,vars): 17 | beta_vec = get_beta_vec_by_vars_ids(beta_file,vars) 18 | y_pred = X.dot(beta_vec) 19 | y_pred = stats.zscore(y_pred) 20 | return pearsonr(y, y_pred)[0],r2_score(y, y_pred),explained_variance_score(y,y_pred),spearmanr(y, y_pred)[0] 21 | 22 | -------------------------------------------------------------------------------- /GCTB/02_gen_shrunk_corr_data/gctb_gen_shrunk_corr_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 5 | 6 | plink_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_INTERVAL_genetics/filtered_interval_chr${SLURM_ARRAY_TASK_ID} 7 | 8 | genetic_map_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_genetic_map/genetic_map_chr${SLURM_ARRAY_TASK_ID}.txt 9 | 10 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${SLURM_ARRAY_TASK_ID} 11 | 12 | $gctb --bfile $plink_file \ 13 | --make-shrunk-ldm \ 14 | --gen-map $genetic_map_file \ 15 | --out $output_file
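# Added note: each array task writes a per-chromosome shrunk LD matrix
# (interval_shrunk_chr${SLURM_ARRAY_TASK_ID}.ldm.shrunk plus its companion files);
# these are converted to sparse format in 03_convert_to_sparse and merged across
# chromosomes in 04_merge_shrunk_sparse_all_chrs before SBayesS is run in step 06.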
-------------------------------------------------------------------------------- /GCTB/06_run_SbayesS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/06_run_SbayesS/ 12 | mkdir -p $log_dir 13 | 14 | 15 | run_SBayesS=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-SL2-CPU \ 18 | --job-name "SbayesS" \ 19 | --time 8:0:0 \ 20 | --mem 60000 \ 21 | --output $log_dir/run_SBayesS_${1}_%A_%a.o \ 22 | --error $log_dir/run_SBayesS_${1}_%A_%a.e \ 23 | --partition skylake \ 24 | 06_run_SbayesS/run_SbayesS.sh ${1}) 25 | 26 | echo "Submitted jobs $run_SBayesS" 27 | 28 | -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu/gctb_gen_shrunk_corr_matrix_mcpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | k=5000 4 | i=${SLURM_ARRAY_TASK_ID} 5 | 6 | gctb=/home/yx322/GCTB/gctb_2.02_Linux/gctb 7 | 8 | plink_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_INTERVAL_genetics/filtered_interval_chr${1} 9 | 10 | genetic_map_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_genetic_map/genetic_map_chr${1}.txt 11 | 12 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix/interval_shrunk_chr${1} 13 | 14 | $gctb --bfile $plink_file \ 15 | --make-shrunk-ldm \ 16 | --gen-map $genetic_map_file \ 17 | --snp $((k*(i-1)+1))-$((k*i)) \ 18 | --out ${output_file} -------------------------------------------------------------------------------- /01_convert_bgen/03_filter_duplicates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | out_dir=geno_files/genotype_data/ 4 | chr=$SLURM_ARRAY_TASK_ID 5 | 6 | # Identify variants to remove 7 | grep 'remove' $out_dir/impute_${chr}_interval.pvar | cut -f 3 > $out_dir/chr${chr}_duplicates.txt 8 | 9 | # Exclude these and create new pgen files 10 | plink2 --pfile $out_dir/impute_${chr}_interval \ 11 | --exclude $out_dir/chr${chr}_duplicates.txt \ 12 | --threads $SLURM_CPUS_ON_NODE \ 13 | --memory $SLURM_MEM_PER_NODE \ 14 | --silent \ 15 | --make-pgen \ 16 | --out $out_dir/impute_${chr}_interval_dedup 17 | 18 | # Remove old pgen files. 
19 | rm $out_dir/impute_${chr}_interval.pgen 20 | rm $out_dir/impute_${chr}_interval.psam 21 | rm $out_dir/impute_${chr}_interval.pvar 22 | 23 | # Remove temporary exclusion file 24 | rm $out_dir/chr${chr}_duplicates.txt 25 | 26 | -------------------------------------------------------------------------------- /GCTB/06_run_SbayesS/run_SbayesS.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.03beta_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | 9 | $gctb --sbayes S \ 10 | --ldm ${PWD}/interval_shrunk_chr_all.ldm.sparse \ 11 | --gwas-summary /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_GWAS_summ_stats/Metabolon/gwas_filteredCleaned_${1}.txt \ 12 | --pi 0.01 \ 13 | --hsq 0.5 \ 14 | --num-chains 4 \ 15 | --chain-length 25000 \ 16 | --burn-in 100 \ 17 | --seed 12345 \ 18 | --thread 9 \ 19 | --no-mcmc-bin \ 20 | --out-freq 10 \ 21 | --out /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/results/GCTB/${1}_1 -------------------------------------------------------------------------------- /GCTB/04a_merge_shrunk_all_chrs_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # Create logging directory 10 | log_dir=logs/04_merge_all_chrs/ 11 | mkdir -p $log_dir 12 | 13 | 14 | merge_all_job=$(sbatch \ 15 | --parsable \ 16 | --account INOUYE-COVID19-SL2-CPU \ 17 | --job-name "all_merge" \ 18 | --time 2:0:0 \ 19 | --mem 240000 \ 20 | --output $log_dir/MS_all_chrs_%A_%a.o \ 21 | --error $log_dir/MS_all_chrs_%A_%a.e \ 22 | --partition skylake-himem \ 23 | 04a_merge_shrunk_all_chrs/gctb_merge_shrunk_bins_all_chrs.sh) 24 | 25 | echo "Submitted jobs $merge_all_job" 26 | 27 | -------------------------------------------------------------------------------- /GCTB/06a_run_SbayesS_with_shrunk_corr/run_SbayesS_with_shrunk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gctb=/home/yx322/GCTB/gctb_2.03beta_Linux/gctb 4 | 5 | PWD=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_corr_matrix 6 | 7 | 8 | $gctb --sbayes S \ 9 | --ldm ${PWD}/interval_shrunk_chr_all.ldm.shrunk \ 10 | --gwas-summary /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_GWAS_summ_stats/Metabolon/gwas_filteredCleaned_${1}.txt \ 11 | --pi 0.01 \ 12 | --hsq 0.5 \ 13 | --num-chains 4 \ 14 | --chain-length 25000 \ 15 | --burn-in 2000 \ 16 | --seed 12345 \ 17 | --thread 18 \ 18 | --no-mcmc-bin \ 19 | --out-freq 10 \ 20 | --out /rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/results/GCTB/${1} -------------------------------------------------------------------------------- /GCTB/04_merge_shrunk_sparse_all_chrs_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/04_merge_all_chrs/ 12 | mkdir -p $log_dir 13 | 14 | 15 | merge_all_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "all_merge" \ 19 | --time 2:0:0 \ 20 | --mem 60000 \ 21 | --output $log_dir/M_all_chrs_%A_%a.o \ 22 | --error $log_dir/M_all_chrs_%A_%a.e \ 23 | --partition skylake-himem \ 24 | 04_merge_shrunk_sparse_all_chrs/gctb_merge_shrunk_sparse_bins_all_chrs.sh) 25 | 26 | echo "Submitted jobs $merge_all_job" 27 | 28 | -------------------------------------------------------------------------------- /GCTB/06a_run_SbayesS_with_shrunk_corr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/06_run_SbayesS/ 12 | mkdir -p $log_dir 13 | 14 | 15 | run_SBayesS=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-SL2-CPU \ 18 | --job-name "SS${1}" \ 19 | --time 12:0:0 \ 20 | --mem 240000 \ 21 | --output $log_dir/run_SBayesS_shrunk_${1}_%A_%a.o \ 22 | --error $log_dir/run_SBayesS_shrunk_${1}_%A_%a.e \ 23 | --partition skylake-himem \ 24 | 06a_run_SbayesS_with_shrunk_corr/run_SbayesS_with_shrunk.sh ${1}) 25 | 26 | echo "Submitted jobs $run_SBayesS" 27 | 28 | -------------------------------------------------------------------------------- /GCTB/02a_merge_shrunk_bins_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/02_gen_shrunk_corr/ 12 | mkdir -p $log_dir 13 | 14 | 15 | merge_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "merge_bin" \ 19 | --array 1-6 \ 20 | --time 2:0:0 \ 21 | --mem 60000 \ 22 | --output $log_dir/merge_shrunk_%A_%a.o \ 23 | --error $log_dir/merge_shrunk_%A_%a.e \ 24 | --partition skylake,skylake-himem \ 25 | 02a_gen_shrunk_corr_data_mcpu/gctb_merge_shrunk_bins_by_chr.sh) 26 | 27 | echo "Submitted jobs $merge_job" 28 | 29 | -------------------------------------------------------------------------------- /GCTB/03_to_shrunk_sparse_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/03_shrunk_sparse_corr/ 12 | mkdir -p $log_dir 13 | 14 | 15 | sparse_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "sparse" \ 19 | --array 1-22 \ 20 | --time 2:0:0 \ 21 | --mem 60000 \ 22 | --output $log_dir/shrunk_sparse_%A_%a.o \ 23 | --error $log_dir/shrunk_sparse_%A_%a.e \ 24 | --partition skylake,skylake-himem \ 25 | 03_convert_to_sparse/gctb_convert_to_sparse_shrunk_corr_matrix.sh) 26 | 27 | echo "Submitted jobs $sparse_job" 28 | 29 | -------------------------------------------------------------------------------- /GCTB/02a_gen_shrunk_corr_data_mcpu_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/02_gen_shrunk_corr/ 12 | mkdir -p $log_dir 13 | 14 | 15 | gen_scorr_job=$(sbatch \ 16 | --parsable \ 17 | --account INOUYE-COVID19-SL2-CPU \ 18 | --job-name "gen_scorr" \ 19 | --array 1-${2} \ 20 | --time 2:0:0 \ 21 | --mem 11000 \ 22 | --output $log_dir/gen_scorr_chr${1}_%A_%a.o \ 23 | --error $log_dir/gen_scorr_chr${1}_%A_%a.e \ 24 | --partition skylake,skylake-himem \ 25 | 02a_gen_shrunk_corr_data_mcpu/gctb_gen_shrunk_corr_matrix_mcpu.sh ${1}) 26 | 27 | echo "Submitted jobs $gen_scorr_job" 28 | 29 | -------------------------------------------------------------------------------- /03_collate_QTLs.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! 
-z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/collate_QTLs 18 | mkdir -p $log_dir 19 | 20 | # Step 1: curate list of pQTLs to extract from the genotype data 21 | mkdir -p geno_files/ 22 | sbatch --dependency afterany:$previous_job \ 23 | --account INOUYE-COVID19-SL2-CPU \ 24 | --job-name "Collate QTLs" \ 25 | --time 36:0:0 \ 26 | --output $log_dir/collate_QTLs_%j.o \ 27 | --error $log_dir/collate_QTLs_%j.e \ 28 | --partition skylake,skylake-himem \ 29 | --wrap "Rscript scripts/03_collate_QTLs/01_collate_QTLs.R" 30 | -------------------------------------------------------------------------------- /02_ldthin/02_ldthin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ref_dir=geno_files/genotype_data 4 | out_dir=$ref_dir/ldthinned 5 | chr=$SLURM_ARRAY_TASK_ID 6 | 7 | # Get LD thinned set of variants with MAF > 0.5% 8 | plink2 --pfile $ref_dir/impute_${chr}_interval_dedup \ 9 | --extract $out_dir/chr${chr}_keep.txt \ 10 | --maf 0.005 \ 11 | --indep-pairwise 1000kb 0.8 \ 12 | --threads $SLURM_CPUS_ON_NODE \ 13 | --memory $SLURM_MEM_PER_NODE \ 14 | --silent \ 15 | --out $out_dir/chr${chr}_ldthinned 16 | 17 | # Extract those variants 18 | plink2 --pfile $ref_dir/impute_${chr}_interval_dedup \ 19 | --extract $out_dir/chr${chr}_ldthinned.prune.in \ 20 | --threads $SLURM_CPUS_ON_NODE \ 21 | --memory $SLURM_MEM_PER_NODE \ 22 | --silent \ 23 | --make-pgen \ 24 | --out $out_dir/impute_${chr}_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8 25 | 26 | # Remove temporary file 27 | rm $out_dir/chr${chr}_keep.txt 28 | rm $out_dir/chr${chr}_ldthinned.prune.in 29 | rm $out_dir/chr${chr}_ldthinned.prune.out 30 | 31 | -------------------------------------------------------------------------------- /GCTB/01_gen_genetic_data_hapmap3_variants/gen_INTERVAL_hapmap3_vars_genetic_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | plink=/home/yx322/plink_2.0/plink2 4 | 5 | bed_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/yx322/interval_genetics/interval_impute_chr${SLURM_ARRAY_TASK_ID} 6 | 7 | output_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/methods_benchmark/data/GCTB_INTERVAL_genetics/filtered_interval_chr${SLURM_ARRAY_TASK_ID} 8 | 9 | variant_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/OmicsPred_LDpred_benchmarks/data/HapMap3_transformed/HapMap1kg_variants_matched2INTERVAL_rsid.txt 10 | 11 | sample_id_file=/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/OmicsPred_LDpred_benchmarks/data/INTERVAL_genotypes/all_externalIDs.txt 12 | 13 | 14 | $plink --bfile $bed_file \ 15 | --extract $variant_file \ 16 | --geno 0.05 \ 17 | --hwe 1e-6 \ 18 | --mach-r2-filter 0.3 \ 19 | --maf 0.01 \ 20 | --remove $sample_id_file \ 21 | --make-bed \ 22 | --out $output_file 23 | 24 | -------------------------------------------------------------------------------- /06_all_omics_ukb_phecode_disease_assoc_test.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | 10 | # Create logging directory 11 | log_dir=logs/06_omics_ukb_phecode_disease_assoc_test/ 12 | mkdir -p $log_dir 13 | 14 | 15 | #submit an array job for association scan of a given phecode in UKB across all omics traits of the 5 platforms 16 | #the first argument is the phecode file 17 | asso_test_job=$(sbatch --parsable \ 18 | --account INOUYE-SL3-CPU \ 19 | --job-name "ass_test" \ 20 | --array 1-5 \ 21 | --time 12:0:0 \ 22 | --mem 36000 \ 23 | --output $log_dir/omics_${1}_assoc_tests_%A_%a.o \ 24 | --error $log_dir/omics_${1}_assoc_tests_%A_%a.e \ 25 | --partition cclake-himem,skylake-himem,skylake,cclake \ 26 | scripts/06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.sh ${1}) 27 | 28 | echo "Submitted jobs ${asso_test_job}" 29 | 30 | -------------------------------------------------------------------------------- /GCTB/02_gen_shrunk_corr_data_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/02_gen_shrunk_corr/ 18 | mkdir -p $log_dir 19 | 20 | 21 | gen_scorr_job=$(sbatch --dependency afterany:$previous_job \ 22 | --parsable \ 23 | --account INOUYE-COVID19-SL2-CPU \ 24 | --job-name "gen_scorr" \ 25 | --array 1-22 \ 26 | --time 12:0:0 \ 27 | --mem 60000 \ 28 | --output $log_dir/gen_scorr_%A_%a.o \ 29 | --error $log_dir/gen_scorr_%A_%a.e \ 30 | --partition skylake-himem \ 31 | 02_gen_shrunk_corr_data/gctb_gen_shrunk_corr_matrix.sh) 32 | 33 | echo "Submitted jobs $gen_scorr_job" 34 | 35 | -------------------------------------------------------------------------------- /GCTB/01_gen_genetic_data_hapmap3_variants_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/01_gen_genetic_data/ 18 | mkdir -p $log_dir 19 | 20 | 21 | gen_genetics_job=$(sbatch --dependency afterany:$previous_job \ 22 | --parsable \ 23 | --account INOUYE-COVID19-SL2-CPU \ 24 | --job-name "gen_genetics" \ 25 | --array 1-22 \ 26 | --time 12:0:0 \ 27 | --mem 60000 \ 28 | --output $log_dir/gen_genetics_%A_%a.o \ 29 | --error $log_dir/gen_genetics_%A_%a.e \ 30 | --partition skylake \ 31 | 01_gen_genetic_data_hapmap3_variants/gen_INTERVAL_hapmap3_vars_genetic_data.sh) 32 | 33 | echo "Submitted jobs $gen_genetics_job" 34 | 35 | -------------------------------------------------------------------------------- /05_genetic_score_training.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 
6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/genetic_score_training/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: train the genetic score model for one omic trait using Bayesian ridge 21 | # First argument in python script is the omics platform name (i.e. SomaScan, Olink, Metabolon, Nightingale and RNAseq) 22 | # and the second argument is the index of a trait for training in the platform 23 | # the 3rd to 6th arguments are the priors for the BR method 24 | 25 | mkdir -p geno_files/ml_inputs 26 | 27 | score_training=$(sbatch --dependency afterany:$previous_job \ 28 | --parsable \ 29 | --account INOUYE-COVID19-SL2-CPU \ 30 | --job-name "score train" \ 31 | --time 36:0:0 \ 32 | -c 32 -N 1 \ 33 | --output $log_dir/score_training_%j.o \ 34 | --error $log_dir/score_training_%j.e \ 35 | --partition skylake-himem \ 36 | --wrap "python 'scripts/05_genetic_score_training/01_run_omics_pgs_training with_br.py' SomaScan 1 0.000001 0.000001 0.000001 0.000001") 37 | 38 | 39 | 40 | echo "Submitted jobs $score_training" 41 | 42 | -------------------------------------------------------------------------------- /02_ldthin/01_identify_snps.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | chr_id = as.numeric(Sys.getenv("SLURM_ARRAY_TASK_ID")) 4 | 5 | ref_dir = "geno_files/genotype_data" 6 | out_dir = sprintf("%s/ldthinned", ref_dir) 7 | 8 | pvar = fread(sprintf("%s/impute_%s_interval_dedup.pvar", ref_dir, chr_id)) 9 | 10 | # Remove multi-allelic sites: 11 | multi = pvar[grepl("^rs", ID), .N, by=ID][N > 1] 12 | pvar = pvar[!multi, on = .(ID)] 13 | multi = pvar[!grepl("^rs", ID), .N, by=.(`#CHROM`, POS)][N > 1] 14 | pvar = pvar[!multi, on = .(`#CHROM`, POS)] 15 | 16 | # Function for flipping the strand of an allele. 17 | # Uses a series of gsub calls to replace A's with T's, 18 | # G's with C's, and vice-versa. Also works for alleles 19 | # with more than one nucleotide (e.g. indels). 20 | flip_strand <- function(x) { 21 | # Swap each letter for a dummy, we need this intermediate 22 | # step so we can distinguish between alleles when swapping. 23 | # E.g. if we did A -> T then T -> A we'd end up with all A's 24 | # and no T's. Instead we do A -> V -> T and T -> X -> A. 25 | x <- gsub("A", "V", x) 26 | x <- gsub("T", "X", x) 27 | x <- gsub("C", "Y", x) 28 | x <- gsub("G", "Z", x) 29 | x <- gsub("V", "T", x) 30 | x <- gsub("X", "A", x) 31 | x <- gsub("Y", "G", x) 32 | x <- gsub("Z", "C", x) 33 | return(x) 34 | } 35 | 36 | # Remove strand ambiguous alleles: 37 | pvar = pvar[REF != flip_strand(ALT)] 38 | 39 | # Filter to SNPs 40 | pvar = pvar[nchar(REF) == 1 & nchar(ALT) == 1] 41 | 42 | # Write out list of variants to extract prior to LD-thinning 43 | fwrite(pvar[,.(ID)], col.names=FALSE, quote=FALSE, file=sprintf("%s/chr%s_keep.txt", out_dir, chr_id)) 44 | 45 | -------------------------------------------------------------------------------- /02_ldthin.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ !
-z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/02_ldthin/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: identify SNPs to keep 21 | mkdir -p geno_files/genotype_data/ldthinned 22 | keep_job=$(sbatch --dependency afterany:$previous_job \ 23 | --parsable \ 24 | --account INOUYE-COVID19-SL2-CPU \ 25 | --job-name "Identify SNPs" \ 26 | --time 1:0:0 \ 27 | --array 1-22 \ 28 | --output $log_dir/identify_snps_%A_%a.o \ 29 | --error $log_dir/identify_snps_%A_%a.e \ 30 | --partition skylake \ 31 | --wrap "Rscript scripts/02_ldthin/01_identify_snps.R") 32 | 33 | # Step 2: LD thin remaining SNPs at R2=0.8 34 | thin_job=$(sbatch --dependency afterok:$keep_job \ 35 | --parsable \ 36 | --account INOUYE-COVID19-SL2-CPU \ 37 | --job-name "LDthin" \ 38 | --array 1-22 \ 39 | --time 3:0:0 \ 40 | --mem 10000 \ 41 | --output $log_dir/ldthin_%A_%a.o \ 42 | --error $log_dir/ldthin_%A_%a.e \ 43 | --partition skylake \ 44 | scripts/02_ldthin/02_ldthin.sh) 45 | 46 | echo "Submitted jobs $keep_job, $thin_job" 47 | 48 | -------------------------------------------------------------------------------- /04_extract_QTLs.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/extract_QTLs/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: curate list of QTLs to extract from the genotype data 21 | # First argument in R script is trans/genome-wide P-value threshold, (must be < 0.001) 22 | # and the second argument is the cis P-value threshold (use 1 if you want to include all cis SNPs) 23 | mkdir -p geno_files/ml_inputs 24 | qc_job=$(sbatch --dependency afterany:$previous_job \ 25 | --parsable \ 26 | --account INOUYE-COVID19-SL2-CPU \ 27 | --job-name "Extract QTLs" \ 28 | --time 36:0:0 \ 29 | -c 32 -N 1 \ 30 | --output $log_dir/curate_QTLs_%j.o \ 31 | --error $log_dir/curate_QTLs_%j.e \ 32 | --partition skylake-himem \ 33 | --wrap "Rscript scripts/04_extract_QTLs/01_extract_QTLs.R '5e-8' '5e-8'") 34 | 35 | # Step 2: extract the dosages of the effect alleles 36 | ex_job=$(sbatch --dependency afterok:$qc_job \ 37 | --parsable \ 38 | --account INOUYE-COVID19-SL2-CPU \ 39 | --job-name "Extract dosages" \ 40 | --partition skylake,skylake-himem \ 41 | --time 2:0:0 \ 42 | --array 1-200 \ 43 | --mem 10000 \ 44 | --output $log_dir/extract_dosages_%A_%a.o \ 45 | --error $log_dir/extract_dosages_%A_%a.e \ 46 | scripts/04_extract_QTLs/02_extract_QTL_dosages.sh) 47 | 48 | echo "Submitted jobs $qc_job, $ex_job" 49 | 50 | -------------------------------------------------------------------------------- /05_genetic_score_training/BayesianRidge.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | import random 3 | from sklearn.linear_model import SGDRegressor 4 | import numpy as np 5 | import pandas as pd 6 | from sklearn.metrics import r2_score,explained_variance_score 7 | from scipy.stats import pearsonr 8 | from sklearn.linear_model import BayesianRidge 9 | from 
sklearn.model_selection import KFold 10 | from scipy.stats import spearmanr 11 | 12 | 13 | 14 | 15 | def get_BayesianRidge_prediction(x_train, y_train, x_val, y_val, alpha_1, alpha_2, lambda_1, lambda_2): 16 | model = BayesianRidge(alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2) 17 | model.fit(x_train, y_train) 18 | y_pred = model.predict(x_val) 19 | return model,pearsonr(y_val, y_pred)[0] 20 | 21 | 22 | 23 | def full_fit_BayesianRidge(x_train, x_test, y_train, y_test, alpha_1, alpha_2, lambda_1, lambda_2): 24 | # Bayesian Ridge Regression Method 25 | # 26 | # Note the prior Gamma distributions are set as (alpha_1, alpha_2) and (lambda_1, lambda_2), which were selected via a cross-validation step using training data (see code below: full_fit_BayesianRidge_para_tuning; 27 | # all traits shared the same priors) 28 | # x_train: training genotype data (Numpy matrix - samples X variants) 29 | # x_test: testing genotype data 30 | # y_train: training trait value data (Numpy vector - 1 X N) 31 | # y_test: testing trait value data 32 | # return the learned model, r, r2, explained variance score and Spearman r performance 33 | model, r = get_BayesianRidge_prediction(x_train, y_train, x_test, y_test, alpha_1, alpha_2, lambda_1, lambda_2) 34 | y_pred = model.predict(x_test) 35 | return model,pearsonr(y_test, y_pred)[0],r2_score(y_test, y_pred),explained_variance_score(y_test,y_pred),spearmanr(y_test,y_pred)[0] 36 | 37 | 38 | 39 | #hyper-parameter tuning - finding the best prior Gamma distributions on the training set of a trait 40 | #Grid search over (1e10, 1e5, 1e3, 1e1, 0, 1e-1, 1e-3, 1e-5, 1e-10) 41 | #return the model fitted with the best 'alpha_1', 'alpha_2', 'lambda_1', 'lambda_2' 42 | def full_fit_BayesianRidge_para_tuning(x_train, x_val, y_train, y_val,para_file): 43 | nums = (1e10, 1e5, 1e3, 1e1, 0, 1e-1, 1e-3,1e-5,1e-10) 44 | f=open(para_file,'w') 45 | alpha_1 = nums 46 | alpha_2 = nums 47 | lambda_1 = nums 48 | lambda_2 = nums 49 | best_model = None 50 | best_r = 0 51 | for a1 in alpha_1: 52 | for a2 in alpha_2: 53 | for l1 in lambda_1: 54 | for l2 in lambda_2: 55 | model,r = get_BayesianRidge_prediction(x_train,y_train,x_val,y_val,a1,a2,l1,l2) 56 | text = "Training BayesianRidge with alpha_1: {}, alpha_2: {}, lambda_1: {}, lambda_2:{} - r score {}\n".format(a1,a2,l1,l2,r) 57 | print(text) 58 | f.write(text) 59 | if best_model is None or r > best_r: 60 | best_model = model 61 | best_r = r 62 | best_params = {'alpha_1':a1, 'alpha_2': a2, 'lambda_1': l1, 'lambda_2': l2} 63 | f.close() 64 | print("Best Para: {}".format(best_params)) 65 | return best_model 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /01_convert_bgen.job: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Make sure job is submitted directly 4 | if [ ! -z ${SLURM_JOB_ID+x} ]; then 5 | echo "This script should be executed directly, not with sbatch." 6 | exit 1 7 | fi 8 | 9 | # allows user to specify a job to wait for completion before running any of these scripts 10 | if [ ! -z "$1" ]; then 11 | previous_job=$1 12 | else 13 | previous_job=1 # run first job immediately 14 | fi 15 | 16 | # Create logging directory 17 | log_dir=logs/01_convert_bgen/ 18 | mkdir -p $log_dir 19 | 20 | # Step 1: convert the BGEN files to plink pgen/pvar/psam files.
21 | # These maintain the probabilistic dosage information, while also 22 | # separating the variant information into a separate file, allowing 23 | # us to fix the non-unique variant identifiers (all variants without 24 | # an rsID are given identifier ".") 25 | mkdir -p geno_files/genotype_data/ 26 | conv_job=$(sbatch --dependency afterany:$previous_job \ 27 | --parsable \ 28 | --account INOUYE-COVID19-SL2-CPU \ 29 | --job-name "Convert bgen" \ 30 | --time 1:0:0 \ 31 | --array 1-22 \ 32 | --mem 36000 \ 33 | --output $log_dir/convert_bgen_%A_%a.o \ 34 | --error $log_dir/convert_bgen_%A_%a.e \ 35 | --partition skylake \ 36 | scripts/01_convert_bgen/01_convert_bgen.sh) 37 | 38 | # Step 2: Give variants unique identifiers and flag duplicates 39 | # for removal 40 | flag_job=$(sbatch --dependency afterok:$conv_job \ 41 | --parsable \ 42 | --account INOUYE-COVID19-SL2-CPU \ 43 | --job-name "Flag duplicates" \ 44 | --time 3:0:0 \ 45 | --mem 12000 \ 46 | --output $log_dir/flag_duplicates_%j.o \ 47 | --error $log_dir/flag_duplicates_%j.e \ 48 | --partition skylake \ 49 | --wrap "Rscript scripts/01_convert_bgen/02_flag_duplicates.R") 50 | 51 | # Step 3: Remove the variants flagged for removal 52 | rmdp_job=$(sbatch --dependency afterany:$flag_job \ 53 | --parsable \ 54 | --account INOUYE-COVID19-SL2-CPU \ 55 | --job-name "Remove duplicates" \ 56 | --time 1:0:0 \ 57 | --array 1-22 \ 58 | --mem 6000 \ 59 | --output $log_dir/remove_duplicates_%A_%a.o \ 60 | --error $log_dir/remove_duplicates_%A_%a.e \ 61 | --partition skylake \ 62 | scripts/01_convert_bgen/03_filter_duplicates.sh) 63 | 64 | # Step 4: Remove the extra crud in the variant identifiers now that 65 | # the deduplication process has happened. 66 | fvid_job=$(sbatch --dependency afterok:$rmdp_job \ 67 | --parsable \ 68 | --account INOUYE-COVID19-SL2-CPU \ 69 | --job-name "Fix variant IDs" \ 70 | --time 1:0:0 \ 71 | --mem 8192 \ 72 | --output $log_dir/fix_var_ids_%j.o \ 73 | --error $log_dir/fix_var_ids_%j.e \ 74 | --partition skylake \ 75 | --wrap "Rscript scripts/01_convert_bgen/04_fix_var_ids.R") 76 | 77 | echo "Submitted jobs $conv_job, $flag_job, $rmdp_job, $fvid_job" 78 | 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI:10.1038/s41586-023-05844-9](https://img.shields.io/badge/DOI%3A-10.1038%2Fs41586--023--05844--9-orange)](https://www.nature.com/articles/s41586-023-05844-9) 2 | 3 | [![Website shields.io](https://img.shields.io/website-up-down-green-red/http/shields.io.svg)](http://www.omicspred.org/) 4 | 5 | # An atlas of genetic scores to predict multi-omic traits 6 | This repository houses and documents the code used to train genetic scores of omic traits using INTERVAL data and to internally validate these scores in the study: Xu Y et al. An atlas of genetic scores to predict multi-omic traits. Nature (2023) https://doi.org/10.1038/s41586-023-05844-9. 7 | 8 | All genetic score models trained in the study, together with their internal and external validation results, were deposited in a cloud service (boxing.com) and are publicly accessible through our online portal (www.omicspred.org).
9 | 10 | 11 | ## The following software and versions were used to perform the analyses: 12 | 13 | - Scientific Linux release 7.7 (Nitrogen) (HPC operating system) 14 | - slurm version 19.05.5 (HPC queue manager and job submission system) 15 | - GNU bash version 4.2.46(2) (shell environment used to run bash scripts) 16 | - PLINK v1.90b6.11 64-bit (24 Oct 2019) (www.cog-genomics.org/plink/1.9/) 17 | - PLINK v2.00a2.3LM 64-bit Intel (24 Jan 2020) (www.cog-genomics.org/plink/2.0/) 18 | - STAR v2.7.3.a (https://github.com/alexdobin/STAR) 19 | - featureCounts v2.0.0 (http://subread.sourceforge.net/) 20 | - QTLtools v1.3.1 (https://qtltools.github.io/qtltools/) 21 | 22 | - Python version 3.6.8 with the following Python packages: 23 | - numpy version 1.19.5 24 | - pandas version 1.1.5 25 | - scikit-learn version 0.21.2 26 | - scipy version 1.5.4 27 | - statsmodels version 0.12.2 28 | - lifelines version 0.26.0 29 | 30 | - R version 3.6.1 with the following R packages: 31 | - cowplot version 1.0.0 32 | - data.table version 1.13.6 33 | - dplyr version 1.0.8 34 | - foreach version 1.5.1 35 | - ggplot2 version 3.3.5 36 | - ggpubr version 0.2.5 37 | - grid version 3.6.1 38 | - plyr version 1.8.6 39 | - reshape2 version 1.4.4 40 | - RColorBrewer version 1.1-2 41 | - stringr version 1.4.0 42 | - tibble version 3.1.0 43 | - bigsnpr version 1.10.8 44 | 45 | ## Description of scripts in each sub-folder: 46 | 47 | - Genetic score development for multi-omic traits: 48 | - **01_convert_bgen**: convert genotype data from bgen to Plink pgen format and remove duplicate variants; 49 | - **02_ldthin**: remove multi-allelic variants, strand-ambiguous (A/T, C/G) variants and variants with a MAF < 0.5%, and LD-thin the remaining variants at r2=0.8 (i.e. indep-pairwise 1000kb 0.8 in plink2); 50 | - **03_collate_QTLs**: curate the QTL information needed for variant selection from GWAS summary statistics; 51 | - **04_extract_QTLs**: select the list of QTLs at given p-value thresholds, and extract the dosages of their effect alleles as input data for Bayesian ridge; 52 | - **05_genetic_score_training**: train genetic score models using Bayesian ridge; 53 | 54 | 55 | - Others: 56 | - **06_all_omics_UKB_phecode_assoc_test**: perform PheWAS with the genetic scores of omics traits in UK Biobank 57 | - **GCTB**: scripts from our attempt to use SBayesS to estimate heritability of omics traits 58 | - **LDpred2**: scripts used to develop genetic scores of omic traits using LDpred2-auto. 59 | -------------------------------------------------------------------------------- /01_convert_bgen/02_flag_duplicates.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | 3 | # Fix missing rsIDs and flag duplicate variants for removal. 4 | for (chr_id in 1:22) { 5 | # Load variant information and add common identifier to help identify duplicates 6 | # and fix missing rsIDs.
7 | pvar = fread(sprintf("geno_files/genotype_data/impute_%s_interval.pvar", chr_id)) 8 | pvar[, row := .I] 9 | pvar[, sorted_alleles := paste(sort(c(REF, ALT)), collapse=":"), by=row] 10 | pvar[, var_id := paste(`#CHROM`, POS, sorted_alleles, sep=":")] 11 | pvar[ID == ".", ID := var_id] 12 | 13 | # Load in variant statistics so we can use INFO scores to flag 14 | # which of each pair of duplicates to remove 15 | snpstats = fread(sprintf("/rds/project/jmmh2/rds-jmmh2-projects/polygenic/internal/interval_grs_scan/data/INTERVAL/reference_files/imputed_genotypes/impute_%s_interval.snpstats", chr_id)) 16 | pvar[snpstats, on = .(`#CHROM`=chromosome, POS=position, REF=A_allele, ALT=B_allele), INFO := i.information] 17 | pvar[snpstats, on = .(`#CHROM`=chromosome, POS=position, REF=B_allele, ALT=A_allele), INFO := i.information] 18 | 19 | # Identify and flag duplicates for removal, keeping the entry with 20 | # the highest INFO score in each case. There are two cases to deal with: 21 | # 22 | # (1) variants that are duplicates by position and alleles 23 | # (2) variants that are duplicates by rsid (and alleles), but which may 24 | # have different positions. 25 | # 26 | # The reason we match by alleles as well is that it appears that multi-allelic 27 | # variants are split into multiple entries even in the BGEN files, so we don't 28 | # want to incorrectly remove these. 29 | 30 | # First, identify any variant that is duplicate by position, or duplicate by 31 | # id, then filter the pvar table to all remaining variants ('ok'). 32 | dups_by_pos = pvar[,.N, by=.(var_id)][N > 1] 33 | dups_by_id = pvar[,.N,by=.(ID, sorted_alleles)][N > 1] 34 | ok = pvar[!dups_by_pos, on = .(var_id)][!dups_by_id, on=.(ID, sorted_alleles)] 35 | 36 | # Extract the remaining variants, which are all duplicates in either sense 37 | dups = pvar[!ok, on = .(row)] 38 | 39 | # First, considering all duplicates by position, take the variant with the max 40 | # INFO score (first one if there are multiple with the same INFO). 41 | max_info_by_pos = dups[,.SD[which.max(INFO)], by=.(var_id)] 42 | 43 | # Then, from these remaining variants, take the max INFO score by rsID to handle 44 | # cases where > 1 variant may have the same rsID, but differing positions. 45 | max_info_by_id = max_info_by_pos[,.SD[which.max(INFO)], by=.(ID, sorted_alleles)] 46 | 47 | # Add these back to the "ok" table 48 | ok = rbind(ok, max_info_by_id) 49 | 50 | # Flag in the pvar table the variants to remove - important to preserve order 51 | # of variants in the output table as the row number corresponds to row in the 52 | # genotype data. 53 | pvar[, remove := FALSE] 54 | pvar[!ok, on = .(row), remove := TRUE] 55 | 56 | # Make sure every variant has a unique identifier so we can accurately flag 57 | # variants for removal with plink.
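# Illustration (added for clarity): an ID "rs123" on row 57 becomes "rs123:57" below,
# and a flagged duplicate becomes "rs123:57:remove"; 03_filter_duplicates.sh greps for
# "remove" to exclude those variants, and 04_fix_var_ids.R strips the ":57" suffix
# from the retained variants afterwards.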
58 | pvar[, ID := paste(ID, row, sep=":")] 59 | 60 | # Flag variants for removal in the identifier 61 | pvar[(remove), ID := paste(ID, "remove", sep=":")] 62 | 63 | # Overwrite pvar file: 64 | fwrite(pvar[, .(`#CHROM`, POS, ID, REF, ALT)], sep="\t", quote=FALSE, 65 | file=sprintf("geno_files/genotype_data/impute_%s_interval.pvar", chr_id)) 66 | 67 | # Remove objects and garbage collect before going to next loop 68 | rm(list=ls()) 69 | gc() 70 | } 71 | 72 | -------------------------------------------------------------------------------- /04_extract_QTLs/02_extract_QTL_dosages.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $1 = "" ]]; then 4 | out_dir=geno_files/ml_inputs 5 | else 6 | out_dir=$1 7 | fi 8 | 9 | # Get array of phenotypes 10 | phenotypes=($(ls $out_dir/*_variant_effects.txt | sed "s#$out_dir/##" | sed 's/_variant_effects.txt//')) 11 | nphen=${#phenotypes[@]} 12 | 13 | # Determine which phenotypes we're working with for this task: 14 | task=$SLURM_ARRAY_TASK_ID 15 | ntasks=$SLURM_ARRAY_TASK_MAX 16 | 17 | nphen_per_task=$(echo "a=$nphen; b=$ntasks; if ( a%b ) a/b+1 else a/b" | bc) 18 | task_start=$(echo "a=$task; b=$nphen_per_task; (a-1)*b" | bc) 19 | task_end=$(echo "a=$task_start; b=$nphen_per_task; c=$nphen; if ( (a+b)>c ) c - 1 else a+b-1" | bc) 20 | 21 | if [ $task_start -gt $task_end ]; then 22 | echo "No remaining phenotypes for task $task of $ntasks." 23 | exit 0 24 | else 25 | echo "Task $task of $ntasks extracting dosages of phenotypes $task_start - $task_end of $nphen." 26 | fi 27 | 28 | # Iterate through each protein for this task to extract the genotype dosages 29 | for phenIdx in $(seq $task_start $task_end); do 30 | # What phenotype are we working with? 31 | phen=${phenotypes[$phenIdx]} 32 | 33 | # Determine the chromosomes we need to extract for this phenotype 34 | chrs=( $(tail -n +2 $out_dir/${phen}_variant_effects.txt | cut -f 2 | sort | uniq) ) 35 | 36 | # Determine the samples we need to keep for this phenotype 37 | grep -P "^${phen}\t" $out_dir/phenotypes.txt | cut -f 3 > $out_dir/${phen}_IID.txt 38 | echo "#FID"$'\t'"IID" > $out_dir/${phen}.samples 39 | paste $out_dir/${phen}_IID.txt $out_dir/${phen}_IID.txt >> $out_dir/${phen}.samples 40 | rm $out_dir/${phen}_IID.txt 41 | 42 | # For each chromosome, extract the dosages of the pQTLs 43 | for chr in ${chrs[@]}; do 44 | # Get the list of pQTLs on this chromosome to extract 45 | cut -f 1,2 $out_dir/${phen}_variant_effects.txt | grep -w "${chr}"'$' | cut -f 1 > $out_dir/${phen}_chr${chr}_variant_ids.txt 46 | 47 | # Get their effect alleles 48 | grep -f $out_dir/${phen}_chr${chr}_variant_ids.txt -w $out_dir/${phen}_variant_effects.txt | cut -f 1,4 > $out_dir/${phen}_chr${chr}_variant_effect_alleles.txt 49 | 50 | # Extract the dosages of the effect alleles 51 | plink2 --pfile geno_files/genotype_data/ldthinned/impute_${chr}_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8 \ 52 | --out $out_dir/${phen}_chr${chr}_dosages \ 53 | --keep $out_dir/${phen}.samples \ 54 | --extract $out_dir/${phen}_chr${chr}_variant_ids.txt \ 55 | --export A --export-allele $out_dir/${phen}_chr${chr}_variant_effect_alleles.txt \ 56 | --memory $SLURM_MEM_PER_NODE \ 57 | --threads $SLURM_CPUS_ON_NODE \ 58 | --silent 59 | 60 | # remove temporary files 61 | rm $out_dir/${phen}_chr${chr}_variant_ids.txt 62 | rm $out_dir/${phen}_chr${chr}_variant_effect_alleles.txt 63 | rm $out_dir/${phen}_chr${chr}_dosages.log 64 | done 65 | 66 | # Combine chromosome data 67 | for chr in 
${chrs[@]}; do 68 | paste $out_dir/${phen}_dosages.txt $out_dir/${phen}_chr${chr}_dosages.raw > $out_dir/${phen}_dosage_tmpfile 69 | mv $out_dir/${phen}_dosage_tmpfile $out_dir/${phen}_dosages.txt 70 | rm $out_dir/${phen}_chr${chr}_dosages.raw 71 | done 72 | 73 | # get the varID column and remove extra sample identifier information from each chromosome 74 | Rscript scripts/04_extract_QTLs/02_helpers/reformat_dosages.R $out_dir $phen 75 | rm $out_dir/${phen}_dosages.txt 76 | 77 | # sample file no longer needed 78 | rm $out_dir/${phen}.samples 79 | done 80 | 81 | -------------------------------------------------------------------------------- /05_genetic_score_training/01_run_omics_pgs_training with_br.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | import datetime 5 | from sklearn.model_selection import KFold 6 | from Methods.BayesianRidge import full_fit_BayesianRidge 7 | from Methods.Traditional_GRS import traditional_GRS_selected_vars 8 | from sklearn.externals import joblib 9 | import sys 10 | import os 11 | import time 12 | 13 | 14 | def read_proteins_list(proteinomics_list_file): 15 | df = pd.read_csv(proteinomics_list_file,sep='\t') 16 | return list(df['PhenotypeCompName']) 17 | 18 | 19 | def read_protein_phenos(proteinomics_phenos_file, protein_name, sample_ids): 20 | df_pheno = pd.read_csv(proteinomics_phenos_file, delimiter='\t') 21 | df_pheno = df_pheno.loc[df_pheno['PhenotypeCompName'] == protein_name] 22 | df_pheno = df_pheno.set_index('IID') 23 | return np.array(df_pheno.loc[sample_ids, 'value']) 24 | 25 | 26 | def read_protein_genotypes(geno_file): 27 | df = pd.read_csv(geno_file,compression='gzip', sep='\t') 28 | sample_ids = list(df['varID']) 29 | df = df.set_index('varID') 30 | var_ids = list(df.columns) 31 | X = np.array(df.loc[:,:]) 32 | return sample_ids,var_ids,X 33 | 34 | 35 | def run_experiments_5_folders_one_protein(platform,proteinomics_list_file,proteinomics_genotype_path, proteinomics_phenos_file,results_path,models_path,beta_path,protein_index, alpha_1, alpha_2, lambda_1, lambda_2): 36 | 37 | #read the full list of protein (or other type of omic trait) unique ids & read the current protein (or other type of trait) id 38 | proteins_list = read_proteins_list(proteinomics_list_file) 39 | protein_name = proteins_list[protein_index] 40 | 41 | print("Start processing {}-{}-{}".format(platform,protein_index,protein_name)) 42 | 43 | #read genotype matrix X and all sample ids and variants ids 44 | geno_file = proteinomics_genotype_path + protein_name + "_dosages.txt.gz" 45 | sample_ids,var_ids,X = read_protein_genotypes(geno_file) 46 | 47 | print("Number of Variants {}".format(len(var_ids))) 48 | print("Number of Samples {}".format(len(sample_ids))) 49 | 50 | #read protein levels of the given protein for all samples 51 | y = read_protein_phenos(proteinomics_phenos_file, protein_name, sample_ids) 52 | 53 | results_file = results_path + platform + "_" +protein_name + "_BR_UNI_prs.txt" 54 | 55 | f = open(results_file,'w') 56 | f.write('Time\tProtein\tN_Vars\tFolder\tBR_r2\tBR_sr\tUNI_r2\tUNI_sr\n') 57 | 58 | folder_count = 0 59 | kf = KFold(n_splits=5, shuffle=True, random_state=21) 60 | for train_index, test_index in kf.split(y): 61 | folder_count += 1 62 | 63 | print("folder-{}".format(folder_count)) 64 | 65 | x_train, x_test = X[train_index], X[test_index] 66 | y_train, y_test = y[train_index], y[test_index] 67 | 68 | print("{}-folder-{} Running
BayesianRidge...".format(datetime.datetime.now(), folder_count)) 69 | BR_model,br_r,br_r2,br_envs,br_sr = full_fit_BayesianRidge(x_train, x_test, y_train, y_test, alpha_1, alpha_2, lambda_1, lambda_2) 70 | print("r: {}, r2: {}, env: {}, sr: {}".format(br_r, br_r2,br_envs,br_sr)) 71 | model_file = models_path + platform + "_" + protein_name + "_BR_model_" + str(folder_count) + ".pkl" 72 | joblib.dump(BR_model, model_file) 73 | 74 | print("{}-folder-{} Running Univariant method...".format(datetime.datetime.now(), folder_count)) 75 | beta_file = beta_path + protein_name + "_variant_effects.txt" 76 | grs_r, grs_r2,grs_env,grs_sr = traditional_GRS_selected_vars(beta_file, x_test, y_test, var_ids) 77 | print("r: {}, r2: {}, env: {}, sr: {}".format(grs_r, grs_r2,grs_env, grs_sr)) 78 | 79 | write_text = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(datetime.datetime.now(),protein_name,X.shape[1],folder_count,br_r**2,br_sr,grs_r**2, grs_sr) 80 | f.write(write_text) 81 | f.flush() 82 | f.close() 83 | 84 | 85 | if __name__ == "__main__": 86 | 87 | # platforms: "SomaScan", "Olink", "Metabolon", "Nightingale", "RNAseq" 88 | platform= str(sys.argv[1]) 89 | 90 | # the omic trait index in a platform 91 | protein_index = int(sys.argv[2]) 92 | 93 | # BR priors for traits in the platform 94 | alpha_1 = float(sys.argv[3]) 95 | alpha_2 = float(sys.argv[4]) 96 | lambda_1= float(sys.argv[5]) 97 | lambda_2 = float(sys.argv[6]) 98 | 99 | 100 | # trait level file 101 | proteinomics_phenos_file = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform + "_5e8/phenotypes.txt" 102 | 103 | # trait list on the platform 104 | proteinomics_list_file = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform + "_5e8/" + platform + "_phenotype_info.txt" 105 | 106 | #folder store all genotype files at each folder 107 | proteinomics_genotype_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform + "_5e8/" 108 | 109 | # results path 110 | results_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/results/" + platform + '_5e8/' 111 | 112 | if os.path.isdir(results_path) == False: 113 | os.mkdir(results_path) 114 | 115 | # model path 116 | models_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/ml_models/" + platform + '_5e8/' 117 | 118 | if os.path.isdir(models_path) == False: 119 | os.mkdir(models_path) 120 | 121 | # path stores the the betas of selected QLT variants from GWAS 122 | beta_path = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/geno_files/ml_inputs_" + platform+ '_5e8/' 123 | 124 | 125 | # model training for one omic trait 126 | run_experiments_5_folders_one_protein(platform,proteinomics_list_file,proteinomics_genotype_path, proteinomics_phenos_file,results_path,models_path,beta_path,protein_index, alpha_1, alpha_2, lambda_1, lambda_2) 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /06_all_omics_UKB_phecode_assoc_test/01_all_omics_PGS_UKB_disease_assoc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import StandardScaler 3 | from lifelines import CoxPHFitter 4 | import sys 5 | import statsmodels.formula.api as smf 6 | import os.path 7 | 8 | if __name__ == "__main__": 9 | 10 | phe_code_file = 
str(sys.argv[1])
11 |     phe_code = phe_code_file.replace('Phecode_', '').replace('.csv.gz', '')
12 | 
13 |     pgs_index = int(sys.argv[2]) - 1
14 | 
15 |     # QC file from UKB
16 |     ukb_qc_file = "/home/yx322/rds/rds-jmmh2-post_qc_data/uk_biobank/reference_files/genetic/reference_files/full_release/QC_documents/sampleQC_fromUKB_withHeaders.txt"
17 |     # read UKB sample QC data to get genotyping array and PC info
18 |     df_ukb_qc = pd.read_csv(ukb_qc_file, skiprows=[i for i in range(0, 31)], sep=' ')
19 | 
20 | 
21 |     #### select the white British ancestry subset only ####
22 |     df_ukb_qc = df_ukb_qc.loc[df_ukb_qc['in.white.British.ancestry.subset'] == 1]
23 |     df_ukb_qc = df_ukb_qc[['#UKB_ID1', 'genotyping.array', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']]
24 | 
25 | 
26 |     pheno_file = "/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/PGSCatalog/PheWAS/_phenotyped/" + phe_code_file
27 |     df_pheno = pd.read_csv(pheno_file, compression='gzip')
28 |     df_pheno = df_pheno[['eid', 'genid', 'sex', 'PHECODE_AgeAsTimescale', 'PHECODE_AgeAsTimescale_Years']]
29 |     df_pheno = df_pheno.rename(columns={'eid': 'idno', 'PHECODE_AgeAsTimescale': 'PHENOTYPE', 'PHECODE_AgeAsTimescale_Years': 'CENSOR_AGE'})
30 | 
31 |     # Incorporate PC and genotyping array info
32 |     df_pheno_all = pd.merge(df_pheno, df_ukb_qc, left_on='genid', right_on='#UKB_ID1')
33 | 
34 |     omics_pgs_files = ['/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/yx322/UKB_omics_PGS/Metabolon_full/UKB_Metabolon.sscore.gz',
35 |                        "/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/yx322/UKB_omics_PGS/Olink_full/UKB_Olink.sscore.gz",
36 |                        '/rds/project/asb38/rds-asb38-ceu-ukbiobank/projects/P7439/inouyelab/yx322/UKB_omics_PGS/Somalogic_full/UKB_Somalogic.sscore.gz',
37 |                        '/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/omics_PGS_scores/UKB_Nightingale_5e8/UKB_Nightingale.sscore.gz']
38 | 
39 |     platforms = ['Metabolon', 'Olink', 'Somalogic', 'Nightingale']
40 | 
41 |     # add in gene expression PGS by chromosome
42 |     for chr in range(1, 23):
43 |         GE_pgs_file = "/home/yx322/rds/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/omics_PGS_scores/UKB_GE_5e-8/chr" + str(chr) + "/UKB_GE.sscore.gz"
44 |         omics_pgs_files.append(GE_pgs_file)
45 |         platform_name = "GE_chr" + str(chr)
46 |         platforms.append(platform_name)
47 | 
48 |     i = pgs_index
49 |     platform = platforms[i]
50 |     SOMA_PGS_file = omics_pgs_files[i]
51 |     df_soma_pgs = pd.read_csv(SOMA_PGS_file, sep='\t', compression='gzip')
52 |     score_cols = list(df_soma_pgs.columns[1:])
53 | 
54 |     write_file = '/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/impute_genomics/omics_prs/UKB_phecode_association/raw_assocs/' + platform + '_PGS_UKB_' + phe_code + '_assoc_full_EU.txt'
55 | 
56 |     df_save = pd.DataFrame(columns=['Trait', 'HR', 'HR_low', 'HR_high', 'pvalue'])
57 | 
58 |     for col_name in score_cols:
59 | 
60 |         df_one_pgs = df_soma_pgs[['IID', col_name]]
61 | 
62 |         df_pheno_test = pd.merge(df_one_pgs, df_pheno_all, left_on='IID', right_on='idno')
63 |         df_pheno_test = df_pheno_test[[col_name, 'sex', 'PHENOTYPE', 'CENSOR_AGE', 'genotyping.array', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']]
64 | 
65 |         df_pheno_test = pd.get_dummies(df_pheno_test, drop_first=True)
66 |         df_pheno_test[[col_name, 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']] = StandardScaler().fit_transform(df_pheno_test[[col_name, 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']])
67 | 
68 |         # adjust PGS for PCs
69 |         #reg_trait = col_name
+ ' ~ PC1+ PC2+ PC3+ PC4+ PC5+ PC6+ PC7+ PC8+ PC9+ PC10'
70 |         #adj_result = smf.ols(reg_trait, data=df_pheno_test, missing='drop').fit()
71 |         #df_pheno_test[col_name] = adj_result.resid
72 |         #df_pheno_test[[col_name]] = StandardScaler().fit_transform(df_pheno_test[[col_name]])
73 | 
74 |         df_pheno_test['temp_col'] = df_pheno_test[col_name]  # regress via a temporary column, since score column names are not always valid in a formula
75 |         reg_trait = 'temp_col ~ PC1+ PC2+ PC3+ PC4+ PC5+ PC6+ PC7+ PC8+ PC9+ PC10'
76 |         adj_result = smf.ols(reg_trait, data=df_pheno_test, missing='drop').fit()
77 |         df_pheno_test[col_name] = adj_result.resid
78 |         del df_pheno_test['temp_col']
79 |         # remove NaN rows
80 |         index_nan = list(df_pheno_test[df_pheno_test.isnull().any(axis=1)].index)
81 |         df_pheno_test.drop(index_nan, inplace=True)
82 | 
83 |         df_pheno_test[[col_name]] = StandardScaler().fit_transform(df_pheno_test[[col_name]])
84 | 
85 |         if 'sex_Male' in df_pheno_test.columns:
86 |             try:
87 |                 cph1 = CoxPHFitter()
88 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', strata=['sex_Male'])
89 |             except Exception:  # retry with a smaller step size if the fit fails to converge
90 |                 cph1 = CoxPHFitter()
91 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', strata=['sex_Male'], step_size=0.1)
92 |         else:
93 |             try:
94 |                 cph1 = CoxPHFitter()
95 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE')
96 |             except Exception:  # retry with a smaller step size if the fit fails to converge
97 |                 cph1 = CoxPHFitter()
98 |                 cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', step_size=0.1)
99 | 
100 |         #cph1 = CoxPHFitter()
101 |         # cph1.fit(df_testing_analysis_dummies, duration_col='duration', event_col='chd_case',strata=['sex_Male'])
102 |         #cph1.fit(df_pheno_test, duration_col='CENSOR_AGE', event_col='PHENOTYPE', strata=['sex_Male'])
103 |         results = [col_name, cph1.hazard_ratios_[col_name], cph1.summary.loc[col_name, 'exp(coef) lower 95%'], cph1.summary.loc[col_name, 'exp(coef) upper 95%'], cph1.summary.loc[col_name, 'p']]
104 | 
105 |         #print(results)
106 | 
107 |         df_save = pd.concat([df_save, pd.DataFrame([{'Trait': results[0], 'HR': results[1], 'HR_low': results[2], 'HR_high': results[3], 'pvalue': results[4]}])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
108 | 
109 |     df_save = df_save.sort_values('pvalue')
110 |     df_save.to_csv(write_file, sep='\t', index=False)
111 | 
112 | 
--------------------------------------------------------------------------------
/03_collate_QTLs/01_collate_QTLs.R:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Load libraries/dependencies
3 | # --------------------------------------------------------------------------------------
4 | library(data.table)
5 | library(openxlsx)
6 | library(foreach)
7 | library(doMC)
8 | 
9 | # --------------------------------------------------------------------------------------
10 | # Set global script options
11 | # --------------------------------------------------------------------------------------
12 | 
13 | out_dir = "geno_files"
14 | ldthinned = "geno_files/genotype_data/ldthinned" # variant set to consider
15 | 
16 | # Data comes from other projects:
17 | soma_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/somalogic_proteomics/interval/gwas/BAKEOFF151001/gwas_output/imputed/somalogic/meta"
18 | nmr_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/nightingale_metabolomics/interval/gwas/nmr/results/HPC_results"
19 | metabo_GWAS = "/rds/project/jmmh2/rds-jmmh2-results/private/metabolomics/metabolon_hd4/interval_gwas/raw_results"
20 | olink_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/scallop/jp549/olink-merged-output"
21 |
olink_neu_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/interval_gwas_discovery/neu/interval_subset_olink/neuro/full_set/output/formatted_assoc_results"
22 | 
23 | ## ======================================================================================
24 | ## First, we want to load the summary statistics for all platforms, filter to the
25 | ## LD-thinned variant set, and add a basic filter of P < 0.01 - we want to find a
26 | ## reasonable P-value threshold across all platforms but need to load the summary stats
27 | ## for all measurements
28 | ## ======================================================================================
29 | 
30 | varset = foreach(chr_id = 1:22, .combine=rbind) %do% {
31 |   fread(sprintf("%s/impute_%s_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8.pvar", ldthinned, chr_id))
32 | }
33 | setnames(varset, c("chr", "pos", "id", "ref", "alt"))
34 | 
35 | ##' # --------------------------------------------------------------------------------------
36 | ##' # Load in the NMR GWAS summary stats
37 | ##' # ---------------------------------------------------------------------------------------
38 | ##' 
39 | ##' nmr_files = list.files(path=nmr_GWAS, pattern="*.gz$")
40 | ##' nmr_ss = foreach(ff = nmr_files, .combine=rbind) %do% {
41 | ##'   # Load from the summary stats variants with P < 0.01 (filtered using awk).
42 | ##'   # The header row is discarded because it is malformed.
43 | ##'   ss = fread(cmd = sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $11 < 0.01 ) { print $2,$3,$5,$6,$9,$11 } }\'', nmr_GWAS, ff))
44 | ##'   setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "beta", "pval"))
45 | ##'   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
46 | ##'   ss[, phenotype := gsub(".tar.gz", "", ff)]
47 | ##'   return(ss)
48 | ##' }
49 | ##' fwrite(nmr_ss, file=sprintf("%s/nightingale_p_less_0.1.txt", out_dir), sep="\t", quote=F)
50 | ##' rm(nmr_ss)
51 | ##' gc()
52 | ##' 
53 | ##' # --------------------------------------------------------------------------------------
54 | ##' # Load in the Metabolon HD4 GWAS summary stats
55 | ##' # ---------------------------------------------------------------------------------------
56 | ##' 
57 | ##' metabo_files = list.files(path=metabo_GWAS, pattern="*.gz$")
58 | ##' metabo_ss = foreach(ff = metabo_files, .combine=rbind) %do% {
59 | ##'   ss = fread(cmd = sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $9 < 0.01 ) { print $1,$2,$3,$7,$9 } }\'', metabo_GWAS, ff))
60 | ##'   setnames(ss, c("markername", "effect_allele", "other_allele", "beta", "pval"))
61 | ##'   ss[, chr := as.integer(gsub("chr", "", gsub(":.*", "", markername)))]
62 | ##'   ss[, pos := as.integer(gsub(":.*", "", gsub("chr.*?:", "", markername)))]
63 | ##'   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
64 | ##'   ss = ss[, .(chr, pos, effect_allele, other_allele, beta, pval)]
65 | ##'   ss[, phenotype := gsub("_.*", "", gsub("INTERVAL_", "", ff))]
66 | ##'   return(ss)
67 | ##' }
68 | ##' fwrite(metabo_ss, file=sprintf("%s/metabolon_p_less_0.1.txt", out_dir), sep="\t", quote=F)
69 | ##' rm(metabo_ss)
70 | ##' gc()
71 | 
72 | # --------------------------------------------------------------------------------------
73 | # Load in the Olink data
74 | # ---------------------------------------------------------------------------------------
75 | 
76 | olink_files = list.files(path=olink_GWAS, pattern="*.gz$")
77 | olink_ss = foreach(ff = olink_files, .combine=rbind) %do% {
78 |   ss = fread(cmd =
sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $22 < 0.01 ) { print $3,$4,$5,$6,$22 } }\'', olink_GWAS, ff))
79 |   setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "pval"))
80 |   ss[, chr := as.integer(chr)]
81 |   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
82 |   ss[, phenotype := gsub("_chr_merged.gz", "", gsub("INTERVAL_", "", ff))]
83 |   return(ss)
84 | }
85 | 
86 | olink_neu_files = list.files(path=olink_neu_GWAS, pattern="*.gz$")
87 | olink_neu_ss = foreach(ff = olink_neu_files, .combine=rbind) %do% {
88 |   ss = fread(cmd = sprintf('zcat %s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $22 < 0.01 ) { print $3,$4,$5,$6,$22 } }\'', olink_neu_GWAS, ff))
89 |   setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "pval"))
90 |   ss[, chr := as.integer(chr)]
91 |   ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
92 |   ss[, phenotype := paste0("neu_", gsub("_olink.*", "", ff))]
93 |   return(ss)
94 | }
95 | 
96 | olink_ss = rbind(olink_ss, olink_neu_ss)
97 | fwrite(olink_ss, file=sprintf("%s/olink_p_less_0.1.txt", out_dir), sep="\t", quote=F)
98 | rm(olink_ss, olink_neu_ss)
99 | gc()
100 | 
101 | # --------------------------------------------------------------------------------------
102 | # Load in the SomaLogic data
103 | # ---------------------------------------------------------------------------------------
104 | 
105 | soma_dirs = list.files(path=soma_GWAS)
106 | soma_ss = foreach(dd = soma_dirs, .combine=rbind) %do% {
107 |   soma_files = list.files(path=sprintf("%s/%s", soma_GWAS, dd), pattern="*.gz$")
108 |   foreach(ff = soma_files, .combine=rbind) %do% {
109 |     ss = fread(cmd = sprintf('zcat %s/%s/%s | tail -n +2 | awk \'BEGIN { OFS="\t" } { if ( $8 < -2 ) { print $1,$2,$4,$5,$8 } }\'', soma_GWAS, dd, ff))
110 |     setnames(ss, c("chr", "pos", "effect_allele", "other_allele", "pval"))
111 |     ss[, pval := 10^pval] # column 8 is log10(P), so the -2 filter above corresponds to P < 0.01
112 |     ss = ss[varset[, .(chr, pos)], on = .(chr, pos), nomatch=0]
113 |     ss[, phenotype := dd]
114 |     return(ss)
115 |   }
116 | }
117 | 
118 | fwrite(soma_ss, file=sprintf("%s/somalogic_p_less_0.1.txt", out_dir), sep="\t", quote=F)
119 | 
120 | 
121 | 
122 | 
--------------------------------------------------------------------------------
/LDpred2/LDpred2_auto.R:
--------------------------------------------------------------------------------
1 | library(bigsnpr)
2 | library(data.table)
3 | library(foreach)
4 | library(tictoc)
5 | library(ggplot2)
6 | library(bit64)
7 | 
8 | 
9 | setwd("/rds/project/jmmh2/rds-jmmh2-projects/inouye_lab_other/OmicsPred_LDpred_benchmarks")
10 | args = commandArgs(trailingOnly=TRUE)
11 | 
12 | # the trait name from the GWAS
13 | gwas <- args[1]
14 | 
15 | # Make sure we're running on an appropriate compute node to make use of the /ramdisks partition.
16 | # File-backed shared memory objects can only be used effectively when using the /ramdisks/ partition
17 | # as a temporary working directory, which is only available here on the skylake partitions.
18 | # Trying to use the lustre filesystem means LDpred2 grinds to a halt unless run on a single core
19 | # (and only one instance across the whole cluster) due to the consistency checks made by the lustre
20 | # filesystem on shared memory objects.
21 | 
22 | if (!(Sys.getenv("SLURM_JOB_PARTITION") %like% "skylake")) {
23 |   stop("Script must be run on a compute node on the skylake or skylake-himem partitions")
24 | }
25 | 
26 | 
27 | ## create ramdir
28 | starttime = Sys.time()
29 | ramdir <- "/ramdisks/ldpred2/"
30 | if (dir.exists(ramdir)) system(sprintf("rm -rf %s", ramdir), wait=TRUE)
31 | system(sprintf("mkdir -p %s", ramdir), wait=TRUE)
32 | 
33 | 
34 | # Copy genotype data to ramdisks
35 | system(sprintf("cp data/INTERVAL_genotypes/filtered_interval_chr*{rds,bk} %s", ramdir), wait=TRUE)
36 | 
37 | 
38 | gwas_file <- sprintf("data/GWAS_SumStats/gwas_filteredCleaned_%s.txt", gwas)
39 | stopifnot(file.exists(gwas_file))
40 | 
41 | # Setup output directory
42 | outdir <- sprintf("output/ldpred2/train/%s", gwas)
43 | system(sprintf("mkdir -p %s", outdir), wait=TRUE)
44 | 
45 | # Set up temporary directories - clean up if already exists
46 | tmpdir <- sprintf("tmp/ldpred2/%s", gwas)
47 | if (dir.exists(tmpdir)) system(sprintf("rm -rf %s", tmpdir), wait=TRUE)
48 | system(sprintf("mkdir -p %s", tmpdir), wait=TRUE)
49 | 
50 | ### Do per-SNP QC of GWAS summary statistics
51 | # Load the gwas_ss, match to the genotype data, and obtain per-SNP standard deviations and
52 | # allele frequencies for downstream SNP QC.
53 | gwas_ss <- fread(gwas_file)
54 | endtime = Sys.time()
55 | data_prep_time = as.numeric(difftime(endtime, starttime, units="secs"))
56 | 
57 | 
58 | # This loop needs at least 80GB of memory and takes ~16min to run
59 | tic("#1")
60 | starttime = Sys.time()
61 | gwas_ss <- foreach(this_chr = 1:22, .combine=rbind) %dopar% { # NB: no parallel backend is registered in this script, so %dopar% runs sequentially (with a warning)
62 |   cat("\nChromosome:", this_chr, "\n")
63 |   # Attach file-backed genotype data
64 |   geno <- snp_attach(sprintf("%s/filtered_interval_chr%s.rds", ramdir, this_chr))
65 | 
66 |   ## Match summary stats to genotype data. A few notes:
67 |   # - Summary stats for all GWAS have already been filtered to the HapMap3 variant
68 |   #   set that intersects with the variants in INTERVAL
69 |   # - Strand orientation of alleles has also already been harmonized to INTERVAL for
70 |   #   all GWAS
71 | 
72 |   cat("- snp_match \n")
73 |   map <- geno$map[-2]
74 |   names(map) <- c("chr", "rsid", "pos", "a0", "a1")
75 | 
76 |   # snp_match not needed as it's been done before
77 |   matched_snps <- snp_match(as.data.frame(gwas_ss[chr == this_chr]), map, strand_flip=FALSE)
78 |   setDT(matched_snps)
79 |   # matched_snps <- gwas_ss[chr == this_chr]
80 | 
81 |   cat("- allele frequency \n")
82 |   # Obtain allele frequencies in the training data - dosages count 'a0'
83 |   # this is the time consuming step
84 |   matched_snps[, a1freq := Matrix::colSums(geno$genotypes[, `_NUM_ID_`], na.rm=TRUE) / (Matrix::colSums(!is.na(geno$genotypes[, `_NUM_ID_`]))*2)]
85 |   matched_snps[, test := Matrix::colSums(geno$genotypes[, `_NUM_ID_`], na.rm=TRUE) ] # raw allele counts (kept for debugging)
86 | 
87 |   cat("- sd of allele frequency \n")
88 |   # Compute the expected standard deviation of the dosages under Hardy-Weinberg equilibrium
89 |   # See https://privefl.github.io/bigsnpr-extdoc/polygenic-scores-pgs.html
90 |   matched_snps[, sd_val := sqrt(2 * a1freq * (1 - a1freq))]
91 | 
92 |   # Return
93 |   return(matched_snps)
94 | }
95 | toc()
96 | endtime = Sys.time()
97 | geno_load_time = as.numeric(difftime(endtime, starttime, units="secs"))
98 | # about 20 mins
99 | 
100 | 
101 | # Do per-SNP QC of the summary stats.
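# A sketch of the logic behind the check implemented below, following the bigsnpr
# docs cited on the next lines: for a linear GWAS of trait y on dosage G_j,
# se(beta_j) ~= sd(y) / (sqrt(n) * sd(G_j)), so the dosage SD implied by the GWAS
# can be recovered as sd_ss = sd(y) / (beta_se * sqrt(n_eff)); sd(y) itself is
# estimated as median(sd_val * beta_se * sqrt(n_eff)) across SNPs. SNPs whose
# GWAS-implied SD disagrees with the training-data SD (sd_val = sqrt(2*f*(1-f)))
# are flagged in fail_qc; e.g. a SNP with sd_val = 0.5 but sd_ss = 0.2 would fail
# the sd_ss < 0.5 * sd_val rule. (Illustrative numbers only.)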
102 | # See https://privefl.github.io/bigsnpr-extdoc/polygenic-scores-pgs.html and
103 | # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8016455/
104 | 
105 | gwas_ss[sd_val > 0, sd_y_est := median(sd_val * beta_se * sqrt(n_eff))]
106 | gwas_ss[sd_val > 0, sd_ss := sd_y_est / (beta_se * sqrt(n_eff))]
107 | gwas_ss[, fail_qc := sd_ss < (0.5 * sd_val) | sd_ss > (sd_val + 0.1) | sd_ss < 0.1 | sd_val < 0.05]
108 | 
109 | gwas_qc <- gwas_ss[, .(rsid=rsid.ss, chr, pos, effect_allele=a1, other_allele=a0, gwas_beta=beta, gwas_se=beta_se,
110 |                        n_eff, trainingset_EAF=a1freq, sd_val, sd_y_est, sd_ss, fail_qc)]
111 | 
112 | fwrite(gwas_qc, sep="\t", quote=FALSE, compress="gzip", file=sprintf("%s/ldpred2_gwasqc.txt.gz", outdir))
113 | 
114 | 
115 | # Diagnostic plot
116 | g <- ggplot(gwas_qc, aes(x=sd_val, y=sd_ss, color=fail_qc)) +
117 |   theme_bigstatsr() +
118 |   geom_point(shape=19, size=0.5, alpha=0.5) +
119 |   geom_abline(intercept=0, slope=1, linetype=2, colour="red") +
120 |   scale_colour_manual(name="SNP failed ldpred2 QC", values=c("TRUE"="purple", "FALSE"="yellow")) +
121 |   xlab("SNP dosage SD in training dataset") +
122 |   ylab("SNP dosage SD in GWAS") +
123 |   theme(legend.position="bottom")
124 | ggsave(g, width=7.2, height=6, file=sprintf("%s/ldpred2_gwasqc_%s.png", outdir, gwas))
125 | 
126 | 
127 | tic("#2") # ~10min (or 40min sometimes...)
128 | starttime = Sys.time()
129 | for (this_chr in 1:22) {
130 |   cat("\nLoading correlation matrix for chromosome ", this_chr, "\n")
131 | 
132 |   # Load correlation matrix precomputed for all candidate variants
133 |   corr_file <- sprintf("data/ldpred2/filtered_interval_ldcorr_chr%s.rds", this_chr)
134 |   stopifnot(file.exists(corr_file))
135 |   corr0 <- readRDS(corr_file)
136 | 
137 |   # Filter to those passing QC for this GWAS
138 |   cat("- Filter out failed QC \n")
139 |   corr0 <- gwas_ss[chr == this_chr & !(fail_qc), corr0[`_NUM_ID_`, `_NUM_ID_`]]
140 | 
141 |   # Compute LD score
142 |   cat("- Compute LD score \n")
143 |   gwas_ss[chr == this_chr & !(fail_qc), LDsum := Matrix::colSums(corr0^2)]
144 | 
145 |   foo <- sum(is.na(gwas_ss[chr == this_chr & !(fail_qc),]$LDsum))
146 |   if (foo > 0) {
147 |     cat("(!)
LDsum missing:", foo) 148 | } 149 | 150 | # Aggregate into a single sparse big matrix 151 | if (this_chr == 1) { 152 | cat("Initialized SFBM\n") 153 | genocorr <- as_SFBM(corr0, backingfile=sprintf("%s/ldcorr_passqc", tmpdir), compact = TRUE) 154 | } else { 155 | cat("Adding matrix to SFBM\n") 156 | genocorr$add_columns(corr0, nrow(genocorr)) 157 | } 158 | } 159 | toc() 160 | endtime= Sys.time() 161 | corr_load_time = as.numeric(difftime(endtime, starttime, units="secs")) 162 | 163 | 164 | cat("Moving SFBM backing file to /ramdisks\n") 165 | system(sprintf("cp %s/ldcorr_passqc.sbk %s/", tmpdir, ramdir), wait=TRUE) 166 | system(sprintf("rm %s/ldcorr_passqc.sbk", tmpdir), wait=TRUE) 167 | system(sprintf("ln -s %s/ldcorr_passqc.sbk %s/", ramdir, tmpdir), wait=TRUE) 168 | 169 | ## Calculate LDSC results (~3min) 170 | tic("LDSC") 171 | starttime = Sys.time() 172 | ldsc <- gwas_ss[!(fail_qc), snp_ldsc( 173 | ld_score = LDsum, ld_size = .N, 174 | chi2 = (beta / beta_se)^2, 175 | sample_size = n_eff, 176 | ncores = 1 177 | )] 178 | toc() 179 | endtime= Sys.time() 180 | ldsc_cal_time = as.numeric(difftime(endtime, starttime, units="secs")) 181 | 182 | #save ldsc estimates 183 | write.csv(ldsc,sprintf("%s/ldsc_results.csv", outdir),quote=FALSE) 184 | 185 | # Extract estimated heritability 186 | h2_est <- ldsc[["h2"]] 187 | 188 | # assgin a small heritability estimate when a negative value is returned in ldsc 189 | if (h2_est<0){ 190 | h2_est=0.001 191 | } 192 | 193 | 194 | ### Run auto model 195 | cat("Running auto model\n") 196 | 197 | tic("Auto model") 198 | starttime = Sys.time() 199 | multi_auto <- snp_ldpred2_auto( 200 | genocorr, gwas_ss[!(fail_qc)], h2_init = h2_est, allow_jump_sign=FALSE, 201 | vec_p_init = seq_log(1e-4, 0.2, length.out = 30), 202 | ncores = nb_cores() 203 | ) 204 | toc() 205 | endtime= Sys.time() 206 | ldpred_train_time = as.numeric(difftime(endtime, starttime, units="secs")) 207 | 208 | 209 | 210 | 211 | # check for "chain" convergence 212 | auto_params <- rbindlist(lapply(multi_auto, function(x) { 213 | data.table(p_init = x$p_init, h2_init = x$h2_init, p_est = x$p_est, h2_est = x$h2_est) 214 | })) 215 | auto_params[, paramset := .I] 216 | 217 | auto_path <- foreach(pIdx = seq_along(multi_auto), .combine=rbind) %do% { 218 | auto = multi_auto[[pIdx]] 219 | data.table(paramset = pIdx, path_iter = seq_along(auto$path_p_est), 220 | p_est = auto$path_p_est, h2_est = auto$path_h2_est) 221 | } 222 | 223 | g1 <- ggplot(auto_path) + aes(x = path_iter, y=p_est) + 224 | theme_bigstatsr() + 225 | geom_hline(data = auto_params, aes(yintercept=p_est), col="blue") + 226 | geom_point(shape=19, size=0.5) + 227 | scale_y_log10(name="p") + xlab("") + 228 | facet_wrap(~ paramset, ncol=10, labeller = label_both) + 229 | theme(strip.background=element_blank(), strip.text=element_text(size=6), 230 | axis.text=element_text(size=6), axis.title=element_text(size=10)) 231 | 232 | g2 <- ggplot(auto_path) + aes(x = path_iter, y=h2_est) + 233 | theme_bigstatsr() + 234 | geom_hline(data = auto_params, aes(yintercept=h2_est), col="blue") + 235 | geom_point(shape=19, size=0.5) + 236 | ylab("h2") + xlab("") + 237 | facet_wrap(~ paramset, ncol=10, labeller = label_both) + 238 | theme(strip.background=element_blank(), strip.text=element_text(size=6), 239 | axis.text=element_text(size=6), axis.title=element_text(size=10)) 240 | 241 | g <- plot_grid(g1, g2, nrow=2) 242 | ggsave(g, width=20, height=10, units="in", file=sprintf("%s/ldpred2_auto_chain_convergence.png", outdir)) 243 | 244 | 245 | ## select 
genetic score models to keep
246 | # and use the mean of the betas of these selected models as the betas of the final genetic score model with LDpred2-auto
247 | # see https://privefl.github.io/bigsnpr/articles/LDpred2.html
248 | (range <- sapply(multi_auto, function(auto) diff(range(auto$corr_est))))
249 | (keep <- (range > (0.95 * quantile(range, 0.95))))
250 | beta_auto <- rowMeans(sapply(multi_auto[keep], function(auto) auto$beta_est))
251 | 
252 | 
253 | pgs_auto <- foreach(this_chr = 1:22, .combine=`+`) %do% {
254 |   geno <- snp_attach(sprintf("%s/filtered_interval_chr%s.rds", ramdir, this_chr))
255 |   geno <- snp_fastImputeSimple(geno$genotypes, ncores=nb_cores()) # big_prodVec() doesn't work with missing genotypes, so impute them first
256 |   big_prodVec(
257 |     X = geno,
258 |     y.col = beta_auto[gwas_ss[!(fail_qc), which(chr == this_chr)]],
259 |     ind.col = gwas_ss[!(fail_qc) & chr == this_chr, `_NUM_ID_`],
260 |     ncores = nb_cores()
261 |   )
262 | }
263 | 
264 | # get all sample IDs of the INTERVAL data
265 | geno <- snp_attach(sprintf("%s/filtered_interval_chr%s.rds", ramdir, 1))
266 | sample_IDs = geno$fam$sample.ID
267 | 
268 | # save the calculated genetic scores of the LDpred2-auto model for INTERVAL individuals
269 | auto_pgs_data = data.table(sample_IDs, pgs_auto)
270 | write.table(auto_pgs_data, sprintf("%s/ldpred2_auto_sample_pgs.csv", outdir), quote=FALSE, sep="\t", row.names=FALSE)
271 | 
272 | # save the genetic score model developed using LDpred2-auto
273 | gwas_ss_sub = gwas_ss[!(fail_qc)]
274 | gwas_ss_sub$auto_beta = beta_auto
275 | auto_pgs_model = gwas_ss_sub[,c('rsid.ss','chr','pos','a1','a0',"auto_beta")]
276 | colnames(auto_pgs_model) = c("rsid",'chr','pos','effect_allele','other_allele','effect')
277 | write_file = sprintf("%s/ldpred2_auto_pgs_model.txt", outdir)
278 | write.table(auto_pgs_model, write_file, quote=FALSE, sep="\t", row.names=FALSE)
279 | 
280 | # save running times at each stage of the genetic score development with LDpred2-auto
281 | values = c(data_prep_time, geno_load_time, corr_load_time, ldsc_cal_time, ldpred_train_time)
282 | time_name = c('data_prep_time','geno_load_time','corr_load_time','ldsc_cal_time','ldpred_train_time')
283 | df_time <- data.frame(time_name, values)
284 | write_file = sprintf("%s/running_time.txt", outdir)
285 | write.table(df_time, write_file, quote=FALSE, sep="\t", row.names=FALSE)
286 | 
287 | # save the estimated heritability of all the genetic score models trained with LDpred2-auto above
288 | Kept = keep
289 | Heritability = sapply(multi_auto, `[[`, "h2_est")
290 | df_heri <- data.frame(Kept, Heritability)
291 | write_file = sprintf("%s/auto_model_heritability.txt", outdir)
292 | write.table(df_heri, write_file, quote=FALSE, sep="\t", row.names=FALSE)
293 | 
294 | 
295 | 
296 | ### Clean up
297 | 
298 | system(sprintf("rm -rf %s", tmpdir), wait=TRUE)
299 | system(sprintf("rm -rf %s", ramdir), wait=TRUE)
--------------------------------------------------------------------------------
/04_extract_QTLs/01_extract_QTLs.R:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------------------------------------
2 | # Load libraries/dependencies
3 | # --------------------------------------------------------------------------------------
4 | library(data.table)
5 | library(foreach)
6 | library(doMC)
7 | library(AnnotationHub) # Bioconductor package
8 | library(annotables)    # remotes::install_github("stephenturner/annotables")
9 | 
10 | args = commandArgs(trailingOnly=TRUE)
11 |
trans_pthresh = as.numeric(args[1])
12 | cis_pthresh = as.numeric(args[2])
13 | 
14 | if (is.na(trans_pthresh) || trans_pthresh > 0.001 || trans_pthresh < 0) {
15 |   stop("Trans/genome-wide P-value threshold must be <= 0.001")
16 | }
17 | 
18 | if (is.na(cis_pthresh) || cis_pthresh > 1 || cis_pthresh < 0) {
19 |   stop("Cis P-value threshold must be between 0 and 1")
20 | }
21 | 
22 | 
23 | ncores = 25
24 | 
25 | parallelise_fread = function() {
26 |   setDTthreads(ncores)
27 |   registerDoMC(1)
28 | }
29 | 
30 | parallelise_foreach = function() {
31 |   setDTthreads(1)
32 |   registerDoMC(ncores)
33 | }
34 | 
35 | # --------------------------------------------------------------------------------------
36 | # Set paths
37 | # --------------------------------------------------------------------------------------
38 | 
39 | out_dir = "geno_files/ml_inputs"
40 | geno_dir = "geno_files/genotype_data/ldthinned" # variant set to consider
41 | trait_dir = "/rds/project/jmmh2/rds-jmmh2-projects/polygenic/internal/interval_grs_scan/analyses/processed_traits"
42 | trait_dir2 = "/rds/project/jmmh2/rds-jmmh2-projects/polygenic/internal/INTERVAL_gwasqc_technical_covariates_only/qced"
43 | 
44 | olink_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/scallop/jp549/olink-merged-output"
45 | olink_neu_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/olink_proteomics/interval_gwas_discovery/neu/interval_subset_olink/neuro/full_set/output/formatted_assoc_results"
46 | soma_GWAS = "/rds/project/jmmh2/rds-jmmh2-projects/somalogic_proteomics/interval/gwas/BAKEOFF151001/gwas_output/imputed/somalogic/meta"
47 | 
48 | processed_GWAS = "geno_files"
49 | 
50 | tmpdir = sprintf("%s/tmpdir", out_dir)
51 | dir.create(tmpdir, showWarnings=FALSE)
52 | 
53 | # Define high-complexity regions used to extend 1MB cis windows when handling protein cis-QTLs
54 | # See flashpca exclusion regions: https://github.com/gabraham/flashpca
55 | # Coordinates are HG19
56 | complex_ld <- data.table(
57 |   region_chr=c(5, 6, 8, 11),
58 |   region_start=c(44000000, 25000000, 8000000, 45000000),
59 |   region_end=c(51500000, 33500000, 12000000, 57000000),
60 |   region_name=c("r1", "MHC", "r3", "r4")
61 | )
62 | 
63 | 
64 | ## ======================================================================================
65 | ## First, we want to load the summary statistics for all platforms, filter to the
66 | ## LD-thinned variant set, and add a basic filter of P < 0.01 - we want to find a
67 | ## reasonable P-value threshold across all platforms but need to load the summary stats
68 | ## for all measurements
69 | ## ======================================================================================
70 | 
71 | parallelise_fread()
72 | varset = foreach(chr_id = 1:22, .combine=rbind) %do% {
73 |   fread(sprintf("%s/impute_%s_interval_dedup_unambig_SNPs_maf0.005_ldthin0.8.pvar", geno_dir, chr_id))
74 | }
75 | setnames(varset, c("chr", "pos", "id", "ref", "alt"))
76 | setkey(varset, chr, pos)
77 | 
78 | # --------------------------------------------------------------------------------------
79 | # Load in NMR GWAS results, filter to P < trans_pthresh and output variant effect files
80 | # --------------------------------------------------------------------------------------
81 | 
82 | if (!file.exists(sprintf("%s/Nightingale_phenotype_info.txt", out_dir))) {
83 |   nmr_SS = fread(sprintf("%s/nightingale_p_less_0.1.txt", processed_GWAS))
84 |   nmr_SS = nmr_SS[pval < trans_pthresh]
85 | 
86 |   # Make sure we've accurately filtered to varset snps
87 |   nmr_SS = rbind(
88 |
nmr_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
89 |     nmr_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
90 |   )
91 |   nmr_SS = nmr_SS[order(pos)][order(chr)][order(phenotype)]
92 | 
93 |   nmr_SS[varset, on = .(chr, pos), rsid := id] # add rsid
94 |   nmr_SS = nmr_SS[, .SD[which.min(pval)], by=.(chr, pos, phenotype)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
95 | 
96 |   # Load phenotype data and info
97 |   nmr_info = fread(sprintf("%s/nmr_metabolomics/trait_info.tsv", trait_dir))
98 |   nmr_pheno = fread(sprintf("%s/nmr_metabolomics/traits.tsv", trait_dir))
99 | 
100 |   # Some of the trait names have "_" at the end (measurements with %). Since we use "_" as
101 |   # a file name separator, we'll just replace these
102 |   nmr_info[, variable := gsub("_", ".pct", variable)]
103 |   nmr_pheno[, variable := gsub("_", ".pct", variable)]
104 |   nmr_SS[, phenotype := gsub("_", ".pct", phenotype)]
105 | 
106 |   # Filter phenotype data
107 |   nmr_pheno = nmr_pheno[!is.na(value)]
108 | 
109 |   # Fix column names:
110 |   setnames(nmr_info, "variable", "PhenotypeCompName")
111 |   setnames(nmr_pheno, "variable", "PhenotypeCompName")
112 |   setnames(nmr_SS, "phenotype", "PhenotypeCompName")
113 | 
114 |   # Filter info sheet and phenotype data to measurements
115 |   # with at least 1 variant passing the P-value threshold
116 |   nmr_info = nmr_info[PhenotypeCompName %chin% nmr_SS$PhenotypeCompName]
117 |   nmr_pheno = nmr_pheno[PhenotypeCompName %chin% nmr_SS$PhenotypeCompName]
118 | 
119 |   # write out variant effects file for each phenotype
120 |   foreach(phen_id = unique(nmr_SS$PhenotypeCompName)) %do% {
121 |     fwrite(nmr_SS[PhenotypeCompName == phen_id, .(rsid, chr, pos, effect_allele, other_allele, effect=beta, pval)],
122 |            sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))
123 |   }
124 | 
125 |   # Write out info sheet for NMR variables:
126 |   if (nrow(nmr_info) > 0) {
127 |     fwrite(nmr_info[, .(PhenotypeCompName, Name, Description, Units, Group, Sub.Group)],
128 |            sep="\t", quote=FALSE, file=sprintf("%s/Nightingale_phenotype_info.txt", out_dir))
129 |   }
130 | 
131 |   # Free objects to free memory
132 |   rm(nmr_info, nmr_SS)
133 |   gc()
134 | } else {
135 |   cat("Nightingale NMR GWAS summary statistics filtered by previous run.
Loading and filtering phenotype data...\n")
136 |   nmr_info = fread(sprintf("%s/Nightingale_phenotype_info.txt", out_dir))
137 |   nmr_pheno = fread(sprintf("%s/nmr_metabolomics/traits.tsv", trait_dir))
138 |   nmr_pheno[, variable := gsub("_", ".pct", variable)]
139 |   nmr_pheno = nmr_pheno[!is.na(value)]
140 |   setnames(nmr_pheno, "variable", "PhenotypeCompName")
141 |   nmr_pheno = nmr_pheno[PhenotypeCompName %chin% nmr_info$PhenotypeCompName]
142 |   rm(nmr_info); gc()
143 | }
144 | 
145 | # --------------------------------------------------------------------------------------
146 | # Do the same for the Metabolon data
147 | # ---------------------------------------------------------------------------------------
148 | 
149 | if (!file.exists(sprintf("%s/Metabolon_phenotype_info.txt", out_dir))) {
150 |   metabo_SS = fread(sprintf("%s/metabolon_p_less_0.1.txt", processed_GWAS))
151 |   metabo_SS = metabo_SS[pval < trans_pthresh]
152 | 
153 |   # Make sure we've accurately filtered to varset snps
154 |   metabo_SS = rbind(
155 |     metabo_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
156 |     metabo_SS[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
157 |   )
158 |   metabo_SS = metabo_SS[order(pos)][order(chr)][order(phenotype)]
159 | 
160 |   metabo_SS[varset, on = .(chr, pos), rsid := id] # add rsid
161 |   metabo_SS = metabo_SS[, .SD[which.min(pval)], by=.(chr, pos, phenotype)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
162 | 
163 |   # Load phenotype data and info
164 |   metabo_info = fread(sprintf("%s/metabolon_metabolomics/trait_info.tsv", trait_dir))
165 |   metabo_pheno = fread(sprintf("%s/metabolon_metabolomics/traits.tsv", trait_dir))
166 | 
167 |   # Make the variable name in the phenotype data table match the GWAS
168 |   metabo_pheno[, variable := gsub("^m", "M", variable)]
169 | 
170 |   # Filter phenotype data
171 |   metabo_pheno = metabo_pheno[!is.na(value)]
172 | 
173 |   # Filter info table to analysed metabolites:
174 |   metabo_info = metabo_info[comp_id %in% metabo_pheno$variable]
175 | 
176 |   # Fix column names:
177 |   setnames(metabo_info, "comp_id", "PhenotypeCompName")
178 |   setnames(metabo_pheno, "variable", "PhenotypeCompName")
179 |   setnames(metabo_SS, "phenotype", "PhenotypeCompName")
180 | 
181 |   # Filter info sheet and phenotype data to measurements
182 |   # with at least 1 variant passing the P-value threshold
183 |   metabo_info = metabo_info[PhenotypeCompName %chin% metabo_SS$PhenotypeCompName]
184 |   metabo_pheno = metabo_pheno[PhenotypeCompName %chin% metabo_SS$PhenotypeCompName]
185 | 
186 |   # write out variant effects file for each phenotype
187 |   foreach(phen_id = unique(metabo_SS$PhenotypeCompName)) %do% {
188 |     fwrite(metabo_SS[PhenotypeCompName == phen_id, .(rsid, chr, pos, effect_allele, other_allele, effect=beta, pval)],
189 |            sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))
190 |   }
191 | 
192 |   # Write out info sheet
193 |   if (nrow(metabo_info) > 0) {
194 |     fwrite(metabo_info[, .(PhenotypeCompName, MASS, RI, BIOCHEMICAL, SUPER_PATHWAY, SUB_PATHWAY)],
195 |            sep="\t", quote=FALSE, file=sprintf("%s/Metabolon_phenotype_info.txt", out_dir))
196 |   }
197 | 
198 |   # Free objects to free memory
199 |   rm(metabo_info, metabo_SS)
200 |   gc()
201 | } else {
202 |   cat("Metabolon HD4 GWAS summary statistics filtered by previous run.
Loading and filtering phenotype data...\n")
203 |   metabo_info = fread(sprintf("%s/Metabolon_phenotype_info.txt", out_dir))
204 |   metabo_pheno = fread(sprintf("%s/metabolon_metabolomics/traits.tsv", trait_dir))
205 |   metabo_pheno[, variable := gsub("^m", "M", variable)]
206 |   metabo_pheno = metabo_pheno[!is.na(value)]
207 |   setnames(metabo_pheno, "variable", "PhenotypeCompName")
208 |   metabo_pheno = metabo_pheno[PhenotypeCompName %chin% metabo_info$PhenotypeCompName]
209 |   rm(metabo_info); gc()
210 | }
211 | 
212 | # --------------------------------------------------------------------------------------
213 | # Olink data is slightly more complicated: we also need to load the cis-region for each
214 | # protein, and to average proteins measured on multiple platforms
215 | # ---------------------------------------------------------------------------------------
216 | 
217 | if (!file.exists(sprintf("%s/Olink_phenotype_info.txt", out_dir))) {
218 |   olink_SS = fread(sprintf("%s/olink_p_less_0.1.txt", processed_GWAS))
219 | 
220 |   # Load phenotype data and info
221 |   olink_info = fread(sprintf("%s/olink_proteins/trait_info.tsv", trait_dir))
222 |   olink_pheno = fread(sprintf("%s/olink_proteins/traits.tsv", trait_dir))
223 | 
224 |   # Make variable names match across data.tables
225 |   olink_SS[, varmatch := tolower(phenotype)]
226 |   olink_SS[, varmatch := gsub("\\.", "", varmatch)]
227 |   olink_SS[, varmatch := gsub("--", "", varmatch)]
228 |   olink_SS[, varmatch := gsub("^inf1", "inf", varmatch)]
229 |   olink_SS[varmatch %like% "inf_dner___q8nft8", varmatch := "inf_dner___q8nft8"] # trailing whitespace
230 |   olink_SS[varmatch == "inf_4ebp1___q13541", varmatch := "inf_ebp1___q13541"]
231 | 
232 |   # Olink proteins are unique by UniProt identifier, so we will use this as the unique
233 |   # phenotype ID. Some fixes are needed.
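  # For illustration (hypothetical values): a multi-protein UniProt entry such as
  # "P11111;P22222" becomes the identifier "P11111.P22222" after the gsub below,
  # while entries with an empty UniProt field fall back to their Olink_id
  # (e.g. "OID00195", which is also annotated manually further down).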
234 |   olink_info[, PhenotypeCompName := UniProt]
235 |   olink_info[, PhenotypeCompName := gsub(";", ".", PhenotypeCompName)]
236 |   olink_info[PhenotypeCompName == "", PhenotypeCompName := Olink_id]
237 | 
238 |   # Add unique identifier to olink_SS table so we have the full list of genome-wide P < trans_pthresh SNPs for
239 |   # each protein
240 |   olink_SS[olink_info, on = .(varmatch=variable), PhenotypeCompName := i.PhenotypeCompName]
241 | 
242 |   # Obtain the genomic location of each protein
243 |   loc = olink_info[, .(UniProt = strsplit(UniProt, "\\.")[[1]]), by=.(PhenotypeCompName)]
244 | 
245 |   system("mkdir -p $HOME/.cache/AnnotationHub", wait=TRUE) # so we don't get an interactive prompt below
246 |   ah = AnnotationHub()
247 |   orgdb = query(ah, c("OrgDb", "org.Hs.eg.db"))[[1]]
248 |   txdb <- query(ah, c("TxDB", "TxDb.Hsapiens.UCSC.hg19.knownGene"))[[1]]
249 | 
250 |   up2gene = select(orgdb, unique(loc$UniProt), c("UNIPROT", "GENENAME", "SYMBOL", "ENTREZID"), "UNIPROT")
251 |   setDT(up2gene)
252 |   up2gene[, ENTREZID := as.integer(ENTREZID)]
253 |   up2gene = up2gene[!is.na(ENTREZID)]
254 |   gene2loc = as.data.table(grch37)
255 |   gene2loc = gene2loc[, .(entrez, chr, start)]
256 |   up2loc = up2gene[gene2loc, on = .(ENTREZID=entrez), nomatch=0]
257 |   up2loc = up2loc[!grepl("_", chr)]
258 |   loc[up2loc, on = .(UniProt=UNIPROT), c("chr", "start") := .(chr, start)]
259 | 
260 |   # Manually annotate a few missing ones by looking up the UniProt entry
261 |   # and cross-referencing with NCBI Gene
262 |   loc[UniProt == "Q8WWJ7", c("chr", "start") := .(11, 60739113)]
263 |   loc[UniProt == "Q8NF90", c("chr", "start") := .(4, 81187742)]
264 |   loc[UniProt == "P16284", c("chr", "start") := .(17, 62396775)]
265 |   loc = rbind(loc, data.table(PhenotypeCompName = "OID00195", UniProt=NA, chr=1, start=11917521))
266 | 
267 |   # Add to olink info table
268 |   olink_info[loc, on = .(PhenotypeCompName), c("chr", "TSS") := .(paste(i.chr, collapse="|"), paste(i.start, collapse="|")), by=.EACHI]
269 | 
270 |   # Now load summary stats for all SNPs passing P < trans_pthresh genome-wide and
271 |   # cis-SNPs at P < cis_pthresh for each protein
272 |   parallelise_foreach()
273 |   foreach(phen_id = unique(olink_SS$PhenotypeCompName), .combine=c) %dopar% {
274 |     if (file.exists(sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))) {
275 |       cat("Olink protein", phen_id, "already filtered at P <", trans_pthresh, "(trans),", cis_pthresh, "(cis) by previous run, skipping.\n")
276 |       return(NULL)
277 |     }
278 |     gc()
279 |     phen_ss = foreach(var_id = olink_SS[PhenotypeCompName == phen_id, unique(phenotype)], .combine=rbind) %do% {
280 |       # Load full summary stats
281 |       varmatch = olink_SS[PhenotypeCompName == phen_id & phenotype == var_id, unique(varmatch)]
282 |       panel = olink_info[variable == varmatch, panel]
283 |       if (panel == "neu") {
284 |         this_ss = fread(sprintf("%s/%s_olink_neuro_full_set_autosomal_imputed_all_chrs_combined.snptest.out.gz", olink_neu_GWAS, gsub("^neu_", "", var_id)), tmpdir=tmpdir)
285 |       } else {
286 |         this_ss = fread(sprintf("%s/INTERVAL_%s_chr_merged.gz", olink_GWAS, var_id), tmpdir=tmpdir)
287 |       }
288 |       this_ss = this_ss[, .(chr=chromosome, pos=position, effect_allele=alleleB, other_allele=alleleA, effect=frequentist_add_beta_1, pval=frequentist_add_pvalue)]
289 |       this_ss = this_ss[pval < pmax(trans_pthresh, cis_pthresh)]
290 | 
291 |       # Filter to varset snps
292 |       if (nrow(this_ss) > 0) {
293 |         this_ss = rbind(
294 |
this_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
295 |           this_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
296 |         )
297 |         this_ss = this_ss[!is.na(effect)]
298 |       }
299 |       return(this_ss)
300 |     }
301 | 
302 |     # R, do you even garbage collect? WTF.
303 |     if(exists("this_ss")) { rm(this_ss) }
304 |     gc()
305 | 
306 |     # Remove variants that did not pass the p-value threshold for all panels for each protein
307 |     npan = olink_info[PhenotypeCompName == phen_id, length(unique(phenotype))]
308 |     varn = phen_ss[,.N, by=.(chr, pos, effect_allele, other_allele)]
309 |     pass = varn[N == npan]
310 |     pass[,N := NULL]
311 |     phen_ss = phen_ss[pass, on = .(chr, pos, effect_allele, other_allele)]
312 | 
313 |     # average across measurements
314 |     phen_ss = phen_ss[, .(effect=mean(effect), pval=mean(pval)), by = .(chr, pos, effect_allele, other_allele)]
315 | 
316 |     # Get SNPs at genome-wide P < trans_pthresh
317 |     gw = phen_ss[pval < trans_pthresh]
318 | 
319 |     # Get cis-SNPs with P < cis_pthresh
320 |     chrs = strsplit(olink_info[PhenotypeCompName == phen_id, unique(chr)], "\\|")[[1]]
321 |     starts = strsplit(olink_info[PhenotypeCompName == phen_id, unique(TSS)], "\\|")[[1]]
322 |     cis = foreach(idx = seq_along(chrs), .combine=rbind) %do% {
323 |       window = data.table(chr=as.integer(chrs[idx]), TSS=as.integer(starts[idx]))
324 |       window[, start := pmax(0, TSS - 1e6)]
325 |       window[, end := TSS + 1e6]
326 |       window[complex_ld, on = .(chr=region_chr, start<=region_end, start>=region_start), start := region_start] # pull the window start down to cover an overlapping high-complexity region
327 |       window[complex_ld, on = .(chr=region_chr, end<=region_end, end>=region_start), end := region_end] # likewise push the window end up (e.g. a TSS at chr6:30Mb gives [29Mb, 31Mb], extended to [25Mb, 33.5Mb] across the MHC)
328 |       phen_ss[window, on = .(chr, pos>=start, pos<=end), .(chr, pos=x.pos, effect_allele, other_allele, effect, pval)]
329 |     }
330 |     cis = cis[pval < cis_pthresh]
331 | 
332 |     # remove full phen_ss object to free up memory
333 |     rm(phen_ss)
334 |     gc()
335 | 
336 |     phen_ss = unique(rbind(gw, cis))
337 | 
338 |     # If any QTLs, proceed
339 |     if (nrow(phen_ss) > 0) {
340 | 
341 |       phen_ss = phen_ss[, .SD[which.min(pval)], by=.(chr, pos)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
342 |       phen_ss[varset, on = .(chr, pos), rsid := id] # add rsid
343 |       phen_ss = phen_ss[, .(rsid, chr, pos, effect_allele, other_allele, effect, pval)][order(pos)][order(chr)]
344 | 
345 |       # write out
346 |       fwrite(phen_ss, sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))
347 | 
348 |       # Free up memory and garbage collect
349 |       rm(phen_ss)
350 |       gc()
351 | 
352 |       return(phen_id)
353 |     }
354 |   }
355 |   parallelise_fread()
356 |   gc()
357 | 
358 |   # Filter phenotype data
359 |   olink_pheno = olink_pheno[!is.na(value)]
360 | 
361 |   # Average phenotype data across different platform measures:
362 |   olink_pheno[olink_info, on = .(variable), PhenotypeCompName := PhenotypeCompName]
363 |   olink_pheno = olink_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]
364 | 
365 |   # Filter info sheet and phenotype data to measurements
366 |   # with at least 1 variant passing the P-value threshold
367 |   olink_info = olink_info[PhenotypeCompName %chin% unique(olink_SS$PhenotypeCompName)]
368 |   olink_pheno = olink_pheno[PhenotypeCompName %chin% unique(olink_SS$PhenotypeCompName)]
369 | 
370 |   # Make sure info table has one entry per phenotype and write out
371 |   if (nrow(olink_info) > 0) {
372 |     olink_info = olink_info[, .(panels = paste(panel, collapse=","), protein = paste(unique(protein), collapse="/")),
373 |                             by = .(PhenotypeCompName, UniProt, chr, TSS)]
374 |     fwrite(olink_info[, .(PhenotypeCompName, UniProt, protein, chr, TSS, panels)],
375 |            sep="\t", quote=FALSE, file=sprintf("%s/Olink_phenotype_info.txt", out_dir))
376 |   }
377 | 
378 |   # Free objects to free memory
379 |   rm(olink_info, olink_SS)
380 |   gc()
381 | } else {
382 |   cat("Olink protein GWAS summary statistics filtered by previous run. Loading and filtering phenotype data...\n")
383 |   olink_info = fread(sprintf("%s/Olink_phenotype_info.txt", out_dir))
384 |   olink_info_full = fread(sprintf("%s/olink_proteins/trait_info.tsv", trait_dir))
385 |   olink_pheno = fread(sprintf("%s/olink_proteins/traits.tsv", trait_dir))
386 |   olink_info = olink_info[, .(protein=strsplit(protein, "/")[[1]], panel = strsplit(panels, ",")[[1]]), by=.(PhenotypeCompName, UniProt)]
387 |   olink_info[olink_info_full, on = .(UniProt, protein), variable := i.variable]
388 |   olink_pheno = olink_pheno[!is.na(value)]
389 |   olink_pheno[olink_info, on = .(variable), PhenotypeCompName := PhenotypeCompName]
390 |   olink_pheno = olink_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]
391 |   olink_pheno = olink_pheno[PhenotypeCompName %chin% olink_info$PhenotypeCompName]
392 |   rm(olink_info); gc()
393 | }
394 | 
395 | # --------------------------------------------------------------------------------------
396 | # Do the same for SomaLogic data. Only this time, we don't have a list of P < 0.01
397 | # GWAS summary stats to work from.
398 | # --------------------------------------------------------------------------------------
399 | 
400 | if (!file.exists(sprintf("%s/Somalogic_phenotype_info.txt", out_dir))) {
401 |   # Load phenotype data and info
402 |   soma_info = fread(sprintf("%s/somalogic_proteins/trait_info.tsv", trait_dir))
403 |   soma_pheno = fread(sprintf("%s/somalogic_proteins/traits.tsv", trait_dir))
404 | 
405 |   # Remove bad aptamers and make unique protein names
406 |   soma_info = soma_info[Type == "Protein"]
407 |   soma_info = soma_info[, .(SeqId, SOMAMER_ID, variable, Target, TargetFullName, UniProt=UniProt.Id.Current.at.Uniprot, Gene=Gene.Name, chr, TSS=start)]
408 | 
409 |   soma_info[, PhenotypeCompName := gsub("\\..*", "", SOMAMER_ID)]
410 | 
411 |   # Some are not unique with respect to different Full Target Names, so we need to fix these:
412 |   soma_info[PhenotypeCompName == "VEGFA", PhenotypeCompName := Target]
413 |   soma_info[PhenotypeCompName == "PLG", PhenotypeCompName := Target]
414 |   soma_info[PhenotypeCompName == "C3", PhenotypeCompName := Target]
415 |   soma_info[PhenotypeCompName == "C4A", PhenotypeCompName := Target]
416 |   soma_info[Target == "C5b, 6 Complex", PhenotypeCompName := "C5b"]
417 |   soma_info[Target == "MPIF-1", PhenotypeCompName := "MPIF.1"]
418 |   soma_info[Target == "Ck-b-8-1", PhenotypeCompName := "Ck.b.8.1"]
419 |   soma_info[PhenotypeCompName == "CGA", PhenotypeCompName := Target]
420 |   soma_info[PhenotypeCompName == "Luteinizing hormone", PhenotypeCompName := "CGA.LHB"]
421 |   soma_info[PhenotypeCompName == "Glycoprotein hormones a-chain", PhenotypeCompName := "CGA"]
422 |   soma_info[Target == "SCGF-beta", PhenotypeCompName := "SCGFb"]
423 |   soma_info[Target == "SCGF-alpha", PhenotypeCompName := "SCGFa"]
424 |   soma_info[Target == "Coagulation Factor Xa", PhenotypeCompName := "F10a"]
425 |   soma_info[Target == "FN1.3", PhenotypeCompName := "FN1.3"]
426 |   soma_info[Target == "Haptoglobin, Mixed Type", PhenotypeCompName := "HPm"]
427 |   soma_info[PhenotypeCompName == "LRP1", PhenotypeCompName := Target]
428 |   soma_info[PhenotypeCompName == "LYN", PhenotypeCompName := Target]
429 |   soma_info[Target == "PILRA isoform FDF03-M14", PhenotypeCompName := "PILRA.M14"]
430 |   soma_info[Target == "PILRA isoform FDF03-deltaTM", PhenotypeCompName := "PILRA.dTM"]
431 |   soma_info[Target == "Ubiquitin+1", PhenotypeCompName := "RPS27Aplus1"]
432 |   soma_info[Target == "alpha-1-antichymotrypsin complex", PhenotypeCompName := "SERPINA3cmplx"]
433 |   soma_info[Target == "14-3-3 protein beta/alpha", PhenotypeCompName := "14.3.3.pba"]
434 |   soma_info[Target == "14-3-3", PhenotypeCompName := "14.3.3"]
435 |   soma_info[PhenotypeCompName == "C5", PhenotypeCompName := Target]
436 |   soma_info[PhenotypeCompName == "F2", PhenotypeCompName := Target]
437 |   soma_info[PhenotypeCompName == "EGFR", PhenotypeCompName := Target]
438 |   soma_info[PhenotypeCompName == "FN1", PhenotypeCompName := Target]
439 |   soma_info[PhenotypeCompName == "NRXN1", PhenotypeCompName := Target]
440 |   soma_info[PhenotypeCompName == "ADCYAP1", PhenotypeCompName := gsub("-", ".", Target)]
441 |   soma_info[PhenotypeCompName == "CKB", PhenotypeCompName := gsub("-", ".", Target)]
442 |   soma_info[PhenotypeCompName == "EGFR", PhenotypeCompName := gsub("-", ".", Target)]
443 |   soma_info[PhenotypeCompName == "FGA", PhenotypeCompName := gsub("-", ".", Target)]
444 |   soma_info[PhenotypeCompName == "FGF8", PhenotypeCompName := gsub("-", ".", Target)]
445 |   soma_info[PhenotypeCompName == "PPBP", PhenotypeCompName := gsub("-", ".", Target)]
446 |   soma_info[Target ==
"CLF-1/CLC Complex", PhenotypeCompName := "CLF1.CLC.complex"] 447 | soma_info[Target == "CK2-A1:B", PhenotypeCompName := "CK2.A1.B"] 448 | soma_info[Target == "Coagulation Factor IX", PhenotypeCompName := "CF.IX"] 449 | soma_info[Target == "Coagulation Factor IXab", PhenotypeCompName := "CF.IXab"] 450 | soma_info[Target == "GDF-11/8", PhenotypeCompName := "GDF11.8"] 451 | soma_info[Target == "IgG2, Kappa", PhenotypeCompName := "IgG2"] 452 | soma_info[Target == "IgG4, Kappa", PhenotypeCompName := "IgG4"] 453 | soma_info[Target == "N-terminal pro-BNP", PhenotypeCompName := "NPPB.Nt"] 454 | soma_info[Target == "Activated Protein C", PhenotypeCompName := "PROC.activated"] 455 | soma_info[Target == "TLR4:MD-2 complex", PhenotypeCompName := "TLR4.MD2.complex"] 456 | soma_info[Target == "Activin A", PhenotypeCompName := "INHBA.A"] 457 | soma_info[Target == "Activin AB", PhenotypeCompName := "INHBA.AB"] 458 | soma_info[Target == "Lymphotoxin a1/b2", PhenotypeCompName := "LTA.A1.B2"] 459 | soma_info[Target == "Lymphotoxin a2/b1", PhenotypeCompName := "LTA.A2.B1"] 460 | soma_info[PhenotypeCompName == "POMC", PhenotypeCompName := gsub("-", ".", TargetFullName)] 461 | soma_info[Target == "SEM6C", Target := "SEMA6C"] 462 | soma_info[PhenotypeCompName == "14.3.3", UniProt := "P61981|Q04917"] 463 | 464 | # Actually we want to train models for each SeqId apparently... 465 | setnames(soma_info, "PhenotypeCompName", "UniqueShortName") 466 | soma_info[, PhenotypeCompName := paste0("SeqId_", gsub("-", "_", SeqId))] 467 | 468 | # Now load summary stats for all SNPs passing P < trans_pthresh genome wide and 469 | # all cis-snps for each protein. Note this code handles averaging across multiple 470 | # aptamers if we want to return to predicting protein levels. 471 | parallelise_foreach() 472 | has_qtls = foreach(phen_id = unique(soma_info$PhenotypeCompName), .combine=c) %dopar% { 473 | if (file.exists(sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))) { 474 | cat("Somalogic protein", phen_id, "already filtered at P <", trans_pthresh, "(trans),", cis_pthresh, "(cis) by previous run, skipping.\n") 475 | return(phen_id) 476 | } 477 | gc() 478 | # Load all summary stats for all aptamers targetting this protein, filtering to SNPs 479 | # in the LD-thinned variant set. 
480 |     phen_ss = foreach(var_id = soma_info[PhenotypeCompName == phen_id, unique(SOMAMER_ID)], .combine=rbind) %do% {
481 |       this_ss = foreach(chr_id = 1:22, .combine=rbind) %do% {
482 |         chr_ss = fread(sprintf("%s/%s/%s_chrom_%s_meta_1.tbl.gz", soma_GWAS, var_id, var_id, chr_id), tmpdir=tmpdir)
483 |         chr_ss = chr_ss[, .(PhenotypeCompName=phen_id, SOMAMER_ID=var_id, chr=chromosome, pos=position,
484 |                             effect_allele=toupper(Allele1), other_allele=toupper(Allele2), effect=Effect,
485 |                             pval=10^`log(P)`)]
486 |         chr_ss = chr_ss[pval < pmax(trans_pthresh, cis_pthresh)]
487 |         if (nrow(chr_ss) > 0) {
488 |           chr_ss = rbind(
489 |             chr_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=alt, other_allele=ref), nomatch=0],
490 |             chr_ss[varset[, .(chr, pos, ref, alt)], on = .(chr, pos, effect_allele=ref, other_allele=alt), nomatch=0]
491 |           )
492 |         }
493 |         return(chr_ss)
494 |       }
495 |       this_ss[!is.na(effect)]
496 |     }
497 | 
498 |     # Remove variants that did not pass the p-value threshold for all measurements per protein
499 |     napt = soma_info[PhenotypeCompName == phen_id, length(unique(SOMAMER_ID))]
500 |     varn = phen_ss[,.N, by=.(chr, pos, effect_allele,other_allele)]
501 |     pass = varn[N == napt]
502 |     pass[,N := NULL]
503 |     phen_ss = phen_ss[pass, on = .(chr, pos, effect_allele,other_allele)]
504 | 
505 |     # average across measurements
506 |     phen_ss = phen_ss[, .(effect=mean(effect), pval=mean(pval)), by = .(chr, pos, effect_allele, other_allele)]
507 | 
508 |     # R, do you even garbage collect? WTF.
509 |     if(exists("this_ss")) { rm(this_ss) }
510 |     if(exists("chr_ss")) { rm(chr_ss) }
511 |     gc()
512 | 
513 |     # Identify and extract all SNPs with P < trans_pthresh for all aptamers
514 |     gw_snps = unique(phen_ss[pval < trans_pthresh, .(chr, pos)])
515 |     gw = phen_ss[gw_snps, on = .(chr, pos)]
516 | 
517 |     # Identify and extract all SNPs in cis with any gene with P < cis_pthresh
518 |     chrs = strsplit(soma_info[PhenotypeCompName == phen_id, unique(chr)], "\\|")[[1]]
519 |     starts = strsplit(soma_info[PhenotypeCompName == phen_id, unique(TSS)], "\\|")[[1]]
520 |     cis = foreach(idx = seq_along(chrs), .combine=rbind) %do% {
521 |       window = data.table(chr=as.integer(chrs[idx]), TSS=as.integer(starts[idx]))
522 |       window[, start := pmax(0, TSS - 1e6)]
523 |       window[, end := TSS + 1e6]
524 |       window[complex_ld, on = .(chr=region_chr, start<=region_end, start>=region_start), start := region_start] # pull the window start down to cover an overlapping high-complexity region
525 |       window[complex_ld, on = .(chr=region_chr, end<=region_end, end>=region_start), end := region_end] # likewise push the window end up to the region end
526 |       chr_ss = phen_ss[window, on = .(chr, pos>=start, pos<=end), .(chr, pos=x.pos, effect_allele, other_allele, effect, pval)]
527 |       chr_ss[pval < cis_pthresh]
528 |     }
529 |     rm(phen_ss)
530 |     gc()
531 | 
532 |     phen_ss = unique(rbind(gw, cis))
533 | 
534 |     if(nrow(phen_ss) > 0) {
535 |       phen_ss = phen_ss[, .SD[which.min(pval)], by=.(chr, pos)] # some duplicate results (duplicate SNPs with different INFO). Take best estimate.
    phen_ss[varset, on = .(chr, pos), rsid := id]  # add rsid
    phen_ss = phen_ss[, .(rsid, chr, pos, effect_allele, other_allele, effect, pval)][order(pos)][order(chr)]

    # write out
    fwrite(phen_ss, sep="\t", quote=FALSE, file=sprintf("%s/%s_variant_effects.txt", out_dir, phen_id))

    # Free up some memory
    rm(phen_ss)
    gc()

    return(phen_id)
  }
}
parallelise_fread()

# Filter phenotype data
soma_pheno = soma_pheno[variable %chin% unique(soma_info$variable)]
soma_pheno = soma_pheno[!is.na(value)]

# Average phenotype data across different platform measures:
soma_pheno[soma_info, on = .(variable), PhenotypeCompName := i.PhenotypeCompName]
soma_pheno = soma_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]

# Filter info sheet and phenotype data to measurements
# with at least 1 variant passing the P-value threshold
soma_info = soma_info[PhenotypeCompName %chin% has_qtls]
soma_pheno = soma_pheno[PhenotypeCompName %chin% has_qtls]

if (nrow(soma_info) > 0) {
  soma_info = soma_info[, .(SeqId=paste(SeqId, collapse=","), SOMAMER_ID=paste(SOMAMER_ID, collapse=",")),
                        by = .(PhenotypeCompName, UniqueShortName, Target, TargetFullName, UniProt, Gene, chr, TSS)]
  fwrite(soma_info, sep="\t", quote=FALSE, file=sprintf("%s/Somalogic_phenotype_info.txt", out_dir))
}

# Adjust phenotype levels for measurement batch:
if (nrow(soma_pheno) > 0) {
  batch = fread(sprintf("%s/somalogic_proteins/covariates.tsv", trait_dir))
  soma_pheno = soma_pheno[batch, on = .(IID), nomatch=0]
  soma_pheno[, value := lm(value ~ factor(batch))$residuals, by = .(PhenotypeCompName)]
  soma_pheno[, batch := NULL]
}
} else {
  cat("Somalogic protein GWAS summary statistics filtered by a previous run.\nLoading and filtering phenotype data...\n")
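  # Restart path: the per-protein variant-effect files were written by an earlier run,
  # so rebuild the phenotype table from the saved info sheet rather than re-filtering
  # the GWAS summary statistics.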
  soma_info = fread(sprintf("%s/Somalogic_phenotype_info.txt", out_dir))
  soma_pheno = fread(sprintf("%s/somalogic_proteins/traits.tsv", trait_dir))
  batch = fread(sprintf("%s/somalogic_proteins/covariates.tsv", trait_dir))
  soma_pheno = soma_pheno[variable %chin% unique(soma_info$variable)]
  soma_pheno = soma_pheno[!is.na(value)]
  soma_pheno = soma_pheno[soma_info[, .(variable, PhenotypeCompName)], on = .(variable), nomatch=0]
  soma_pheno = soma_pheno[, .(value=mean(value)), by = .(PhenotypeCompName, IID)]
  gc()  # keep soma_info: it is needed below to map aptamers to proteins
  soma_pheno = soma_pheno[batch, on = .(IID), nomatch=0]
  soma_pheno[, value := lm(value ~ factor(batch))$residuals, by = .(PhenotypeCompName)]
  soma_pheno[, batch := NULL]
}

# --------------------------------------------------------------------------------------
# Combine phenotype data into one table and write out
# --------------------------------------------------------------------------------------

pcs = fread("/rds/project/jmmh2/rds-jmmh2-post_qc_data/interval/reference_files/genetic/reference_files/annot_INT_50PCs_pcs.txt")

pheno = rbind(
  Nightingale = nmr_pheno,
  Metabolon = metabo_pheno,
  Olink = olink_pheno,
  Somalogic = soma_pheno,
  idcol = "platform")

# Adjust for the first 10 genotype PCs
pheno = pheno[pcs, on = .(IID=ID), nomatch=0]
pheno[, value := lm(value ~ PC_1 + PC_2 + PC_3 + PC_4 + PC_5 +
                            PC_6 + PC_7 + PC_8 + PC_9 + PC_10)$residuals,
      by = .(PhenotypeCompName)]

if (nrow(pheno) > 0) {
  fwrite(pheno[, .(PhenotypeCompName, platform, IID, value)],
         sep="\t", quote=FALSE, file=sprintf("%s/phenotypes.txt", out_dir))
}

# Phenotype data adjusted for technical covariates only.
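# This builds a parallel output ('phenotypes_no_agesex.txt') from SomaLogic levels that
# were pre-adjusted for technical covariates but not age or sex: aptamer-level values
# are mapped to proteins via the collapsed SeqId/SOMAMER_ID columns, aliquot IDs are
# mapped to genotyping IDs, and values are residualised on batch and the first 10
# genotype PCs as above.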
soma_pheno_tech = fread(sprintf("%s/soma4000_gwasQC_adj_technical.txt", trait_dir2))
soma_pheno_tech = melt(soma_pheno_tech, id.vars="aliquot_id", variable.name="SOMAMER_ID")
apt2prot = soma_info[, .(SeqId=strsplit(SeqId, ",")[[1]], SOMAMER_ID=strsplit(SOMAMER_ID, ",")[[1]]), by=PhenotypeCompName]
soma_pheno_tech = soma_pheno_tech[apt2prot, on = .(SOMAMER_ID)]
idmap = fread("/rds/project/jmmh2/rds-jmmh2-projects/polygenic/general/INTERVAL_data/1074/omicsMap.csv")
soma_pheno_tech[idmap, on = .(aliquot_id=soma4000_gwasQC_bl), IID := i.Affymetrix_gwasQC_bl]
soma_pheno_tech[, platform := "Somalogic"]
soma_pheno_tech = soma_pheno_tech[, .(value=mean(value)), by = .(PhenotypeCompName, platform, IID)]
soma_pheno_tech = soma_pheno_tech[batch, on = .(IID), nomatch=0]
soma_pheno_tech[, value := lm(value ~ factor(batch))$residuals, by = .(PhenotypeCompName)]
soma_pheno_tech = soma_pheno_tech[pcs, on = .(IID=ID), nomatch=0]
soma_pheno_tech[, value := lm(value ~ PC_1 + PC_2 + PC_3 + PC_4 + PC_5 + PC_6 + PC_7 + PC_8 + PC_9 + PC_10)$residuals, by = .(PhenotypeCompName)]
soma_pheno_tech = soma_pheno_tech[PhenotypeCompName %chin% pheno[platform == "Somalogic", PhenotypeCompName]]

if (nrow(soma_pheno_tech) > 0) {
  fwrite(soma_pheno_tech[, .(PhenotypeCompName, platform, IID, value)], sep="\t", quote=FALSE, file=sprintf("%s/phenotypes_no_agesex.txt", out_dir))

  # write out age and sex information
  agesex = fread(sprintf("%s/phenotypes.tsv", trait_dir))
  agesex = agesex[, .(IID, sex=sexPulse, age=agePulse)]
  agesex = agesex[IID %in% unique(soma_pheno_tech$IID)]
  fwrite(agesex, sep="\t", quote=FALSE, file=sprintf("%s/agesex.txt", out_dir))
}


# remove leftover temporary directory
system(sprintf("rm -rf %s", tmpdir), wait=TRUE)

--------------------------------------------------------------------------------