├── .gitattributes
├── figures
    └── 4_figures
    │   ├── parts
    │       └── arrow.png
    │   ├── proteins
    │       ├── amie.png
    │       ├── cbs.png
    │       ├── ccr5.png
    │       ├── cp.png
    │       ├── dbr1.png
    │       ├── gal4.png
    │       ├── ha.png
    │       ├── infa.png
    │       ├── np.png
    │       ├── pab1.png
    │       ├── pten.png
    │       ├── ras.png
    │       ├── src.png
    │       ├── tem1.png
    │       ├── tp53.png
    │       ├── tpk1.png
    │       ├── tpmt.png
    │       ├── ubi.png
    │       ├── yap1.png
    │       ├── adrb2.png
    │       ├── aph3ii.png
    │       ├── brca1.png
    │       ├── calm1.png
    │       ├── cxcr4.png
    │       ├── hsp90.png
    │       ├── ste12.png
    │       ├── sumo1.png
    │       ├── ube2i.png
    │       ├── ube4b.png
    │       └── proteing.png
    │   └── position_examples
    │       ├── cbs_phe_pi.png
    │       ├── ccr5_domains.png
    │       ├── tem1_asp_sa.png
    │       ├── adrb2_domains.png
    │       ├── cbs_asp_ionic.png
    │       ├── cxcr4_domains.png
    │       ├── gal4_cys_zinc.png
    │       ├── np_cys_aromatic.png
    │       ├── ras_met_buried.png
    │       ├── tem1_asp_ligand.png
    │       ├── ccr5_cys_aromatic.png
    │       ├── pab1_arg_not_neg.png
    │       ├── ccr5_cys_disulphide.png
    │       ├── adrb2_ala_small_hydro.png
    │       ├── aph3ii_arg_not_proline.png
    │       └── ras_aliphatic_entropy.png
├── meta
    ├── fasta
    │   ├── streptococcus_proteing.fa
    │   ├── s_cerevisiae_ubi.fa
    │   ├── e_coli_infa.fa
    │   ├── e_coli_ccdb.fa
    │   ├── h_sapiens_sumo1.fa
    │   ├── bacteriophage_ms2_cp.fa
    │   ├── h_sapiens_calm1.fa
    │   ├── h_sapiens_ube2i.fa
    │   ├── h_sapiens_ras.fa
    │   ├── a_victoria_gfp.fa
    │   ├── h_sapiens_tpk1.fa
    │   ├── h_sapiens_tpmt.fa
    │   ├── b_subtilis_gdh.fa
    │   ├── e_coli_tem1.fa
    │   ├── e_coli_aph3ii.fa
    │   ├── s_cerevisiae_gcn4.fa
    │   ├── h_sapiens_ccr5.fa
    │   ├── h_sapiens_cxcr4.fa
    │   ├── h_sapiens_mapk1.fa
    │   ├── h_sapiens_adrb2.fa
    │   ├── h_sapiens_tdp43.fa
    │   ├── p_aeruginosa_amie.fa
    │   ├── h_sapiens_tp53.fa
    │   ├── h_sapiens_pten.fa
    │   ├── h_sapiens_yap1.fa
    │   ├── h_sapiens_dbr1.fa
    │   ├── h_sapiens_src.fa
    │   ├── h_sapiens_cbs.fa
    │   ├── H3N2_A_Aichi_2_1968_np.fa
    │   ├── strep_protein_g_precursor.fa
    │   ├── s_cerevisiae_pab1.fa
    │   ├── H3N2_A_Perth_16_2009_ha.fa
    │   ├── s_cerevisiae_hsc82.fa
    │   ├── s_cerevisiae_ste12.fa
    │   ├── s_cerevisiae_hsp90.fa
    │   ├── h_sapiens_braf.fa
    │   ├── s_cerevisiae_gal4.fa
    │   ├── m_musculus_ube4b.fa
    │   ├── s_pyrogenes_cas9.fa
    │   └── h_sapiens_brca1.fa
    ├── subtypes
    │   ├── kmeans_profile.yaml
    │   ├── hdbscan_pca.yaml
    │   ├── kmeans_pca.yaml
    │   ├── pam_profile_cos.yaml
    │   ├── pam_profile_man.yaml
    │   ├── hdbscan_profile.yaml
    │   ├── pam_pca_no_sig_cos.yaml
    │   ├── pam_pca_no_sig_man.yaml
    │   ├── hclust_pca_dynamic.yaml
    │   ├── hdbscan_pca_no_sig.yaml
    │   ├── kmeans_pca_no_sig.yaml
    │   ├── gmm_pca.yaml
    │   ├── gmm_profile.yaml
    │   ├── gmm_pca_no_sig.yaml
    │   ├── hclust_profile_dynamic.yaml
    │   ├── hclust_profile_dynamic_cos.yaml
    │   ├── hclust_pca_no_sig_dynamic_cos_deep_0.yaml
    │   ├── hclust_pca_no_sig_dynamic_cos_deep_1.yaml
    │   └── hclust_pca_no_sig_dynamic.yaml
    ├── study_template.yaml
    ├── final_subtypes.yaml
    ├── residue_hydrophobicity.tsv
    └── README.md
├── data
    ├── long_combined_mutational_scans.tsv
    ├── studies
    │   ├── weile_2017_tpk1
    │   │   ├── standardise_weile_2017_tpk1.R
    │   │   └── weile_2017_tpk1.yaml
    │   ├── sun_2018_cbs
    │   │   ├── standardise_sun_2018_cbs.R
    │   │   └── sun_2018_cbs.yaml
    │   ├── weile_2017_calm1
    │   │   ├── standardise_weile_2017_calm1.R
    │   │   └── weile_2017_calm1.yaml
    │   ├── weile_2017_sumo1
    │   │   ├── standardise_weile_2017_sumo1.R
    │   │   └── weile_2017_sumo1.yaml
    │   ├── ahler_2019_src
    │   │   ├── standardise_ahler_2019_src.R
    │   │   └── ahler_2019_src.yaml
    │   ├── weile_2017_ube2i
    │   │   ├── standardise_weile_2017_ube2i.R
    │   │   └── weile_2017_ube2i.yaml
    │   ├── kelsic_2016_infa
    │   │   ├── kelsic_2016_infa.yaml
    │   │   └── standardise_kelsic_2016_infa.R
    │   ├── ashenberg_2017_np
    │   │   ├── standardise_ashenberg_2017_np.R
    │   │   └── ashenberg_2017_np.yaml
    │   ├── melamed_2013_pab1
    │   │   ├── standardise_melamed_2013_pab1.R
    │   │   └── melamed_2013_pab1.yaml
    │   ├── roscoe_2013_ubi
    │   │   ├── roscoe_2013_ubi.yaml
    │   │   └── standardise_roscoe_2013_ubi.R
    │   ├── bandaru_2017_ras
    │   │   ├── bandaru_2017_ras.yaml
    │   │   └── standardise_bandaru_2017_ras.R
    │   ├── bolognesi_2019_tdp43
    │   │   ├── standardise_bolognesi_2019_tdp43.R
    │   │   └── bolognesi_2019_tdp43.yaml
    │   ├── hietpas_2011_hsp90
    │   │   ├── standardise_hietpas_2011_hsp90.R
    │   │   └── hietpas_2011_hsp90.yaml
    │   ├── steinberg_2016_tem1
    │   │   ├── standardise_steinberg_2016_tem1.R
    │   │   └── steinberg_2016_tem1.yaml
    │   ├── matreyek_2018_pten
    │   │   ├── standardise_matreyek_2018_pten.R
    │   │   └── matreyek_2018_pten.yaml
    │   ├── matreyek_2018_tpmt
    │   │   ├── standardise_matreyek_2018_tpmt.R
    │   │   └── matreyek_2018_tpmt.yaml
    │   ├── jiang_2013_hsp90
    │   │   ├── standardise_jiang_2013_hsp90.R
    │   │   └── jiang_2013_hsp90.yaml
    │   ├── olson_2014_proteing
    │   │   ├── olson_2014_proteing.yaml
    │   │   └── standardise_olson_2014_proteing.R
    │   ├── roscoe_2014_ubi
    │   │   ├── roscoe_2014_ubi.yaml
    │   │   └── standardise_roscoe_2014_ubi.R
    │   ├── firnberg_2014_tem1
    │   │   ├── firnberg_2014_tem1.yaml
    │   │   └── standardise_firnberg_2014_tem1.R
    │   ├── mishra_2016_hsp90
    │   │   ├── standardise_mishra_2016_hsp90.R
    │   │   └── mishra_2016_hsp90.yaml
    │   ├── kitzman_2015_gal4
    │   │   ├── standardise_kitzman_2015_gal4.R
    │   │   └── kitzman_2015_gal4.yaml
    │   ├── sarkisyan_2016_gfp
    │   │   ├── sarkisyan_2016_gfp.yaml
    │   │   └── standardise_sarkisyan_2016_gfp.R
    │   ├── findlay_2018_brca1
    │   │   ├── standardise_findlay_2018_brca1.R
    │   │   └── findlay_2018_brca1.yaml
    │   ├── hartman_2018_cp
    │   │   ├── hartman_2018_cp.yaml
    │   │   └── standardise_hartman_2018_cp.R
    │   ├── brenan_2016_mapk1
    │   │   ├── standardise_brenan_2016_mapk1.R
    │   │   └── brenan_2016_mapk1.yaml
    │   ├── wrenbeck_2017_amie
    │   │   ├── wrenbeck_2017_amie.yaml
    │   │   └── standardise_wrenbeck_2017_amie.R
    │   ├── jones_2019_adrb2
    │   │   ├── jones_2019_adrb2.yaml
    │   │   └── standardise_jones_2019_adrb2.R
    │   ├── starita_2015_brca1
    │   │   └── standardise_starita_2015_brca1.R
    │   ├── heredia_2018_cxcr4
    │   │   ├── heredia_2018_cxcr4.yaml
    │   │   └── standardise_heredia_2018_cxcr4.R
    │   ├── doud_2015_np
    │   │   ├── doud_2015_np.yaml
    │   │   └── standardise_doud_2015_np.R
    │   ├── findlay_2014_dbr1
    │   │   ├── findlay_2014_dbr1.yaml
    │   │   └── standardise_findlay_2014_dbr1.R
    │   ├── lee_2018_ha
    │   │   ├── standardise_lee_2018_ha.R
    │   │   └── lee_2018_ha.yaml
    │   ├── araya_2012_yap1
    │   │   ├── araya_2012_yap1.yaml
    │   │   └── standardise_araya_2012_yap1.R
    │   ├── heredia_2018_ccr5
    │   │   ├── heredia_2018_ccr5.yaml
    │   │   └── standardise_heredia_2018_ccr5.R
    │   ├── giacomelli_2018_tp53
    │   │   ├── giacomelli_2018_tp53.yaml
    │   │   └── standardise_giacomelli_2018_tp53.R
    │   ├── wagenaar_2014_braf
    │   │   ├── standardise_wagenaar_2014_braf.R
    │   │   └── wagenaar_2014_braf.yaml
    │   ├── dorrity_2018_ste12
    │   │   ├── dorrity_2018_ste12.yaml
    │   │   └── standardise_dorrity_2018_ste12.R
    │   ├── starita_2013_ube4b
    │   │   ├── standardise_starita_2013_ube4b.R
    │   │   └── starita_2013_ube4b.yaml
    │   └── melnikov_2014_aph3ii
    │   │   └── standardise_melnikov_2014_aph3ii.R
    └── pdb
    │   └── README.md
├── cluster.yaml
├── snakemake.yaml
├── .rsync_exclude
├── docs
    ├── subtypes_readme.txt
    ├── combined_mutational_scans_readme.txt
    └── foldx_eqn.md
├── .gitignore
├── bin
    ├── utils
    │   ├── setup_study_folder.sh
    │   └── farm_sync.sh
    ├── data_processing
    │   ├── standardise_study_template.R
    │   ├── make_gene_fasta.py
    │   ├── filter_pdb.py
    │   └── foldx_variants.py
    ├── analysis
    │   ├── 0_data
    │   │   ├── validate_kitzman_2015_gal4.R
    │   │   ├── summarise_standardised_data.R
    │   │   ├── sift_correlation.R
    │   │   ├── validate_araya_2012_yap1.R
    │   │   ├── validate_sarkisyan_2016_gfp.R
    │   │   ├── validate_starita_2013_ube4b.R
    │   │   └── check_normalisation.R
    │   └── 2_subtypes
    │   │   ├── characterise_subtypes.R
    │   │   ├── compare_hclust_dynamic_deep_split.R
    │   │   ├── sequence_context.R
    │   │   └── all_positions.R
    ├── figures
    │   ├── figureS5.R
    │   ├── figureS3.R
    │   ├── figureS8_27.R
    │   ├── figureS29.R
    │   └── figureS4.R
    └── pipeline
    │   └── sequence_statistics.smk
├── LICENSE
└── src
    ├── continuous.R
    ├── pymol_utils.py
    └── dimensionality_reduction.R


/.gitattributes:
--------------------------------------------------------------------------------
1 | data/long_combined_mutational_scans.tsv filter=lfs diff=lfs merge=lfs -text
2 | 


--------------------------------------------------------------------------------
/figures/4_figures/parts/arrow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/parts/arrow.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/amie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/amie.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/cbs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/cbs.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ccr5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ccr5.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/cp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/cp.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/dbr1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/dbr1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/gal4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/gal4.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ha.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/infa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/infa.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/np.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/np.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/pab1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/pab1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/pten.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/pten.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ras.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ras.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/src.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/src.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/tem1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tem1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/tp53.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tp53.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/tpk1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tpk1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/tpmt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tpmt.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ubi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ubi.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/yap1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/yap1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/adrb2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/adrb2.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/aph3ii.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/aph3ii.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/brca1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/brca1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/calm1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/calm1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/cxcr4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/cxcr4.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/hsp90.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/hsp90.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ste12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ste12.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/sumo1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/sumo1.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ube2i.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ube2i.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/ube4b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ube4b.png


--------------------------------------------------------------------------------
/figures/4_figures/proteins/proteing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/proteing.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/cbs_phe_pi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/cbs_phe_pi.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/ccr5_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ccr5_domains.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/tem1_asp_sa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/tem1_asp_sa.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/adrb2_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/adrb2_domains.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/cbs_asp_ionic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/cbs_asp_ionic.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/cxcr4_domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/cxcr4_domains.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/gal4_cys_zinc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/gal4_cys_zinc.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/np_cys_aromatic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/np_cys_aromatic.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/ras_met_buried.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ras_met_buried.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/tem1_asp_ligand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/tem1_asp_ligand.png


--------------------------------------------------------------------------------
/meta/fasta/streptococcus_proteing.fa:
--------------------------------------------------------------------------------
1 | >streptococcal protein G; from wt seq given in Olson et al. 2014
2 | MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE
3 | 


--------------------------------------------------------------------------------
/meta/subtypes/kmeans_profile.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic K-means clustering"
2 | method: 'kmeans'
3 | columns: 'A:Y'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   nstart: 10
8 | 


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/ccr5_cys_aromatic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ccr5_cys_aromatic.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/pab1_arg_not_neg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/pab1_arg_not_neg.png


--------------------------------------------------------------------------------
/meta/subtypes/hdbscan_pca.yaml:
--------------------------------------------------------------------------------
1 | desc: "HDBSCAN clustering on PCs"
2 | method: 'hdbscan'
3 | columns: 'PC1:PC20'
4 | args:
5 |   dist_method: 'manhattan'
6 |   minPts: 4 
7 | 


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/ccr5_cys_disulphide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ccr5_cys_disulphide.png


--------------------------------------------------------------------------------
/meta/subtypes/kmeans_pca.yaml:
--------------------------------------------------------------------------------
1 | desc: "K-means clustering using all PCs"
2 | method: 'kmeans'
3 | columns: 'PC1:PC20'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   nstart: 10
8 | 


--------------------------------------------------------------------------------
/meta/subtypes/pam_profile_cos.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic PAM clustering"
2 | method: 'pam'
3 | columns: 'A:Y'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   distance_method: 'cosine'
8 | 


--------------------------------------------------------------------------------
/meta/subtypes/pam_profile_man.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic PAM clustering"
2 | method: 'pam'
3 | columns: 'A:Y'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   distance_method: 'manhattan'
8 | 


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/adrb2_ala_small_hydro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/adrb2_ala_small_hydro.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/aph3ii_arg_not_proline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/aph3ii_arg_not_proline.png


--------------------------------------------------------------------------------
/figures/4_figures/position_examples/ras_aliphatic_entropy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ras_aliphatic_entropy.png


--------------------------------------------------------------------------------
/meta/subtypes/hdbscan_profile.yaml:
--------------------------------------------------------------------------------
1 | desc: "HDBSCAN clustering on ER profiles"
2 | method: 'hdbscan'
3 | columns: 'A:Y'
4 | args:
5 |   dist_method: 'manhattan'
6 |   minPts: 3 
7 | 


--------------------------------------------------------------------------------
/meta/subtypes/pam_pca_no_sig_cos.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic PAM clustering"
2 | method: 'pam'
3 | columns: 'PC2:PC20'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   distance_method: 'cosine'
8 | 


--------------------------------------------------------------------------------
/data/long_combined_mutational_scans.tsv:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8b015d600da366adea1437a0c13aab834ddabee5a6368c5d2fd12e5157d5c14c
3 | size 103160026
4 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_ubi.fa:
--------------------------------------------------------------------------------
1 | >ubi modified from first repeat of UBI4|P0CG63 from Uniprot
2 | MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN
3 | IQKESTLHLVLRLRGG
4 | 
5 | 


--------------------------------------------------------------------------------
/meta/subtypes/pam_pca_no_sig_man.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic PAM clustering"
2 | method: 'pam'
3 | columns: 'PC2:PC20'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   distance_method: 'manhattan'
8 | 


--------------------------------------------------------------------------------
/meta/subtypes/hclust_pca_dynamic.yaml:
--------------------------------------------------------------------------------
1 | desc: "Hierarchical clustering using PCs with dynamic cuts"
2 | method: 'hclust_dynamic'
3 | columns: 'PC1:PC20'
4 | args:
5 |   distance_method: 'manhattan'
6 | 
7 | 


--------------------------------------------------------------------------------
/cluster.yaml:
--------------------------------------------------------------------------------
1 | __default__:
2 |     output: '{rule}.{wildcards}.%J'
3 |     error: '{rule}.{wildcards}.%J'
4 |     queue: 'research-rh74'
5 |     name: 'Subtypes.{rule}.{wildcards}'
6 |     memory: 4096 
7 | 
8 | 


--------------------------------------------------------------------------------
/meta/subtypes/hdbscan_pca_no_sig.yaml:
--------------------------------------------------------------------------------
1 | desc: "HDBSCAN clustering on PCs excluding positional significance"
2 | method: 'hdbscan'
3 | columns: 'PC2:PC20'
4 | args:
5 |   dist_method: 'manhattan'
6 |   minPts: 4 
7 | 


--------------------------------------------------------------------------------
/meta/subtypes/kmeans_pca_no_sig.yaml:
--------------------------------------------------------------------------------
1 | desc: "K-means clustering using PCs excluding positional significance"
2 | method: 'kmeans'
3 | columns: 'PC2:PC20'
4 | args:
5 |   k: 4
6 |   min_size: 5
7 |   nstart: 10
8 | 


--------------------------------------------------------------------------------
/meta/fasta/e_coli_infa.fa:
--------------------------------------------------------------------------------
1 | >sp|P69222|IF1_ECOLI Translation initiation factor IF-1 OS=Escherichia coli (strain K12) OX=83333 GN=infA PE=1 SV=2
2 | MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPY
3 | DLSKGRIVFRSR
4 | 
5 | 


--------------------------------------------------------------------------------
/meta/fasta/e_coli_ccdb.fa:
--------------------------------------------------------------------------------
1 | >sp|P62554|CCDB_ECOLI Toxin CcdB OS=Escherichia coli (strain K12) OX=83333 GN=ccdB PE=1 SV=1
2 | MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDKVSRELYPVVHIGDES
3 | WRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI
4 | 
5 | 


--------------------------------------------------------------------------------
/meta/subtypes/gmm_pca.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic Gaussian Mixture Model, using BIC to select cluster number"
2 | method: 'gmm'
3 | columns: 'PC1:PC20'
4 | args:
5 |   G: [ 2, 3, 4, 5 ]
6 |   modelNames: [ 'VII', 'EVI', 'VEV', 'EVV', 'VVV' ] 
7 | 


--------------------------------------------------------------------------------
/meta/subtypes/gmm_profile.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic Gaussian Mixture Model, using BIC to select cluster number"
2 | method: 'gmm'
3 | columns: 'A:Y'
4 | args:
5 |   G: [ 2, 3, 4, 5 ]
6 |   modelNames: [ 'VII', 'EVI', 'VEV', 'EVV', 'VVV' ] 
7 | 


--------------------------------------------------------------------------------
/meta/subtypes/gmm_pca_no_sig.yaml:
--------------------------------------------------------------------------------
1 | desc: "Basic Gaussian Mixture Model, using BIC to select cluster number"
2 | method: 'gmm'
3 | columns: 'PC2:PC20'
4 | args:
5 |   G: [ 2, 3, 4, 5 ]
6 |   modelNames: [ 'VII', 'EVI', 'VEV', 'EVV', 'VVV' ] 
7 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_sumo1.fa:
--------------------------------------------------------------------------------
1 | >sp|P63165|SUMO1_HUMAN Small ubiquitin-related modifier 1 OS=Homo sapiens OX=9606 GN=SUMO1 PE=1 SV=1
2 | MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMN
3 | SLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV
4 | 
5 | 


--------------------------------------------------------------------------------
/meta/subtypes/hclust_profile_dynamic.yaml:
--------------------------------------------------------------------------------
1 | desc: "Hierarchical clustering using ER profiles with dynamic cuts"
2 | method: 'hclust_dynamic'
3 | columns: 'A:Y'
4 | args:
5 |   distance_method: 'manhattan'
6 |   treecut_args:
7 |     deepSplit: 2
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/subtypes/hclust_profile_dynamic_cos.yaml:
--------------------------------------------------------------------------------
1 | desc: "Hierarchical clustering using ER profiles with dynamic cuts"
2 | method: 'hclust_dynamic'
3 | columns: 'A:Y'
4 | args:
5 |   distance_method: 'cosine'
6 |   treecut_args:
7 |     deepSplit: 2
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/subtypes/hclust_pca_no_sig_dynamic_cos_deep_0.yaml:
--------------------------------------------------------------------------------
1 | desc: "Hierarchical clustering using PCs excluding positional significance, with dynamic cuts"
2 | method: 'hclust_dynamic'
3 | columns: 'PC2:PC20'
4 | args:
5 |   distance_method: 'cosine'
6 |   treecut_args:
7 |     deepSplit: 0
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/subtypes/hclust_pca_no_sig_dynamic_cos_deep_1.yaml:
--------------------------------------------------------------------------------
1 | desc: "Hierarchical clustering using PCs excluding positional significance, with dynamic cuts"
2 | method: 'hclust_dynamic'
3 | columns: 'PC2:PC20'
4 | args:
5 |   distance_method: 'cosine'
6 |   treecut_args:
7 |     deepSplit: 1
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/fasta/bacteriophage_ms2_cp.fa:
--------------------------------------------------------------------------------
1 | >sp|P03612|CAPSD_BPMS2 Capsid protein OS=Escherichia phage MS2 OX=329852 PE=1 SV=2
2 | MASNFTQFVLVDNGGTGDVTVAPSNFANGVAEWISSNSRSQAYKVTCSVRQSSAQNRKYT
3 | IKVEVPKVATQTVGGVELPVAAWRSYLNMELTIPIFATNSDCELIVKAMQGLLKDGNPIP
4 | SAIAANSGIY
5 | 
6 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_calm1.fa:
--------------------------------------------------------------------------------
1 | >sp|P0DP23|CALM1_HUMAN Calmodulin-1 OS=Homo sapiens OX=9606 GN=CALM1 PE=1 SV=1
2 | MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADG
3 | NGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDE
4 | EVDEMIREADIDGDGQVNYEEFVQMMTAK
5 | 
6 | 


--------------------------------------------------------------------------------
/meta/subtypes/hclust_pca_no_sig_dynamic.yaml:
--------------------------------------------------------------------------------
1 | desc: "Hierarchical clustering using PCs excluding positional significance, with dynamic cuts"
2 | method: 'hclust_dynamic'
3 | columns: 'PC2:PC20'
4 | args:
5 |   distance_method: 'manhattan'
6 |   treecut_args:
7 |     deepSplit: 3 
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/study_template.yaml:
--------------------------------------------------------------------------------
 1 | study: 
 2 | gene: 
 3 | domain: 
 4 | uniprot_id:
 5 | gene_type:
 6 | species:
 7 | strain:
 8 | seq: 
 9 | experiment:
10 | transform:
11 | authour:
12 | year:
13 | title:
14 | doi:
15 | pmid:
16 | url:
17 | input_files:
18 | qc:
19 |   filter: False
20 |   notes:
21 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_ube2i.fa:
--------------------------------------------------------------------------------
1 | >sp|P63279|UBC9_HUMAN SUMO-conjugating enzyme UBC9 OS=Homo sapiens OX=9606 GN=UBE2I PE=1 SV=1
2 | MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKL
3 | RMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELL
4 | NEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS
5 | 
6 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_ras.fa:
--------------------------------------------------------------------------------
1 | >sp|P01112|RASH_HUMAN GTPase HRas OS=Homo sapiens OX=9606 GN=HRAS PE=1 SV=1
2 | MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG
3 | QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDL
4 | AARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPG
5 | CMSCKCVLS
6 | 
7 | 


--------------------------------------------------------------------------------
/snakemake.yaml:
--------------------------------------------------------------------------------
 1 | foldx:
 2 |   variants_per_run: 300
 3 | 
 4 | sift:
 5 |   uniref90_fa_path: '/hps/research1/beltrao/ally/databases/uniref90/uniref90_2019_1.fasta'
 6 | 
 7 | porter5:
 8 |   path: '/nfs/research1/beltrao/ally/software/packages/Porter5/Porter5.py'
 9 | 
10 | misc:
11 |   fasta_line_length: 80
12 | 
13 | 


--------------------------------------------------------------------------------
/.rsync_exclude:
--------------------------------------------------------------------------------
 1 | studies/*/*.yaml
 2 | studies/*/*.R
 3 | study_template.yaml
 4 | pdb/*
 5 | fasta/**
 6 | clustering/**
 7 | structures.yaml
 8 | final_subtypes.yaml
 9 | README.md
10 | subtypes/*.yaml
11 | residue_hydrophobicity.tsv
12 | graveyard/*/*.yaml
13 | graveyard/*/*.R
14 | foldx_eqn.md
15 | subtype_descriptions.md
16 | uniprot_*
17 | 
18 | 


--------------------------------------------------------------------------------
/meta/fasta/a_victoria_gfp.fa:
--------------------------------------------------------------------------------
1 | >sp|P42212|GFP_AEQVI Green fluorescent protein OS=Aequorea victoria OX=6100 GN=GFP PE=1 SV=1
2 | MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL
3 | VTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV
4 | NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD
5 | HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK
6 | 
7 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_tpk1.fa:
--------------------------------------------------------------------------------
1 | >sp|Q9H3S4|TPK1_HUMAN Thiamin pyrophosphokinase 1 OS=Homo sapiens OX=9606 GN=TPK1 PE=1 SV=1
2 | MEHAFTPLEPLLSTGNLKYCLVILNQPLDNYFRHLWNKALLRACADGGANRLYDITEGER
3 | ESFLPEFINGDFDSIRPEVREYYATKGCELISTPDQDHTDFTKCLKMLQKKIEEKDLKVD
4 | VIVTLGGLAGRFDQIMASVNTLFQATHITPFPIIIIQEESLIYLLQPGKHRLHVDTGMEG
5 | DWCGLIPVGQPCMQVTTTGLKWNLTNDVLAFGTLVSTSNTYDGSGVVTVETDHPLLWTMA
6 | IKS
7 | 
8 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_tpmt.fa:
--------------------------------------------------------------------------------
1 | >sp|P51580|TPMT_HUMAN Thiopurine S-methyltransferase OS=Homo sapiens OX=9606 GN=TPMT PE=1 SV=1
2 | MDGTRTSLDIEEYSDTEVQKNQVLTLEEWQDKWVNGKTAFHQEQGHQLLKKHLDTFLKGK
3 | SGLRVFFPLCGKAVEMKWFADRGHSVVGVEISELGIQEFFTEQNLSYSEEPITEIPGTKV
4 | FKSSSGNISLYCCSIFDLPRTNIGKFDMIWDRGALVAINPGDRKCYADTMFSLLGKKFQY
5 | LLCVLSYDPTKHPGPPFYVPHAEIERLFGKICNIRCLEKVDAFEERHKSWGIDCLFEKLY
6 | LLTEK
7 | 
8 | 


--------------------------------------------------------------------------------
/meta/fasta/b_subtilis_gdh.fa:
--------------------------------------------------------------------------------
1 | >sp|P12310|DHG_BACSU Glucose 1-dehydrogenase OS=Bacillus subtilis (strain 168) OX=224308 GN=gdh PE=2 SV=2
2 | MYPDLKGKVVAITGAASGLGKAMAIRFGKEQAKVVINYYSNKQDPNEVKEEVIKAGGEAV
3 | VVQGDVTKEEDVKNIVQTAIKEFGTLDIMINNAGLENPVPSHEMPLKDWDKVIGTNLTGA
4 | FLGSREAIKYFVENDIKGNVINMSSVHEVIPWPLFVHYAASKGGIKLMTETLALEYAPKG
5 | IRVNNIGPGAINTPINAEKFADPKQKADVESMIPMGYIGEPEEIAAVAAWLASKEASYVT
6 | GITLFADGGMTQYPSFQAGRG
7 | 
8 | 


--------------------------------------------------------------------------------
/meta/fasta/e_coli_tem1.fa:
--------------------------------------------------------------------------------
1 | >tr|Q6SJ61|Q6SJ61_ECOLX Beta-lactamase OS=Escherichia coli OX=562 GN=TEM-1 PE=3 SV=1
2 | MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRP
3 | EERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVREL
4 | CSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTM
5 | PAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS
6 | RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW
7 | 
8 | 


--------------------------------------------------------------------------------
/meta/fasta/e_coli_aph3ii.fa:
--------------------------------------------------------------------------------
1 | >tr|Q08JA8|Q08JA8_ECOLX APH(3') family aminoglycoside O-phosphotransferase OS=Escherichia coli OX=562 GN=aph PE=3 SV=1
2 | MIEQDGLHAGSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNEL
3 | QDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAM
4 | RRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPD
5 | GEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRF
6 | LVLYGIAAPDSQRIAFYRLLDEFF
7 | 
8 | 


--------------------------------------------------------------------------------
/docs/subtypes_readme.txt:
--------------------------------------------------------------------------------
 1 | # Subtype assignments EV3
 2 | 
 3 | Table of subtypes assigned to each position in the deep landscape dataset (EV2) by our algorithm
 4 | 
 5 | ## Columns
 6 | cluster: Assigned subtype, denoted Xi for subtype i of amino acid X, with i=O for outliers and i=P for permissive positions
 7 | study: Study the position came from
 8 | gene: Gene
 9 | position: Position in the gene (relative to canonical UniProt sequence)
10 | wt: Wild-type amino acid
11 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_gcn4.fa:
--------------------------------------------------------------------------------
1 | >sp|P03069|GCN4_YEAST General control protein GCN4 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=GCN4 PE=1 SV=1
2 | MSEYQPSLFALNPMGFSPLDGSKSTNENVSASTSTAKPMVGQLIFDKFIKTEEDPIIKQD
3 | TPSNLDFDFALPQTATAPDAKTVLPIPELDDAVVESFFSSSTDSTPMFEYENLEDNSKEW
4 | TSLFDNDIPVTTDDVSLADKAIESTEEVSLVPSNLEVSTTSFLPTPVLEDAKLTQTRKVK
5 | KPNSVVKKSHHVGKDDESRLDHLGVVAYNRKQRSIPLSPIVPESSDPAALKRARNTEAAR
6 | RSRARKLQRMKQLEDKVEELLSKNYHLENEVARLKKLVGER
7 | 
8 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_ccr5.fa:
--------------------------------------------------------------------------------
1 | >sp|P51681|CCR5_HUMAN C-C chemokine receptor type 5 OS=Homo sapiens OX=9606 GN=CCR5 PE=1 SV=1
2 | MDYQVSSPIYDINYYTSEPCQKINVKQIAARLLPPLYSLVFIFGFVGNMLVILILINCKR
3 | LKSMTDIYLLNLAISDLFFLLTVPFWAHYAAAQWDFGNTMCQLLTGLYFIGFFSGIFFII
4 | LLTIDRYLAVVHAVFALKARTVTFGVVTSVITWVVAVFASLPGIIFTRSQKEGLHYTCSS
5 | HFPYSQYQFWKNFQTLKIVILGLVLPLLVMVICYSGILKTLLRCRNEKKRHRAVRLIFTI
6 | MIVYFLFWAPYNIVLLLNTFQEFFGLNNCSSSNRLDQAMQVTETLGMTHCCINPIIYAFV
7 | GEKFRNYLLVFFQKHIAKRFCKCCSIFQQEAPERASSVYTRSTGEQEISVGL
8 | 
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | Unrecognized_molecules.txt
 2 | rotabase.txt
 3 | Rplots.pdf
 4 | .snakemake/**
 5 | /data/*
 6 | !/data/studies/
 7 | /data/studies/*/**
 8 | !/data/studies/*/*.R
 9 | !/data/studies/*/*.yaml
10 | !/data/graveyard/
11 | /data/graveyard/*/**
12 | !/data/graveyard/*/*.R
13 | !/data/graveyard/*/*.yaml
14 | /figures/**
15 | /meta/*
16 | !/meta/fasta/
17 | !/meta/clustering/
18 | /docs/dataset_ideas.md
19 | /docs/subtypes_draft.pdf
20 | /docs/MSB-2021-10305R-Data_Edited_Final.pdf
21 | .Rproj.user
22 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_cxcr4.fa:
--------------------------------------------------------------------------------
1 | >sp|P61073|CXCR4_HUMAN C-X-C chemokine receptor type 4 OS=Homo sapiens OX=9606 GN=CXCR4 PE=1 SV=1
2 | MEGISIYTSDNYTEEMGSGDYDSMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVI
3 | LVMGYQKKLRSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNL
4 | YSSVLILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEA
5 | DDRYICDRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSKGHQKRKALKT
6 | TVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPI
7 | LYAFLGAKFKTSAQHALTSVSRGSSLKILSKGKRGGHSSVSTESESSSFHSS
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_mapk1.fa:
--------------------------------------------------------------------------------
1 | >sp|P28482|MK01_HUMAN Mitogen-activated protein kinase 1 OS=Homo sapiens OX=9606 GN=MAPK1 PE=1 SV=3
2 | MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFE
3 | HQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQH
4 | LSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDH
5 | TGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHI
6 | LGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHK
7 | RIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS
8 | 
9 | 


--------------------------------------------------------------------------------
/bin/utils/setup_study_folder.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Setup a folder for a study with the given ID
 3 | 
 4 | # Config
 5 | proj_root="$HOME/phd/subtypes"
 6 | template_yaml="$proj_root/meta/study_template.yaml"
 7 | template_script="$proj_root/bin/standardise_study_template.R"
 8 | 
 9 | # Setup dir with yaml meta file and raw subdir
10 | mkdir "$1" || exit
11 | mkdir "$1/raw"
12 | cp "$template_yaml" "$1/$1.yaml" || echo "Couldn't copy template YAML"
13 | cp "$template_script" "$1/standardise_$1.R" || echo "Couldn't copy template R script"
14 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_adrb2.fa:
--------------------------------------------------------------------------------
 1 | >sp|P07550|ADRB2_HUMAN Beta-2 adrenergic receptor OS=Homo sapiens OX=9606 GN=ADRB2 PE=1 SV=3
 2 | MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAK
 3 | FERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTAS
 4 | IETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQE
 5 | AINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRF
 6 | HVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQD
 7 | NLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNT
 8 | GEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL
 9 | 
10 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_tdp43.fa:
--------------------------------------------------------------------------------
 1 | >sp|Q13148|TADBP_HUMAN TAR DNA-binding protein 43 OS=Homo sapiens OX=9606 GN=TARDBP PE=1 SV=1
 2 | MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNPVSQCMRGVRLVEGI
 3 | LHAPDAGWGNLVYVVNYPKDNKRKMDETDASSAVKVKRAVQKTSDLIVLGLPWKTTEQDL
 4 | KEYFSTFGEVLMVQVKKDLKTGHSKGFGFVRFTEYETQVKVMSQRHMIDGRWCDCKLPNS
 5 | KQSQDEPLRSRKVFVGRCTEDMTEDELREFFSQYGDVMDVFIPKPFRAFAFVTFADDQIA
 6 | QSLCGEDLIIKGISVHISNAEPKHNSNRQLERSGRFGGNPGGFGNQGGFGNSRGGGAGLG
 7 | NNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWGMMGMLASQQNQSGPSGNNQNQGNMQ
 8 | REPNQAFGSGNNSYSGSNSGAAIGWGSASNAGSGSGFNGGFGSSMDSKSSGWGM
 9 | 
10 | 


--------------------------------------------------------------------------------
/meta/fasta/p_aeruginosa_amie.fa:
--------------------------------------------------------------------------------
1 | >sp|P11436|AMIE_PSEAE Aliphatic amidase OS=Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1) OX=208964 GN=amiE PE=1 SV=2
2 | MRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEY
3 | SLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLV
4 | LIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAM
5 | KGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDG
6 | RTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAEC
7 | PFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA
8 | 
9 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_tp53.fa:
--------------------------------------------------------------------------------
 1 | >sp|P04637|P53_HUMAN Cellular tumor antigen p53 OS=Homo sapiens OX=9606 GN=TP53 PE=1 SV=4 NOTE: change P72R to match Giacomelli et al. 2018 seq
 2 | MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP
 3 | DEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK
 4 | SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE
 5 | RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS
 6 | SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP
 7 | PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG
 8 | GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD
 9 | 
10 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_tpk1/standardise_weile_2017_tpk1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Weile et al. 2017 (TPK1). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) # Values and names of AMINO_ACID_CODE swapped (three letter -> one letter lookup); not referenced again in this script
 7 | 
 8 | # Import and process data: study metadata from the YAML, scores from the MaveDB CSV via read_mavedb with transform_vamp_seq as the score transform (helpers presumably come from the sourced files - confirm)
 9 | meta <- read_yaml('data/studies/weile_2017_tpk1/weile_2017_tpk1.yaml')
10 | dm_data <- read_mavedb('data/studies/weile_2017_tpk1/raw/urn_mavedb_00000001-d-2_scores.csv', score_transform = transform_vamp_seq)
11 | 
12 | # Save output, passing the study and transform fields of the metadata to standardise_study
13 | standardise_study(dm_data, meta$study, meta$transform)
14 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_pten.fa:
--------------------------------------------------------------------------------
 1 | >sp|P60484|PTEN_HUMAN Phosphatidylinositol 3,4,5-trisphosphate 3-phosphatase and dual-specificity protein phosphatase PTEN OS=Homo sapiens OX=9606 GN=PTEN PE=1 SV=1
 2 | MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSK
 3 | HKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVA
 4 | AIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSY
 5 | LLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMY
 6 | FEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEI
 7 | DSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEAS
 8 | SSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV
 9 | 
10 | 


--------------------------------------------------------------------------------
/data/studies/sun_2018_cbs/standardise_sun_2018_cbs.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Sun et al. 2018 (CBS) (Preprint). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data. Scores are log2 transformed after adding the smallest positive score as a pseudocount, so a score of zero stays finite
 7 | meta <- read_yaml('data/studies/sun_2018_cbs/sun_2018_cbs.yaml')
 8 | dm_data <- read_mavedb('data/studies/sun_2018_cbs/raw/urn_mavedb_00000005-a-4_scores.csv', score_transform = function(x){log2(x + min(x[x > 0], na.rm = TRUE))}) %>%
 9 |   drop_na(position, score) # Drop rows missing a position or score (WT position and variants with no fitness)
10 | 
11 | # Save output, passing the study and transform fields of the metadata to standardise_study
12 | standardise_study(dm_data, meta$study, meta$transform)
13 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_calm1/standardise_weile_2017_calm1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Weile et al. 2017 (CALM1). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) # Values and names of AMINO_ACID_CODE swapped (three letter -> one letter lookup); not referenced again in this script
 7 | 
 8 | # Import and process data: study metadata from the YAML, scores from the MaveDB CSV via read_mavedb with transform_vamp_seq as the score transform (helpers presumably come from the sourced files - confirm)
 9 | meta <- read_yaml('data/studies/weile_2017_calm1/weile_2017_calm1.yaml')
10 | dm_data <- read_mavedb('data/studies/weile_2017_calm1/raw/urn_mavedb_00000001-c-2_scores.csv', score_transform = transform_vamp_seq)
11 | 
12 | # Save output, passing the study and transform fields of the metadata to standardise_study
13 | standardise_study(dm_data, meta$study, meta$transform)
14 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_sumo1/standardise_weile_2017_sumo1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Weile et al. 2017 (SUMO1). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) # Values and names of AMINO_ACID_CODE swapped (three letter -> one letter lookup); not referenced again in this script
 7 | 
 8 | # Import and process data: study metadata from the YAML, scores from the MaveDB CSV via read_mavedb with transform_vamp_seq as the score transform (helpers presumably come from the sourced files - confirm)
 9 | meta <- read_yaml('data/studies/weile_2017_sumo1/weile_2017_sumo1.yaml')
10 | dm_data <- read_mavedb('data/studies/weile_2017_sumo1/raw/urn_mavedb_00000001-b-1_scores.csv', score_transform = transform_vamp_seq)
11 | 
12 | # Save output, passing the study and transform fields of the metadata to standardise_study
13 | standardise_study(dm_data, meta$study, meta$transform)
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |    Copyright 2020 EMBL - European Bioinformatics Institute
 2 | 
 3 |    Licensed under the Apache License, Version 2.0 (the "License");
 4 |    you may not use this file except in compliance with the License.
 5 |    You may obtain a copy of the License at
 6 | 
 7 |      http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 |    Unless required by applicable law or agreed to in writing, software
10 |    distributed under the License is distributed on an "AS IS" BASIS,
11 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 |    See the License for the specific language governing permissions and
13 |    limitations under the License.
14 | 
15 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_yap1.fa:
--------------------------------------------------------------------------------
 1 | >sp|P46937|YAP1_HUMAN Transcriptional coactivator YAP1 OS=Homo sapiens OX=9606 GN=YAP1 PE=1 SV=2
 2 | MDPGQQPPPQPAPQGQGQPPSQPPQGQGPPSGPGQPAPAATQAAPQAPPAGHQIVHVRGD
 3 | SETDLEALFNAVMNPKTANVPQTVPMRLRKLPDSFFKPPEPKSHSRQASTDAGTAGALTP
 4 | QHVRAHSSPASLQLGAVSPGTLTPTGVVSGPAATPTAQHLRQSSFEIPDDVPLPAGWEMA
 5 | KTSSGQRYFLNHIDQTTTWQDPRKAMLSQMNVTAPTSPPVQQNMMNSASGPLPDGWEQAM
 6 | TQDGEIYYINHKNKTTSWLDPRLDPRFAMNQRISQSAPVKQPPPLAPQSPQGGVMGGSNS
 7 | NQQQQMRLQQLQMEKERLRLKQQELLRQAMRNINPSTANSPKCQELALRSQLPTLEQDGG
 8 | TQNPVSSPGMSQELRTMTTNSSDPFLNSGTYHSRDESTDSGLSMSSYSVPRTPDDFLNSV
 9 | DEMDTGDTINQSTLPSQQNRFPDYLEAIPGTNVDLGTLEGDGMNIEGEELMPSLQEALSS
10 | DILNDMESVLAATKLDKESFLTWL
11 | 


--------------------------------------------------------------------------------
/data/studies/ahler_2019_src/standardise_ahler_2019_src.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Ahler et al. 2019 (Src). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: two MaveDB score sets are read with different position offsets (1 and 269) and stacked into one table, using activity_score as the score column. NOTE(review): presumably each set covers a different region of Src - confirm the offsets against the score files
 7 | meta <- read_yaml('data/studies/ahler_2019_src/ahler_2019_src.yaml')
 8 | dm_data <- bind_rows(read_mavedb('data/studies/ahler_2019_src/raw/urn_mavedb_00000041-b-1_scores.csv', position_offset = 1, score_col = activity_score),
 9 |                      read_mavedb('data/studies/ahler_2019_src/raw/urn_mavedb_00000041-a-1_scores.csv', position_offset = 269, score_col = activity_score))
10 | 
11 | # Save output, passing the study and transform fields of the metadata to standardise_study
12 | standardise_study(dm_data, meta$study, meta$transform)
13 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_ube2i/standardise_weile_2017_ube2i.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Weile et al. 2017 (UBE2I). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) # Values and names of AMINO_ACID_CODE swapped (three letter -> one letter lookup); not referenced again in this script
 7 | 
 8 | # Import and process data: study metadata from the YAML, scores from the MaveDB CSV via read_mavedb with transform_vamp_seq as the score transform
 9 | meta <- read_yaml('data/studies/weile_2017_ube2i/weile_2017_ube2i.yaml')
10 | dm_data <- read_mavedb('data/studies/weile_2017_ube2i/raw/urn_mavedb_00000001-a-1_scores.csv', score_transform = transform_vamp_seq) %>%
11 |   filter(position != 159) # Drop position not in WT protein (meta/fasta/h_sapiens_ube2i.fa is 158 residues long)
12 | 
13 | # Save output, passing the study and transform fields of the metadata to standardise_study
14 | standardise_study(dm_data, meta$study, meta$transform)
15 | 


--------------------------------------------------------------------------------
/data/studies/kelsic_2016_infa/kelsic_2016_infa.yaml:
--------------------------------------------------------------------------------
 1 | study: 'kelsic_2016_infa' 
 2 | gene: 'infA'
 3 | uniprot_id: 'P69222'
 4 | gene_type: 'Translation'
 5 | species: 'E. coli'
 6 | seq: "MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPY\
 7 |       DLSKGRIVFRSR" 
 8 | experiment: 'Growth'
 9 | transform: 'VAMP-seq'
10 | authour: 'Kelsic et al.'
11 | year: 2016
12 | title: 'RNA Structural Determinants of Optimal Codons Revealed by MAGE-Seq'
13 | lab: ['Kishony']
14 | doi: '10.1016/j.cels.2016.11.004'
15 | pmid: '28009265'
16 | url: 'https://www.sciencedirect.com/science/article/pii/S2405471216303684'
17 | input_files:
18 |   - 'cels_206_mmc5.csv' 
19 | source: 'SI'
20 | qc:
21 |   filter: False
22 |   notes:
23 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_dbr1.fa:
--------------------------------------------------------------------------------
 1 | >sp|Q9UK59|DBR1_HUMAN Lariat debranching enzyme OS=Homo sapiens OX=9606 GN=DBR1 PE=1 SV=2
 2 | MRVAVAGCCHGELDKIYETLALAERRGPGPVDLLLCCGDFQAVRNEADLRCMAVPPKYRH
 3 | MQTFYRYYSGEKKAPVLTLFIGGNHEASNHLQELPYGGWVAPNIYYLGLAGVVKYRGVRI
 4 | GGISGIFKSHDYRKGHFECPPYNSSTIRSIYHVRNIEVYKLKQLKQPIDIFLSHDWPRSI
 5 | YHYGNKKQLLKTKSFFRQEVENNTLGSPAASELLEHLKPTYWFSAHLHVKFAALMQHQAK
 6 | DKGQTARATKFLALDKCLPHRDFLQILEIEHDPSAPDYLEYDIEWLTILRATDDLINVTG
 7 | RLWNMPENNGLHARWDYSATEEGMKEVLEKLNHDLKVPCNFSVTAACYDPSKPQTQMQLI
 8 | HRINPQTTEFCAQLGIIDINVRLQKSKEEHHVCGEYEEQDDVESNDSGEDQSEYNTDTSA
 9 | LSSINPDEIMLDEEEDEDSIVSAHSGMNTPSVEPSDQASEFSASFSDVRILPGSMIVSSD
10 | DTVDSTIDREGKPGGTVESGNGEDLTKVPLKRLSDEHEPEQRKKIKRRNQAIYAAVDDDD
11 | DDAA
12 | 
13 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_src.fa:
--------------------------------------------------------------------------------
 1 | >sp|P12931|SRC_HUMAN Proto-oncogene tyrosine-protein kinase Src OS=Homo sapiens OX=9606 GN=SRC PE=1 SV=3
 2 | MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADGHRGPSAAFAPAAAE
 3 | PKLFGGFNSSDTVTSPQRAGPLAGGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGD
 4 | WWLAHSLSTGQTGYIPSNYVAPSDSIQAEEWYFGKITRRESERLLLNAENPRGTFLVRES
 5 | ETTKGAYCLSVSDFDNAKGLNVKHYKIRKLDSGGFYITSRTQFNSLQQLVAYYSKHADGL
 6 | CHRLTTVCPTSKPQTQGLAKDAWEIPRESLRLEVKLGQGCFGEVWMGTWNGTTRVAIKTL
 7 | KPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKY
 8 | LRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYT
 9 | ARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVER
10 | GYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFLEDYFTSTEPQYQPGENL
11 | 
12 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_cbs.fa:
--------------------------------------------------------------------------------
 1 | >sp|P35520|CBS_HUMAN Cystathionine beta-synthase OS=Homo sapiens OX=9606 GN=CBS PE=1 SV=2
 2 | MPSETPQAEVGPTGCPHRSGPHSAKGSLEKGSPEDKEAKEPLWIRPDAPSRCTWQLGRPA
 3 | SESPHHHTAPAKSPKILPDILKKIGDTPMVRINKIGKKFGLKCELLAKCEFFNAGGSVKD
 4 | RISLRMIEDAERDGTLKPGDTIIEPTSGNTGIGLALAAAVRGYRCIIVMPEKMSSEKVDV
 5 | LRALGAEIVRTPTNARFDSPESHVGVAWRLKNEIPNSHILDQYRNASNPLAHYDTTADEI
 6 | LQQCDGKLDMLVASVGTGGTITGIARKLKEKCPGCRIIGVDPEGSILAEPEELNQTEQTT
 7 | YEVEGIGYDFIPTVLDRTVVDKWFKSNDEEAFTFARMLIAQEGLLCGGSAGSTVAVAVKA
 8 | AQELQEGQRCVVILPDSVRNYMTKFLSDRWMLQKGFLKEEDLTEKKPWWWHLRVQELGLS
 9 | APLTVLPTITCGHTIEILREKGFDQAPVVDEAGVILGMVTLGNMLSSLLAGKVQPSDQVG
10 | KVIYKQFKQIRLTDTLGRLSHILEMDHFALVVHEQIQYHSTGKSSQRQMVFGVVTAIDLL
11 | NFVAAQERDQK
12 | 
13 | 


--------------------------------------------------------------------------------
/meta/final_subtypes.yaml:
--------------------------------------------------------------------------------
 1 | desc: "Final subtypes assembly: \
 2 |        The final subtypes set is assembled by combining two runs, \
 3 |        both using hybrid dynamic treecutting on the dendrogram produced \
 4 |        by hierarchical clustering. The cosine distance is used along with \
 5 |        filtering positions where all |ER| < 0.4. The two runs are combined, with \
 6 |        the deepSplit treecutting parameter chosen separately for each AA, as below."
 7 | deepSplit:
 8 |   "A": 0
 9 |   "C": 0
10 |   "D": 1
11 |   "E": 0
12 |   "F": 1
13 |   "G": 1
14 |   "H": 0
15 |   "I": 0
16 |   "K": 1
17 |   "L": 0
18 |   "M": 0
19 |   "N": 0
20 |   "P": 1
21 |   "Q": 1
22 |   "R": 1
23 |   "S": 0
24 |   "T": 1
25 |   "V": 0
26 |   "W": 0
27 |   "Y": 1
28 | 


--------------------------------------------------------------------------------
/data/pdb/README.md:
--------------------------------------------------------------------------------
 1 | # Structures and Models from SwissMODEL
 2 | 
 3 | These protein structure PDB files were sourced from [SwissMODEL](https://swissmodel.expasy.org).
 4 | The model or structure chosen for each protein is documented in `meta/structures.yaml`,
 5 | which gives the template ID chosen for each protein and the type of structure model (x-ray
 6 | crystallography, homology model etc.). The UniProt ID of each protein is available in
 7 | the study YAML config. Those proteins denoted as "model" are homology models from
 8 | SwissMODEL ([CC BY-SA 4.0 Creative Commons Attribution-ShareAlike 4.0 International License](https://swissmodel.expasy.org/docs/terms_of_use)) 
 9 | while the others are experimental structures from the PDB, as cited in the references.
10 | 


--------------------------------------------------------------------------------
/data/studies/ashenberg_2017_np/standardise_ashenberg_2017_np.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Ashenberg et al. 2017 (Flu NP). All paths are relative, so run from the directory containing src/ and data/
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: the diffsel column is taken as the raw score and used untransformed, then normalised with normalise_score; variant class is derived from the wt and mut columns
 7 | meta <- read_yaml('data/studies/ashenberg_2017_np/ashenberg_2017_np.yaml')
 8 | dm_data <- read_csv('data/studies/ashenberg_2017_np/raw/journal.ppat.1006288.s013.csv') %>%
 9 |   rename(position = site, raw_score = diffsel) %>%
10 |   mutate(transformed_score = raw_score,
11 |          score = normalise_score(transformed_score),
12 |          class = get_variant_class(wt, mut)) %>%
13 |   drop_na(score) # Drop rows without a score (some muts just aren't measured)
14 | 
15 | # Save output, passing the study and transform fields of the metadata to standardise_study
16 | standardise_study(dm_data, meta$study, meta$transform)
17 | 


--------------------------------------------------------------------------------
/meta/fasta/H3N2_A_Aichi_2_1968_np.fa:
--------------------------------------------------------------------------------
 1 | >gb:CY121120|ncbiId:AFM71861.1|UniProtKB:I6TAH8|Organism:Influenza A virus (A/Aichi/2/1968(H3N2))|Strain Name:A/Aichi/2/1968|Protein Name:NP Nucleoprotein|Gene Symbol:NP|Segment:5|Subtype:H3N2|Host:Human
 2 | MASQGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTELKLSDYEGRLIQNSLTIERMVLSA
 3 | FDERRNKYLEEHPSAGKDPKKTGGPIYKRVDRKWMRELVLYDKEEIRRIWRQANNGDDATAGLTHMMIWH
 4 | SNLNDTTYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTMVMELIRMIKRGINDRNFWRGE
 5 | NGRKTRSAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCLPACV
 6 | YGPAVASGYDFEKEGYSLVGIDPFKLLQNSQVYSLIRPNENPAHKSQLVWMACNSAAFEDLRVLSFIRGT
 7 | KVSPRGKLSTRGVQIASNENMDAMESSTLELRSRYWAIRTRSGGNTNQQRASAGQISVQPAFSVQRNLPF
 8 | DKPTIMAAFTGNTEGRTSDMRAEIIRMMEGAKPEEMSFQGRGVFELSDERAANPIVPSFDMSNEGSYFFG
 9 | DNAEEYDN
10 | 
11 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_sumo1/weile_2017_sumo1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'weile_2017_sumo1' 
 2 | gene: 'SUMO1'
 3 | uniprot_id: 'P63165'
 4 | gene_type: 'PTM'
 5 | species: 'H. sapiens'
 6 | seq: "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMN\
 7 |       SLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV"
 8 | experiment: 'Complement'
 9 | transform: 'VAMP-seq'
10 | authour: 'Weile et al.'
11 | year: 2017
12 | title: 'A framework for exhaustively mapping functional missense variants'
13 | lab: ['Roth', 'Fowler']
14 | doi: '10.15252/msb.20177908'
15 | pmid: '29269382'
16 | url: 'http://msb.embopress.org/content/13/12/957'
17 | input_files:
18 |   - 'urn_mavedb_00000001-b-1_scores.csv' 
19 | source: 'MaveDB'
20 | mavedb_urn: 'urn:mavedb:00000001-b'
21 | qc:
22 |   filter: False
23 |   notes:
24 | 


--------------------------------------------------------------------------------
/meta/fasta/strep_protein_g_precursor.fa:
--------------------------------------------------------------------------------
 1 | >sp|P19909|SPG2_STRSG Immunoglobulin G-binding protein G OS=Streptococcus sp. group G OX=1320 GN=spg PE=1 SV=1
 2 | MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRNGGELTNLLGNSETT
 3 | LALRNEESATADLTAAAVADTVAAAAAENAGAAAWEAAAAADALAKAKADALKEFNKYGV
 4 | SDYYKNLINNAKTVEGVKDLQAQVVESAKKARISEATDGLSDFLKSQTPAEDTVKSIELA
 5 | EAKVLANRELDKYGVSDYHKNLINNAKTVEGVKDLQAQVVESAKKARISEATDGLSDFLK
 6 | SQTPAEDTVKSIELAEAKVLANRELDKYGVSDYYKNLINNAKTVEGVKALIDEILAALPK
 7 | TDTYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEKPE
 8 | VIDASELTPAVTTYKLVINGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDAT
 9 | KTFTVTEKPEVIDASELTPAVTTYKLVINGKTLKGETTTKAVDAETAEKAFKQYANDNGV
10 | DGVWTYDDATKTFTVTEMVTEVPGDAPTEPEKPEASIPLVPLTPATPIAKDDAKKDDTKK
11 | EDAKKPEAKKEDAKKAETLPTTGEGSNPFFTAAALAVMAGAGALAVASKRKED
12 | 


--------------------------------------------------------------------------------
/data/studies/melamed_2013_pab1/standardise_melamed_2013_pab1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Melamed et al. 2013 (PAB1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the raw spreadsheet, reshape to long form, derive scores
 7 | meta <- read_yaml('data/studies/melamed_2013_pab1/melamed_2013_pab1.yaml')
 8 | dm_data <- read_xlsx('data/studies/melamed_2013_pab1/raw/Supplementary_Table_2.xlsx') %>%
 9 |   rename(wt = WT_aa) %>%
10 |   gather(key = 'mut', value = 'raw_score', -position, -wt) %>% # wide -> long: every column except position and wt becomes a (mut, raw_score) row
11 |   mutate(transformed_score = raw_score, # no transformation: copied unchanged from raw_score
12 |          score = normalise_score(transformed_score),
13 |          class = get_variant_class(wt, mut)) %>%
14 |   drop_na(score) # Not all measured
15 | 
16 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
17 | standardise_study(dm_data, meta$study, meta$transform)
18 | 
19 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_pab1.fa:
--------------------------------------------------------------------------------
 1 | >sp|P04147|PABP_YEAST Polyadenylate-binding protein, cytoplasmic and nuclear OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=PAB1 PE=1 SV=4
 2 | MADITDKTAEQLENLNIQDDQKQAATGSESQSVENSSASLYVGDLEPSVSEAHLYDIFSP
 3 | IGSVSSIRVCRDAITKTSLGYAYVNFNDHEAGRKAIEQLNYTPIKGRLCRIMWSQRDPSL
 4 | RKKGSGNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAK
 5 | EAIDALNGMLLNGQEIYVAPHLSRKERDSQLEETKAHYTNLYVKNINSETTDEQFQELFA
 6 | KFGPIVSASLEKDADGKLKGFGFVNYEKHEDAVKAVEALNDSELNGEKLYVGRAQKKNER
 7 | MHVLKKQYEAYRLEKMAKYQGVNLFVKNLDDSVDDEKLEEEFAPYGTITSAKVMRTENGK
 8 | SKGFGFVCFSTPEEATKAITEKNQQIVAGKPLYVAIAQRKDVRRSQLAQQIQARNQMRYQ
 9 | QATAAAAAAAAGMPGQFMPPMFYGVMPPRGVPFNGPNPQQMNPMGGMPKNGMPPQFRNGP
10 | VYGVPPQGGFPRNANDNNQFYQQKQRQALGEQLYKKVSAKTSNEEAAGKITGMILDLPPQ
11 | EVFPLLESDELFEQHYKEASAAYESFKKEQEQQTEQA
12 | 
13 | 


--------------------------------------------------------------------------------
/bin/data_processing/standardise_study_template.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from STUDY
 3 | 
 4 | ## README - delete in real script versions
 5 | # Template script for standardising study data
 6 | # Saves a standardised tsv /data/studies/{study}/{study}.tsv
 7 | # With columns: position, wt, mut, score (which is the normalised score, all this processing is done here)
 8 | # Transform score such that -1 = NULL, 0 = WT, +ve = beneficial
 9 | # Tibble passed to standardise_study() must have at least columns position, wt, mut, score, raw_score
10 | 
11 | source('src/config.R')
12 | source('src/study_standardising.R')
13 | 
14 | # Import and process data (both paths below are placeholders to replace in a real script)
15 | meta <- read_yaml('IMPORT META YAML') # placeholder: path to the study's YAML config
16 | dm_data <- read_csv('IMPORT STUDY DATA') # placeholder: path to the raw data; add the processing pipeline here
17 | 
18 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
19 | standardise_study(dm_data, meta$study, meta$transform)


--------------------------------------------------------------------------------
/data/studies/roscoe_2013_ubi/roscoe_2013_ubi.yaml:
--------------------------------------------------------------------------------
 1 | study: 'roscoe_2013_ubi' 
 2 | gene: 'UBI'
 3 | uniprot_id: 'P0CG63'
 4 | gene_type: 'PTM'
 5 | species: 'S. cerevisiae'
 6 | seq: "MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN\
 7 |       IQKESTLHLVLRLRGG"
 8 | experiment: 'Growth'
 9 | transform: 'None'
10 | authour: 'Roscoe et al.'
11 | year: 2013
12 | title: 'Analyses of the effects of all ubiquitin point mutants on yeast growth rate'
13 | lab: ['Bolon']
14 | doi: '10.1016/j.jmb.2013.01.032'
15 | pmid: '23376099'
16 | url: 'https://www.sciencedirect.com/science/article/pii/S0022283613000636'
17 | notes: 'Uniprot ID is for UBI4, took first repeat of sequence from here as very conserved'
18 | input_files:
19 |   - '1-s2.0-S0022283613000636-mmc3.xlsx' 
20 | source: 'SI - Table S2'
21 | qc:
22 |   filter: False
23 |   notes:
24 | 


--------------------------------------------------------------------------------
/meta/fasta/H3N2_A_Perth_16_2009_ha.fa:
--------------------------------------------------------------------------------
 1 | >gb:KJ609206|ncbiId:AHX37629.1|Organism:Influenza A virus (A/Perth/16/2009(H3N2))|Strain Name:A/Perth/16/2009|Protein Name:HA Hemagglutinin|Gene Symbol:HA|Segment:4|Subtype:H3N2|Host:Human
 2 | MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSSSTGEICDS
 3 | PHQILDGKNCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNES
 4 | FNWTGVTQNGTSSACIRRSKNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVLHPGTDKDQIFL
 5 | YAQASGRITVSTKRSQQTVSPNIGSRPRVRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGK
 6 | SSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNVPEKQTRGIFGA
 7 | IAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEG
 8 | RIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCD
 9 | NACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNI
10 | RCNICI
11 | 
12 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_calm1/weile_2017_calm1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'weile_2017_calm1' 
 2 | gene: 'CALM1'
 3 | uniprot_id: 'P0DP23'
 4 | gene_type: 'Regulatory'
 5 | species: 'H. sapiens'
 6 | seq: "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADG\
 7 |       NGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDE\
 8 |       EVDEMIREADIDGDGQVNYEEFVQMMTAK" 
 9 | experiment: 'Complement'
10 | transform: 'VAMP-seq'
11 | authour: 'Weile et al.'
12 | year: 2017
13 | title: 'A framework for exhaustively mapping functional missense variants'
14 | lab: ['Roth', 'Fowler']
15 | doi: '10.15252/msb.20177908'
16 | pmid: '29269382'
17 | url: 'http://msb.embopress.org/content/13/12/957'
18 | input_files:
19 |   - 'urn_mavedb_00000001-c-2_scores.csv' 
20 | source: 'MaveDB'
21 | mavedb_urn: 'urn:mavedb:00000001-c-2'
22 | qc:
23 |   filter: False
24 |   notes:
25 | 


--------------------------------------------------------------------------------
/data/studies/bandaru_2017_ras/bandaru_2017_ras.yaml:
--------------------------------------------------------------------------------
 1 | study: 'bandaru_2017_ras' 
 2 | gene: 'Ras'
 3 | uniprot_id: 'P01112'
 4 | gene_type: 'GTPase'
 5 | species: 'H. sapiens'
 6 | seq: "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG\
 7 |       QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDL\
 8 |       AARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPG\
 9 |       CMSCKCVLS" 
10 | experiment: 'Coupled growth'
11 | transform: 'x/ln(2)'
12 | authour: 'Bandaru et al.'
13 | year: 2017
14 | title: 'Deconstruction of the Ras switching cycle through saturation mutagenesis'
15 | lab: ['Kuriyan', 'Valencia']
16 | doi: '10.7554/eLife.27810'
17 | pmid: '28686159'
18 | url: 'https://elifesciences.org/articles/27810'
19 | input_files:
20 |   - 'elife-27810-supp1-v2.xlsx' 
21 | source: 'SI'
22 | qc:
23 |   filter: False
24 |   notes:
25 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_ube2i/weile_2017_ube2i.yaml:
--------------------------------------------------------------------------------
 1 | study: 'weile_2017_ube2i'
 2 | gene: 'UBE2I'
 3 | uniprot_id: 'P63279'
 4 | gene_type: 'E2 Conjugase'
 5 | species: 'H. sapiens'
 6 | seq: "MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKL\
 7 |       RMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELL\
 8 |       NEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS" 
 9 | experiment: 'Complement'
10 | transform: 'VAMP-seq'
11 | authour: 'Weile et al.'
12 | year: 2017
13 | title: 'A framework for exhaustively mapping functional missense variants'
14 | lab: ['Roth', 'Fowler']
15 | doi: '10.15252/msb.20177908'
16 | pmid: '29269382'
17 | url: 'http://msb.embopress.org/content/13/12/957'
18 | input_files:
19 |   - 'urn_mavedb_00000001-a-1_scores.csv' 
20 | source: 'MaveDB'
21 | mavedb_urn: 'urn:mavedb:00000001-a'
22 | qc:
23 |   filter: False
24 |   notes:
25 | 


--------------------------------------------------------------------------------
/data/studies/bolognesi_2019_tdp43/standardise_bolognesi_2019_tdp43.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Bolognesi et al. 2019 (TDP43)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the '1 AA change' sheet of the raw workbook, then derive scores
 7 | meta <- read_yaml('data/studies/bolognesi_2019_tdp43/bolognesi_2019_tdp43.yaml')
 8 | dm_data <- read_xlsx('data/studies/bolognesi_2019_tdp43/raw/41467_2019_12101_MOESM7_ESM.xlsx', sheet = '1 AA change') %>%
 9 |   rename(position = Pos_abs, wt = WT_AA, mut = Mut, raw_score = toxicity) %>%
10 |   mutate(transformed_score = raw_score / log(2), # log() is the natural log in R, so this is x/ln(2)
11 |          score = normalise_score(transformed_score),
12 |          class = get_variant_class(wt, mut)) %>%
13 |   select(position, wt, mut, score, transformed_score, raw_score, class) # keep only these seven columns
14 | 
15 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
16 | standardise_study(dm_data, meta$study, meta$transform)
17 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/validate_kitzman_2015_gal4.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Validate the selection experiment combination method for Kitzman et al. 2015 (GAL4)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | dir.create('figures/0_data/per_study/kitzman_2015_gal4', recursive = TRUE, showWarnings = FALSE) # create missing parents; stay quiet if it already exists
 7 | 
 8 | # Import and process data: read every sheet of the workbook, stack them and spread to one column per label
 9 | path <- 'data/studies/kitzman_2015_gal4/raw/41592_2015_BFnmeth3223_MOESM306_ESM.xlsx'
10 | dm_data <- lapply(excel_sheets(path), read_kitzman_sheet, path = path) %>%
11 |   bind_rows(.) %>%
12 |   spread(key = 'label', value = 'log2_enrichment') %>%
13 |   filter(mut != 'delInFrame') # drop rows whose mut is 'delInFrame'
14 | 
15 | # Plot variants: pairwise comparison of the six selection/time-point columns, saved as a 25 x 25 cm PDF
16 | p <- ggpairs(dm_data, columns = c('NONSEL_24h', 'SEL_A_24h', 'SEL_A_40h', 'SEL_B_40h', 'SEL_C_40h', 'SEL_C_64h'))
17 | ggsave('figures/0_data/per_study/kitzman_2015_gal4/validate_selection_combination.pdf', p, units = 'cm', width = 25, height = 25)


--------------------------------------------------------------------------------
/data/studies/hietpas_2011_hsp90/standardise_hietpas_2011_hsp90.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Hietpas et al. 2011 (HSP90) 
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the raw CSV, average replicate rows, then derive scores
 7 | meta <- read_yaml('data/studies/hietpas_2011_hsp90/hietpas_2011_hsp90.yaml')
 8 | dm_data <- read_csv('data/studies/hietpas_2011_hsp90/raw/sd02.csv', skip = 5) %>% # skip the first 5 lines of the file
 9 |   rename(mut = aa, raw_score = s) %>%
10 |   mutate(wt = str_split(meta$seq, '')[[1]][position]) %>% # wild-type residue: character of the YAML sequence at index position
11 |   group_by(position, wt, mut) %>% # Average over codons
12 |   summarise(raw_score = mean(raw_score)) %>%
13 |   ungroup() %>%
14 |   mutate(transformed_score = raw_score, # no transformation: copied unchanged from raw_score
15 |          score = normalise_score(transformed_score),
16 |          class = get_variant_class(wt, mut))
17 | 
18 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
19 | standardise_study(dm_data, meta$study, meta$transform)
20 | 
21 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_hsc82.fa:
--------------------------------------------------------------------------------
 1 | >tr|A0A140HDC6|A0A140HDC6_YEASX Hsp90 family chaperone OS=Saccharomyces cerevisiae OX=4932 GN=HSC82 PE=3 SV=1
 2 | MAGETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYQALSDPKQLETEP
 3 | DLFIRITPKPEEKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEALSAGADVSMIGQF
 4 | GVGFYSLFLVADRVQVISKNNEDEQYIWESNAGGSFTVTLDEVNERIGRGTVLRLFLKDD
 5 | QLEYLEEKRIKEVIKRHSEFVAYPIQLLVTKEVEKEVPIPEEEKKDEEKKDEDDKKPKLE
 6 | EVDEEEEEKKPKTKKVKEEVQELEELNKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPL
 7 | YVKHFSVEGQLEFRAILFIPKRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSF
 8 | VKGVVDSEDLPLNLSREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFDKFYSAFAKN
 9 | IKLGVHEDTQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGEFLKAVEK
10 | SPFLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAEREKEI
11 | KEYEPLTKALKDILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMKAQALRDSSMS
12 | SYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTNLLFETALLTSGFSLEEPTSF
13 | ASRINRLISLGLNIDEDEETETAPEASTEAPVEEVPADTEMEEVD
14 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_ste12.fa:
--------------------------------------------------------------------------------
 1 | >sp|P13574|STE12_YEAST Protein STE12 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=STE12 PE=1 SV=1
 2 | MKVQITNSRTEEILKVQANNENDEVSKATPGEVEESLRLIGDLKFFLATAPVNWQENQII
 3 | RRYYLNSGQGFVSCVFWNNLYYITGTDIVKCCLYRMQKFGREVVQKKKFEEGIFSDLRNL
 4 | KCGIDATLEQPKSEFLSFLFRNMCLKTQKKQKVFFWFSVAHDKLFADALERDLKRESLNQ
 5 | PSTTKPVNEPALSFSYDSSSDKPLYDQLLQHLDSRRPSSTTKSDNSPPKLESENFKDNEL
 6 | VTVTNQPLLGVGLMDDDAPESPSQINDFIPQKLIIEPNTLELNGLTEETPHDLPKNTAKG
 7 | RDEEDFPLDYFPVSVEYPTEENAFDPFPPQAFTPAAPSMPISYDNVNERDSMPVNSLLNR
 8 | YPYQLSVAPTFPVPPSSSRQHFMTNRDFYSSNNNKEKLVSPSDPTSYMKYDEPVMDFDES
 9 | RPNENCTNAKSHNSGQQTKQHQLYSNNFQQSYPNGMVPGYYPKMPYNPMGGDPLLDQAFY
10 | GADDFFFPPEGCDNNMLYPQTATSWNVLPPQAMQPAPTYVGRPYTPNYRSTPGSAMFPYM
11 | QSSNSMQWNTAVSPYSSRAPSTTAKNYPPSTFYSQNINQYPRRRTVGMKSSQGNVPTGNK
12 | QSVGKSAKISKPLHIKTSAYQKQYKINLETKARPSAGDEDSAHPDKNKEISMPTPDSNTL
13 | VVQSEEGGAHSLEVDTNRRSDKNLPDAT
14 | 
15 | 


--------------------------------------------------------------------------------
/data/studies/bandaru_2017_ras/standardise_bandaru_2017_ras.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Bandaru et al. 2017 (Ras)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the raw workbook, transpose and reshape to long form, derive scores
 7 | meta <- read_yaml('data/studies/bandaru_2017_ras/bandaru_2017_ras.yaml')
 8 | dm_data <- read_xlsx('data/studies/bandaru_2017_ras/raw/elife-27810-supp1-v2.xlsx') %>%
 9 |   mutate(...1 = replace_na(...1, 'wt')) %>% # label the missing entry of column `...1` as 'wt' (presumably the unnamed first column - confirm)
10 |   transpose_tibble(col_names = ...1, id_col = 'position') %>% # project helper, not visible here; later steps rely on columns position, wt and A..Y
11 |   mutate_at(vars(-wt), as.numeric) %>% # every column except wt becomes numeric
12 |   mutate(position = as.integer(position)) %>%
13 |   pivot_longer(A:Y, names_to = 'mut', values_to = 'raw_score') %>% # columns A through Y become (mut, raw_score) rows
14 |   mutate(transformed_score = raw_score/log(2), # log() is the natural log in R, so this is x/ln(2)
15 |          score = normalise_score(transformed_score),
16 |          class = get_variant_class(wt, mut))
17 | 
18 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
19 | standardise_study(dm_data, meta$study, meta$transform)
20 | 


--------------------------------------------------------------------------------
/src/continuous.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Functions for characterising Deep Mutational Scanning positions based on continuous ER gradient
 3 | 
 4 | # Process the various factors
 5 | get_factor_type <- function(x){
 6 |   # Classify each factor name; rules are applied in order, so a later match overrides an earlier one
 7 |   types <- rep(NA, length(x))
 8 |   types <- replace(types, x == 'all_atom_abs', 'SA')
 9 |   types <- replace(types, x %in% names(FOLDX_TERMS), 'FoldX')
10 |   types <- replace(types, str_starts(x, 'ss_'), 'DSSP')
11 |   types <- replace(types, str_starts(x, 'within_10_0'), 'Chem. Env.')
12 |   types # names matching no rule stay NA
13 | }
14 | 
15 | pretty_factors <- function(x){
16 |   # Display label for each factor name in x; names get_factor_type() cannot classify stay NA
17 |   out <- rep(NA, length(x))
18 |   typ <- get_factor_type(x)
19 |   # Compare against the labels get_factor_type() returns; which() drops NA types so subscripts stay NA-free
20 |   out[which(typ == 'SA')] <- 'All Atom Abs.'
21 |   out[which(typ == 'FoldX')] <- FOLDX_TERMS[x[which(typ == 'FoldX')]]
22 |   out[which(typ == 'DSSP')] <- DSSP_CLASSES_STR[str_sub(x[which(typ == 'DSSP')], start = 4)] # drop the 3-character 'ss_' prefix
23 |   out[which(typ == 'Chem. Env.')] <- str_sub(x[which(typ == 'Chem. Env.')], start = -1) # keep only the last character
24 |   return(out)
25 | }
26 | 


--------------------------------------------------------------------------------
/data/studies/steinberg_2016_tem1/standardise_steinberg_2016_tem1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Steinberg & Ostermeier 2016 (TEM1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the raw workbook, tidy the column names, then derive scores
 7 | meta <- read_yaml('data/studies/steinberg_2016_tem1/steinberg_2016_tem1.yaml')
 8 | dm_data <- read_xlsx('data/studies/steinberg_2016_tem1/raw/1-s2.0-S0022283616301450-mmc2.xlsx', trim_ws = TRUE) %>%
 9 |   rename_all(~str_replace_all(str_to_lower(.), ' ', '_')) %>% # lower-case the column names and replace spaces with underscores
10 |   drop_na(codon_position) %>% # discard rows without a codon position
11 |   select(position = codon_position, wt = wt_aa, mut = mutant_aa, raw_score = tem1_amp_fitness) %>%
12 |   mutate(transformed_score = log2(raw_score), # log2 transform of the raw fitness
13 |          score = normalise_score(transformed_score),
14 |          class = get_variant_class(wt, mut)) %>%
15 |   drop_na(score) # Not all measured
16 | 
17 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
18 | standardise_study(dm_data, meta$study, meta$transform)
19 | 


--------------------------------------------------------------------------------
/data/studies/matreyek_2018_pten/standardise_matreyek_2018_pten.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Matreyek et al. 2018 (PTEN)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the raw CSV, recode columns, then derive scores
 7 | meta <- read_yaml('data/studies/matreyek_2018_pten/matreyek_2018_pten.yaml')
 8 | dm_data <- read_csv('data/studies/matreyek_2018_pten/raw/PTEN.csv',
 9 |                     col_types = cols(.default = col_character(), position = col_integer(), score = col_double())) %>% # all columns read as text except position and score
10 |   select(-X1) %>% # drop column X1 (presumably the unnamed index column of the CSV - confirm)
11 |   rename(wt = start, mut = end, raw_score = score) %>%
12 |   mutate(mut = if_else(mut == 'X', '*', mut), # recode mutant 'X' as '*'
13 |          class = str_to_title(class), # class comes from the file, converted to title case
14 |          transformed_score = transform_vamp_seq(raw_score), # project helper, not visible here
15 |          score = normalise_score(transformed_score)) %>%
16 |   drop_na(score) # Not all measured
17 | 
18 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
19 | standardise_study(dm_data, meta$study, meta$transform)
20 | 
21 | 


--------------------------------------------------------------------------------
/data/studies/matreyek_2018_tpmt/standardise_matreyek_2018_tpmt.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Matreyek et al. 2018 (TPMT)
 3 | 
 4 | source('src/config.R')
 5 | source('src/study_standardising.R')
 6 | 
 7 | # Import and process data: load the study YAML and the raw CSV, recode columns, then derive scores
 8 | meta <- read_yaml('data/studies/matreyek_2018_tpmt/matreyek_2018_tpmt.yaml')
 9 | dm_data <- read_csv('data/studies/matreyek_2018_tpmt/raw/TPMT.csv',
10 |                     col_types = cols(.default = col_character(), position = col_integer(), score = col_double())) %>% # all columns read as text except position and score
11 |   select(-X1) %>% # drop column X1 (presumably the unnamed index column of the CSV - confirm)
12 |   rename(wt = start, mut = end, raw_score = score) %>%
13 |   mutate(mut = if_else(mut == 'X', '*', mut), # recode mutant 'X' as '*'
14 |          class = str_to_title(class), # class comes from the file, converted to title case
15 |          transformed_score = transform_vamp_seq(raw_score), # project helper, not visible here
16 |          score = normalise_score(transformed_score)) %>%
17 |   drop_na(score) # not all measured
18 | 
19 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
20 | standardise_study(dm_data, meta$study, meta$transform)
21 | 
22 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_hsp90.fa:
--------------------------------------------------------------------------------
 1 | >sp|P02829|HSP82_YEAST ATP-dependent molecular chaperone HSP82 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=HSP82 PE=1 SV=1
 2 | MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPKQLETEP
 3 | DLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEALSAGADVSMIGQF
 4 | GVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDEVNERIGRGTILRLFLKDD
 5 | QLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKEVPIPEEEKKDEEKKDEEKKDEDDK
 6 | KPKLEEVDEEEEKKPKTKKVKEEVQEIEELNKTKPLWTRNPSDITQEEYNAFYKSISNDW
 7 | EDPLYVKHFSVEGQLEFRAILFIPKRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPE
 8 | WLSFVKGVVDSEDLPLNLSREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSA
 9 | FSKNIKLGVHEDTQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLK
10 | AVEKSPFLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER
11 | EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMKAQALRD
12 | SSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYETALLTSGFSLDE
13 | PTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPADTEMEEVD
14 | 
15 | 


--------------------------------------------------------------------------------
/data/studies/weile_2017_tpk1/weile_2017_tpk1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'weile_2017_tpk1' 
 2 | gene: 'TPK1'
 3 | uniprot_id: 'Q9H3S4'
 4 | gene_type: 'Enzyme'
 5 | species: 'H. sapiens'
 6 | seq: "MEHAFTPLEPLLSTGNLKYCLVILNQPLDNYFRHLWNKALLRACADGGANRLYDITEGER\
 7 |       ESFLPEFINGDFDSIRPEVREYYATKGCELISTPDQDHTDFTKCLKMLQKKIEEKDLKVD\
 8 |       VIVTLGGLAGRFDQIMASVNTLFQATHITPFPIIIIQEESLIYLLQPGKHRLHVDTGMEG\
 9 |       DWCGLIPVGQPCMQVTTTGLKWNLTNDVLAFGTLVSTSNTYDGSGVVTVETDHPLLWTMA\
10 |       IKS" 
11 | experiment: 'Complement'
12 | transform: 'VAMP-seq'
13 | authour: 'Weile et al.'
14 | year: 2017
15 | title: 'A framework for exhaustively mapping functional missense variants'
16 | lab: ['Roth', 'Fowler']
17 | doi: '10.15252/msb.20177908'
18 | pmid: '29269382'
19 | url: 'http://msb.embopress.org/content/13/12/957'
20 | input_files:
21 |   - 'urn_mavedb_00000001-d-2_scores.csv' 
22 | source: 'MaveDB'
23 | mavedb_urn: 'urn:mavedb:00000001-d'
24 | qc:
25 |   filter: False
26 |   notes:
27 | 


--------------------------------------------------------------------------------
/data/studies/jiang_2013_hsp90/standardise_jiang_2013_hsp90.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Jiang et al. 2013 (HSP90)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: load the study YAML and the raw workbook, tidy the columns, then derive scores
 7 | meta <- read_yaml('data/studies/jiang_2013_hsp90/jiang_2013_hsp90.yaml')
 8 | dm_data <- read_xlsx('data/studies/jiang_2013_hsp90/raw/journal.pgen.1003600.s014.xlsx', skip = 2) %>% # skip the first 2 rows of the sheet
 9 |   select(-...10) %>% # drop column `...10`
10 |   rename_all(tolower) %>% # lower-case all column names
11 |   rename(mut = `amino acid`,
12 |          sd = `standard deviation`) %>%
13 |   mutate(raw_score = as.numeric(str_remove(average, '<')), # Set all instances of <0.034 to 0.034
14 |          transformed_score = log2(raw_score), # log2 transform of the raw score
15 |          score = normalise_score(transformed_score),
16 |          wt = str_split(meta$seq, '')[[1]][position], # wild-type residue: character of the YAML sequence at index position
17 |          class = get_variant_class(wt, mut))
18 |   
19 | # Save output: pass the table with the YAML's study and transform fields to standardise_study()
20 | standardise_study(dm_data, meta$study, meta$transform)
21 | 
22 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_braf.fa:
--------------------------------------------------------------------------------
 1 | >sp|P15056|BRAF_HUMAN Serine/threonine-protein kinase B-raf OS=Homo sapiens OX=9606 GN=BRAF PE=1 SV=4 MODIFIED: V600E
 2 | MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH
 3 | IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV
 4 | TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS
 5 | LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK
 6 | TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI
 7 | PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR
 8 | DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP
 9 | GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV
10 | AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH
11 | LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATE
12 | KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN
13 | NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS
14 | LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH
15 | 
16 | 


--------------------------------------------------------------------------------
/data/studies/olson_2014_proteing/olson_2014_proteing.yaml:
--------------------------------------------------------------------------------
 1 | study: 'olson_2014_proteing' 
 2 | gene: 'Protein G'
 3 | domain: 'GB1'
 4 | uniprot_id: 'P19909'
 5 | gene_type: 'Immune'
 6 | species: 'Streptococcus'
 7 | seq: "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE"
 8 | experiment: 'Ligand binding'
 9 | transform: 'None (Processed from counts)'
10 | authour: 'Olson et al.'
11 | year: 2014
12 | title: 'A Comprehensive Biophysical Description of Pairwise Epistasis throughout an Entire Protein Domain'
13 | lab: ['Sun']
14 | doi: '10.1016/j.cub.2014.09.072'
15 | pmid: '25455030'
16 | url: 'https://www.sciencedirect.com/science/article/pii/S0960982214012688'
17 | notes: "Uniprot ID is for the Protein G precursor which has a slightly different sequence. \
18 |         The sequence given here is just for the GB1 domain, as given in the paper."
19 | input_files:
20 |   - '1-s2.0-S0960982214012688-mmc2.xlsx'
21 | source: 'SI - Table S2'
22 | qc:
23 |   filter: False
24 |   notes:
25 | 


--------------------------------------------------------------------------------
/data/studies/roscoe_2014_ubi/roscoe_2014_ubi.yaml:
--------------------------------------------------------------------------------
 1 | study: 'roscoe_2014_ubi' 
 2 | gene: 'UBI'
 3 | uniprot_id: 'P0CG63'
 4 | gene_type: 'PTM'
 5 | species: 'S. cerevisiae'
 6 | seq: "MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN\
 7 |       IQKESTLHLVLRLRGG" 
 8 | experiment: 'Reactivity'
 9 | transform: 'None'
10 | authour: 'Roscoe & Bolon'
11 | year: 2014
12 | title: 'Systematic Exploration of Ubiquitin Sequence, E1 Activation Efficiency, and Experimental Fitness in Yeast'
13 | lab: ['Bolon']
14 | doi: '10.1016/j.jmb.2014.05.019'
15 | pmid: '24862281'
16 | url: 'https://www.sciencedirect.com/science/article/pii/S0022283614002587'
17 | notes: "Uniprot ID is for UBI4, took first repeat of sequence from here as very conserved.
18 |         Also have data for excess E1, but for 18 rather than 75 positions, so I only use
19 |         the limiting case"
20 | input_files:
21 |   - '1-s2.0-S0022283614002587-mmc3.xlsx' 
22 | source: 'SI - Table S2'
23 | qc:
24 |   filter: False
25 |   notes:
26 | 


--------------------------------------------------------------------------------
/data/studies/matreyek_2018_tpmt/matreyek_2018_tpmt.yaml:
--------------------------------------------------------------------------------
 1 | study: 'matreyek_2018_tpmt'
 2 | gene: 'TPMT'
 3 | uniprot_id: 'P51580'
 4 | gene_type: 'Methyltransferase' 
 5 | species: 'H. sapiens'
 6 | seq: "MDGTRTSLDIEEYSDTEVQKNQVLTLEEWQDKWVNGKTAFHQEQGHQLLKKHLD\
 7 |       TFLKGKSGLRVFFPLCGKAVEMKWFADRGHSVVGVEISELGIQEFFTEQNLSYS\
 8 |       EEPITEIPGTKVFKSSSGNISLYCCSIFDLPRTNIGKFDMIWDRGALVAINPGD\
 9 |       RKCYADTMFSLLGKKFQYLLCVLSYDPTKHPGPPFYVPHAEIERLFGKICNIRC\
10 |       LEKVDAFEERHKSWGIDCLFEKLYLLTEK"
11 | experiment: 'VAMP-seq'
12 | transform: "VAMP-seq"
13 | authour: 'Matreyek et al.'
14 | year: 2018
15 | title: 'Multiplex assessment of protein variant abundance by massively parallel sequencing'
16 | lab: ['Fowler', 'Shendure']
17 | doi: '10.1038/s41588-018-0122-z'
18 | pmid: '29785012'
19 | url: 'https://www.nature.com/articles/s41588-018-0122-z'
20 | input_files:
21 |   - 'TPMT.csv' 
22 | source: 'Fowler Lab website: https://abundance.gs.washington.edu/shiny/stability/'
23 | qc:
24 |   filter: False
25 |   notes:
26 | 


--------------------------------------------------------------------------------
/data/studies/steinberg_2016_tem1/steinberg_2016_tem1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'steinberg_2016_tem1'
 2 | gene: 'TEM1'
 3 | uniprot_id: 'Q6SJ61'
 4 | gene_type: 'Metabolic'
 5 | species: 'E. coli'
 6 | seq: "MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRP\
 7 |       EERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVREL\
 8 |       CSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTM\
 9 |       PAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS\
10 |       RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW"
11 | experiment: 'Concentration of beta-lactam'
12 | transform: 'log2(x)' 
13 | authour: 'Steinberg & Ostermeier'
14 | year: 2016
15 | title: 'Shifting Fitness and Epistatic Landscapes Reflect Trade-offs along an Evolutionary Pathway'
16 | lab: ['Ostermeier']
17 | doi: '10.1016/j.jmb.2016.04.033'
18 | pmid: '27173379'
19 | url: 'https://www.sciencedirect.com/science/article/pii/S0022283616301450'
20 | input_files:
21 |   - '1-s2.0-S0022283616301450-mmc2.xlsx' 
22 | source: 'SI'
23 | qc:
24 |   filter: False
25 |   notes:
26 | 


--------------------------------------------------------------------------------
/data/studies/firnberg_2014_tem1/firnberg_2014_tem1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'firnberg_2014_tem1' 
 2 | gene: 'TEM1'
 3 | uniprot_id: 'Q6SJ61'
 4 | gene_type: 'Metabolic'
 5 | species: 'E. coli'
 6 | seq: "MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRP\
 7 |       EERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVREL\
 8 |       CSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTM\
 9 |       PAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS\
10 |       RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW" 
11 | experiment: 'Coupled Growth'
12 | transform: 'log2(x)' 
13 | authour: 'Firnberg et al.'
14 | year: 2014
15 | title: 'A Comprehensive, High-Resolution Map of a Gene’s Fitness Landscape'
16 | lab: ['Ostermeier']
17 | doi: '10.1093/molbev/msu081'
18 | pmid: '24567513'
19 | url: 'https://academic.oup.com/mbe/article/31/6/1581/2925654'
20 | input_files:
21 |   - 'firnberg_2014_tem1.xlsx' 
22 | source: 'SI'
23 | qc:
24 |   filter: True 
25 |   notes: "Essentially identical to steinberg_2016_tem1, but with slightly fewer variants"
26 | 


--------------------------------------------------------------------------------
/data/studies/mishra_2016_hsp90/standardise_mishra_2016_hsp90.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Mishra et al. 2016 (HSP90) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata, raw workbook location and the reference sequence as single residues
 7 | meta <- read_yaml('data/studies/mishra_2016_hsp90/mishra_2016_hsp90.yaml')
 8 | path <- 'data/studies/mishra_2016_hsp90/raw/1-s2.0-S2211124716303175-mmc2.xlsx'
 9 | wt_seq <- str_split(meta$seq, '')[[1]]
10 | 
11 | # Read every sheet of the workbook, then stack the per-sheet tables
12 | sheets <- map(excel_sheets(path), read_mishra_sheet, path = path)
13 | dm_data <- bind_rows(sheets) %>%
14 |   select(position, mut = aa, raw_score = avg) %>%
15 |   mutate(wt = wt_seq[position],
16 |          raw_score = na_if(raw_score, -999), # -999 entries are turned into NA
17 |          transformed_score = raw_score,      # No transform applied to the raw score
18 |          score = normalise_score(transformed_score),
19 |          class = get_variant_class(wt, mut)) %>%
20 |   select(position, wt, mut, transformed_score, raw_score, score, class) %>%
21 |   arrange(position, mut) %>%
22 |   drop_na(score) # Some variants were not measured
23 | 
24 | # Save output
25 | standardise_study(dm_data, meta$study, meta$transform)
26 | 


--------------------------------------------------------------------------------
/data/studies/roscoe_2013_ubi/standardise_roscoe_2013_ubi.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Roscoe et al. 2013 (Ubi) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the reference sequence split into single residues
 7 | meta <- read_yaml('data/studies/roscoe_2013_ubi/roscoe_2013_ubi.yaml')
 8 | wt_seq <- str_split(meta$seq, '')[[1]]
 9 | 
10 | # Load the supplementary table (first 4 rows skipped) and standardise the column names
11 | raw_table <- read_xlsx('data/studies/roscoe_2013_ubi/raw/1-s2.0-S0022283613000636-mmc3.xlsx', skip = 4)
12 | raw_table <- rename(raw_table,
13 |                     position = Position,
14 |                     mut = `Amino Acid`,
15 |                     selection_chr = Apparent,
16 |                     sd_chr = `Quantified Synonyms`)
17 | 
18 | # Derive the standard score columns
19 | dm_data <- raw_table %>%
20 |   mutate(raw_score = as.numeric(selection_chr), # Any non-numeric entries become NA
21 |          transformed_score = raw_score,
22 |          score = normalise_score(transformed_score),
23 |          wt = wt_seq[position],
24 |          class = get_variant_class(wt, mut)) %>%
25 |   select(position, wt, mut, transformed_score, raw_score, score, class) %>%
26 |   drop_na(score) # Not all variants were measured successfully
27 | 
28 | # Save output
29 | standardise_study(dm_data, meta$study, meta$transform)
30 | 


--------------------------------------------------------------------------------
/data/studies/kitzman_2015_gal4/standardise_kitzman_2015_gal4.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Kitzman et al. 2015 (GAL4) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and raw workbook location
 7 | meta <- read_yaml('data/studies/kitzman_2015_gal4/kitzman_2015_gal4.yaml')
 8 | path <- 'data/studies/kitzman_2015_gal4/raw/41592_2015_BFnmeth3223_MOESM306_ESM.xlsx'
 9 | 
10 | # Read every sheet, stack them and spread the labelled enrichments into one column per label
11 | sheets <- lapply(excel_sheets(path), read_kitzman_sheet, path = path)
12 | wide <- spread(bind_rows(sheets), key = 'label', value = 'log2_enrichment')
13 | 
14 | # Average over replicates (diff times found to still correlate well)
15 | replicate_mean <- rowMeans(select(wide, SEL_A_24h, SEL_A_40h, SEL_B_40h, SEL_C_40h, SEL_C_64h), na.rm = TRUE)
16 | 
17 | dm_data <- wide %>%
18 |   mutate(raw_score = replace_na(replicate_mean, NA), # Rows with no measurement at all end up as NA
19 |          transformed_score = raw_score,
20 |          score = normalise_score(transformed_score),
21 |          class = get_variant_class(wt, mut)) %>%
22 |   filter(mut != 'delInFrame') %>%
23 |   drop_na(score) # Some variants not measured
24 | 
25 | # Save output
26 | standardise_study(dm_data, meta$study, meta$transform)
27 | 


--------------------------------------------------------------------------------
/data/studies/roscoe_2014_ubi/standardise_roscoe_2014_ubi.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Roscoe & Bolon 2014 (Ubiquitin) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the reference sequence split into single residues
 7 | meta <- read_yaml('data/studies/roscoe_2014_ubi/roscoe_2014_ubi.yaml')
 8 | wt_seq <- str_split(meta$seq, '')[[1]]
 9 | 
10 | # Load the supplementary table (first 3 rows skipped, 'NA' read as missing)
11 | raw_table <- read_xlsx('data/studies/roscoe_2014_ubi/raw/1-s2.0-S0022283614002587-mmc3.xlsx', skip = 3, na = 'NA')
12 | 
13 | # Give the columns standard names; the log2 column is used as the raw score
14 | raw_table <- rename(raw_table,
15 |                     position = Position,
16 |                     mut = `Amino Acid`,
17 |                     raw_score = `log2 (E1react/display)`,
18 |                     rel_e1_reactivity = `Relative E1-reactivity (avg WT=1, avg STOP=0)`,
19 |                     sd_in_symonoymous_codons = `Standard deviation among synonymous codons`,
20 |                     notes = Notes)
21 | 
22 | # Derive the standard score columns
23 | dm_data <- raw_table %>%
24 |   mutate(transformed_score = raw_score,
25 |          score = normalise_score(transformed_score),
26 |          wt = wt_seq[position],
27 |          class = get_variant_class(wt, mut)) %>%
28 |   drop_na(score) # Not all variants were measured successfully
29 | 
30 | # Save output
31 | standardise_study(dm_data, meta$study, meta$transform)
32 | 


--------------------------------------------------------------------------------
/data/studies/sarkisyan_2016_gfp/sarkisyan_2016_gfp.yaml:
--------------------------------------------------------------------------------
 1 | study: 'sarkisyan_2016_gfp' 
 2 | gene: 'GFP'
 3 | uniprot_id: 'P42212'
 4 | gene_type: 'Fluorescent Protein'
 5 | species: 'A. victoria'
 6 | seq: "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL\
 7 |       VTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV\
 8 |       NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD\
 9 |       HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK" 
10 | experiment: 'Fluorescence level'
11 | transform: 'log2(x/F_wt)'
12 | authour: 'Sarkisyan et al.'
13 | year: 2016
14 | title: 'Local fitness landscape of the green fluorescent protein'
15 | lab: ['Kondrashov']
16 | doi: '10.1038/nature17995'
17 | pmid: '27193686'
18 | url: 'https://www.nature.com/articles/nature17995'
19 | notes: "Used a GFP with F64L, which is reflected in the sequence we use"
20 | input_files:
21 |   - 'amino_acid_genotypes_to_brightness.tsv' 
22 | source: 'SI - Figshare (https://figshare.com/articles/Local_fitness_landscape_of_the_green_fluorescent_protein/3102154)'
23 | qc:
24 |   filter: True 
25 |   notes: "Low coverage"
26 | 


--------------------------------------------------------------------------------
/meta/fasta/s_cerevisiae_gal4.fa:
--------------------------------------------------------------------------------
 1 | >sp|P04386|GAL4_YEAST Regulatory protein GAL4 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=GAL4 PE=1 SV=2
 2 | MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKRSPLTRAHLTEVESR
 3 | LERLEQLFLLIFPREDLDMILKMDSLQDIKALLTGLFVQDNVNKDAVTDRLASVETDMPL
 4 | TLRQHRISATSSSEESSNKGQRQLTVSIDSAAHHDNSTIPLDFMPRDALHGFDWSEEDDM
 5 | SDGLPFLKTDPNNNGFFGDGSLLCILRSIGFKPENYTNSNVNRLPTMITDRYTLASRSTT
 6 | SRLLQSYLNNFHPYCPIVHSPTLMMLYNNQIEIASKDQWQILFNCILAIGAWCIEGESTD
 7 | IDVFYYQNAKSHLTSKVFESGSIILVTALHLLSRYTQWRQKTNTSYNFHSFSIRMAISLG
 8 | LNRDLPSSFSDSSILEQRRRIWWSVYSWEIQLSLLYGRSIQLSQNTISFPSSVDDVQRTT
 9 | TGPTIYHGIIETARLLQVFTKIYELDKTVTAEKSPICAKKCLMICNEIEEVSRQAPKFLQ
10 | MDISTTALTNLLKEHPWLSFTRFELKWKQLSLIIYVLRDFFTNFTQKKSQLEQDQNDHQS
11 | YEVKRCSIMLSDAAQRTVMSVSSYMDNHNVTPYFAWNCSYYLFNAVLVPIKTLLSNSKSN
12 | AENNETAQLLQQINTVLMLLKKLATFKIQTCEKYIQVLEEVCAPFLLSQCAIPLPHISYN
13 | NSNGSAIKNIVGSATIAQYPTLPEENVNNISVKYVSPGSVGPSPVPLKSGASFSDLVKLL
14 | SNRPPSRNSPVTIPRSTPSHRSVTPFLGQQQQLQSLVPLTPSALFGGANFNQSGNIADSS
15 | LSFTFTNSSNGPNLITTQTNSQALSQPIASSNVHDNFMNNEITASKIDDGNNSKPLSPGW
16 | TDQTAYNAFGITTGMFNTTTMDDVYNYLFDDEDTPPNPKKE
17 | 
18 | 


--------------------------------------------------------------------------------
/data/studies/findlay_2018_brca1/standardise_findlay_2018_brca1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Findlay et al. 2018 (BRCA1) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the raw supplementary table (first 2 rows skipped, 'NA' read as missing)
 7 | meta <- read_yaml('data/studies/findlay_2018_brca1/findlay_2018_brca1.yaml')
 8 | raw_table <- read_xlsx('data/studies/findlay_2018_brca1/raw/41586_2018_461_MOESM3_ESM.xlsx', skip = 2, na = 'NA')
 9 | 
10 | # Collapse the nucleotide-level rows to one mean function score per amino acid change
11 | per_variant <- raw_table %>%
12 |   rename_all(list( ~ gsub('[\\/ \\(\\)]+', '_', .))) %>% # Runs of slashes, spaces and brackets in headers become '_'
13 |   rename(wt_nuc = reference,
14 |          mut_nuc = alt,
15 |          wt = aa_ref,
16 |          mut = aa_alt,
17 |          position = aa_pos) %>%
18 |   drop_na(position) %>% # Rows without an amino acid position are discarded
19 |   group_by(position, wt, mut) %>%
20 |   summarise(raw_score = mean(function.score.mean, na.rm=TRUE)) %>%
21 |   ungroup()
22 | 
23 | # Derive the standard score columns
24 | dm_data <- per_variant %>%
25 |   mutate(transformed_score = raw_score,
26 |          score = normalise_score(transformed_score),
27 |          class = get_variant_class(wt, mut)) %>%
28 |   select(position, wt, mut, score, transformed_score, raw_score, class)
29 | 
30 | # Save output
31 | standardise_study(dm_data, meta$study, meta$transform)
32 | 


--------------------------------------------------------------------------------
/data/studies/hartman_2018_cp/hartman_2018_cp.yaml:
--------------------------------------------------------------------------------
 1 | study: 'hartman_2018_cp' 
 2 | gene: 'CP'
 3 | uniprot_id: 'P03612'
 4 | gene_type: 'Viral Coat'
 5 | species: 'Bacteriophage MS2'
 6 | seq: "MASNFTQFVLVDNGGTGDVTVAPSNFANGVAEWISSNSRSQAYKVTCSVRQSSAQNRKYT\
 7 |       IKVEVPKVATQTVGGVELPVAAWRSYLNMELTIPIFATNSDCELIVKAMQGLLKDGNPIP\
 8 |       SAIAANSGIY"
 9 | experiment: 'Viral coat assembly'
10 | transform: '(x - mean(x_wt))/log10(2)' 
11 | authour: 'Hartman et al.'
12 | year: 2018
13 | title: 'Quantitative characterization of all single amino acid variants of a viral capsid-based drug delivery vehicle'
14 | lab: ['Tullman-Ercek']
15 | doi: '10.1038/s41467-018-03783-y'
16 | pmid: '29643335'
17 | url: 'https://www.nature.com/articles/s41467-018-03783-y'
18 | notes: "In their method they arbitrarily set variants with 0 reads to a certain score, \
19 |         but this is much lower than the rest of the scale and skews everything. Here \
20 |         we set it (again rather arbitrarily, but less so than them) to just below the \
21 |         lowest measured score (-2.5)"
22 | input_files:
23 |   - '41467_2018_3783_MOESM4_ESM.xlsx' 
24 | source: 'SI'
25 | qc:
26 |   filter: False
27 |   notes:
28 | 


--------------------------------------------------------------------------------
/data/studies/brenan_2016_mapk1/standardise_brenan_2016_mapk1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Brenan et al. 2016 (MAPK1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/brenan_2016_mapk1/brenan_2016_mapk1.yaml')
 8 | dm_data <- read_xlsx('data/studies/brenan_2016_mapk1/raw/1-s2.0-S2211124716313171-mmc2.xlsx', sheet = 'Supplemental_Table_1') %>%
 9 |   rename_all(list( ~ gsub(' ', '_', tolower(.)))) %>% # Lower-case the headers and replace spaces with underscores
10 |   rename(wt = wt_aa, mut = mutant_aa, position = erk2_residue) %>%
11 |   mutate(raw_score = `lfc_(etp_vs._dox)`, # Only general condition, other two are for specific drugs
12 |          transformed_score = -raw_score,  # The selection scheme they used favoured lof > wt > gof
13 |          score = normalise_score(transformed_score),
14 |          class = get_variant_class(wt, mut)) %>%
15 |   # Keep only the standard columns. The remaining rank/allele columns are dropped here,
16 |   # so they are deliberately not type-converted beforehand
17 |   select(position, wt, mut, score, transformed_score, raw_score, class)
18 | 
19 | # Save output
20 | standardise_study(dm_data, meta$study, meta$transform)
21 | 


--------------------------------------------------------------------------------
/data/studies/wrenbeck_2017_amie/wrenbeck_2017_amie.yaml:
--------------------------------------------------------------------------------
 1 | study: 'wrenbeck_2017_amie' 
 2 | gene: 'amiE'
 3 | uniprot_id: 'P11436'
 4 | gene_type: 'Metabolic'
 5 | species: 'P. aeruginosa'
 6 | seq: "MRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEY\
 7 |       SLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLV\
 8 |       LIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAM\
 9 |       KGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDG\
10 |       RTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAEC\
11 |       PFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA" 
12 | experiment: 'Coupled growth'
13 | transform: 'None'
14 | authour: 'Wrenbeck et al.'
15 | year: 2017
16 | title: 'Single-mutation fitness landscapes for an enzyme on multiple substrates reveal specificity is globally encoded'
17 | lab: ['Whitehead']
18 | doi: '10.1038/ncomms15695'
19 | pmid: '28585537'
20 | url: 'https://www.nature.com/articles/ncomms15695'
21 | input_files:
22 |   - 'amiESelectionFitnessData_Acetamide.txt'
23 |   - 'amiESelectionFitnessData_Isobutyramide.txt'
24 |   - 'amiESelectionFitnessData_Propionamide.txt'
25 | source: 'SI'
26 | qc:
27 |   filter: False
28 |   notes:
29 | 


--------------------------------------------------------------------------------
/data/studies/matreyek_2018_pten/matreyek_2018_pten.yaml:
--------------------------------------------------------------------------------
 1 | study: 'matreyek_2018_pten'
 2 | gene: 'PTEN'
 3 | uniprot_id: 'P60484'
 4 | gene_type: 'Phosphatase' 
 5 | species: 'H. sapiens'
 6 | seq: "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVV\
 7 |       RFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDL\
 8 |       DQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRD\
 9 |       KKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFV\
10 |       VCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKM\
11 |       FHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKND\
12 |       LDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYR\
13 |       YSDTTDSDPENEPFDEDQHTQITKV"
14 | experiment: 'VAMP-seq'
15 | transform: "VAMP-seq"
16 | authour: 'Matreyek et al.'
17 | year: 2018
18 | title: 'Multiplex assessment of protein variant abundance by massively parallel sequencing'
19 | lab: ['Fowler', 'Shendure']
20 | doi: '10.1038/s41588-018-0122-z'
21 | pmid: '29785012'
22 | url: 'https://www.nature.com/articles/s41588-018-0122-z'
23 | input_files:
24 |   - 'PTEN.csv' 
25 | source: 'Fowler Lab site: https://abundance.gs.washington.edu/shiny/stability/'
26 | qc:
27 |   filter: False
28 |   notes:
29 | 


--------------------------------------------------------------------------------
/data/studies/jones_2019_adrb2/jones_2019_adrb2.yaml:
--------------------------------------------------------------------------------
 1 | study: 'jones_2019_adrb2'
 2 | gene: 'ADRB2'
 3 | uniprot_id: 'P07550'
 4 | gene_type: 'GPCR'
 5 | species: 'H. sapiens'
 6 | seq: "MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLV\
 7 |       ITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEF\
 8 |       WTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSG\
 9 |       LTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMV\
10 |       FVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHK\
11 |       ALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPL\
12 |       IYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLC\
13 |       EDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL" 
14 | experiment: 'Expression of reporter gene'
15 | transform: "log2(x/mean(x_{subs with blosum62 >= 2}))"
16 | authour: 'Jones et al.'
17 | year: 2019
18 | title: 'Structural and Functional Characterization of G Protein-Coupled Receptors with Deep Mutational Scanning'
19 | lab: ['Kosuri', 'Dror', 'Babu']
20 | doi: '10.1101/623108'
21 | pmid: 
22 | url: 'https://www.biorxiv.org/content/10.1101/623108v2.full'
23 | input_files:
24 |   - 'lib-med.csv' 
25 | source: 'SI'
26 | qc:
27 |   filter: False
28 |   notes:
29 | 


--------------------------------------------------------------------------------
/meta/residue_hydrophobicity.tsv:
--------------------------------------------------------------------------------
 1 | # Various protein hydrophobicity scales as found in Bandyopadhyay & Mehler 2008 (https://onlinelibrary.wiley.com/doi/full/10.1002/prot.21958)
 2 | AA	TW	Faupl24	Abodr25	Rose34	Ponnu8	Mijer35	KyteDo36	White4	Eisen2
 3 | C	1.15	1.54	NA	0.91	14.93	7.93	2.5	-0.02	0.38
 4 | I	0.97	1.8	9.3	0.88	14.77	8.83	4.5	-1.12	1.90
 5 | L	0.87	1.7	10.0	0.85	14.10	8.47	3.8	-1.25	1.90
 6 | F	0.85	1.79	9.6	0.88	13.43	9.03	2.8	-1.71	2.30
 7 | V	0.83	1.22	8.5	0.86	15.07	7.73	4.2	-0.46	1.50
 8 | W	0.67	2.25	9.2	0.85	12.95	7.66	-0.9	-2.09	2.60
 9 | Y	0.60	0.96	8.0	0.76	13.29	5.89	-1.3	-0.71	1.60
10 | M	0.54	1.23	8.7	0.85	14.33	8.95	1.9	-0.67	2.40
11 | A	0.33	0.31	5.1	0.74	12.28	5.33	1.8	0.50	0.67
12 | P	0.32	0.72	4.9	0.64	11.19	3.87	-1.6	0.14	1.20
13 | H	0.25	0.13	1.6	0.78	12.84	5.1	-3.2	2.33	0.64
14 | T	0.21	0.26	3.5	0.70	11.65	4.49	-0.7	0.25	0.52
15 | S	0.05	-0.04	3.1	0.66	11.26	4.09	-0.8	0.46	0.01
16 | R	-0.01	-1.01	2.0	0.64	11.49	4.18	-4.5	1.81	-2.10
17 | Q	-0.05	-0.22	1.4	0.62	11.28	3.87	-3.5	0.77	-0.22
18 | N	-0.07	-0.6	0.6	0.63	11.00	3.71	-3.5	0.85	-0.6
19 | D	-0.22	-0.77	0.7	0.62	10.97	3.59	-3.5	3.64	-1.2
20 | E	-0.24	-0.64	1.8	0.62	11.19	3.65	-3.5	3.63	-0.76
21 | K	-0.40	-0.99	1.3	0.52	10.8	2.95	-3.9	2.8	-0.57
22 | 


--------------------------------------------------------------------------------
/bin/figures/figureS5.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Produce figure S5 (Proportion of variance)
 3 | source('src/config.R')
 4 | 
 5 | dms <- read_tsv('data/combined_mutational_scans.tsv')
 6 | 
 7 | # PCA over the columns A to Y
 8 | pca <- tibble_pca(dms, A:Y)
 9 | 
10 | # One row per principal component plus a leading PC 0 with zero variance, so the
11 | # cumulative curve has a point at 0. The number of components is taken from the
12 | # PCA result rather than hard-coded, so a different column set cannot break tibble()
13 | pca_summary <- tibble(pc = 0:length(pca$sdev), sd = c(0, pca$sdev)) %>%
14 |   mutate(prop_var = sd^2/sum(sd^2),
15 |          cum_var = cumsum(prop_var))
16 | 
17 | # Bars show the per-component proportion of variance, the line shows the running total
18 | figure <- ggplot(pca_summary, aes(x = pc)) +
19 |   geom_col(aes(y = prop_var, fill = 'Explained\nVariance')) +
20 |   geom_line(aes(y = cum_var, colour = 'Cumulative\nExplained\nVariance')) +
21 |   scale_fill_manual(values = c(`Explained\nVariance`='cornflowerblue'), name = '') +
22 |   scale_colour_manual(values = c(`Cumulative\nExplained\nVariance`='red'), name = '') +
23 |   labs(x = 'Principal Component', y = 'Proportion of Variance')
24 | 
25 | # Save the same figure in each format; EPS needs the cairo device, so it is handled separately
26 | for (ext in c('pdf', 'png', 'tiff')) {
27 |   ggsave(paste0('figures/4_figures/figureS5.', ext), figure, width = 183, height = 100, units = 'mm')
28 | }
29 | ggsave('figures/4_figures/figureS5.eps', figure, width = 183, height = 100, units = 'mm', device=cairo_ps, fallback_resolution = 600)
30 | 


--------------------------------------------------------------------------------
/data/studies/kelsic_2016_infa/standardise_kelsic_2016_infa.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Kelsic et al. 2016 (infA) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the reference sequence split into single residues
 7 | meta <- read_yaml('data/studies/kelsic_2016_infa/kelsic_2016_infa.yaml')
 8 | wt_seq <- str_split(meta$seq, '')[[1]]
 9 | 
10 | # Column names used in place of the file's own header row (which is skipped)
11 | infa_cols <- c('codon', 'mut', 'position', 'is_wt', 'raw_score',
12 |                'sd', 'fitness_rich',
13 |                'fitness_stdev_rich', 'RCS',
14 |                'mfe_ddG_43nt_sliding', 'tmp')
15 | 
16 | # Codon-level table: keep the columns codon to sd and transform the raw score
17 | codon_data <- read_csv('data/studies/kelsic_2016_infa/raw/cels_206_mmc5.csv', skip = 1, col_names = infa_cols) %>%
18 |   select(codon:sd) %>%
19 |   mutate(transformed_score = transform_vamp_seq(raw_score))
20 | 
21 | # Average the codons encoding the same amino acid at each position, then score
22 | dm_data <- codon_data %>%
23 |   group_by(position, mut) %>%
24 |   summarise_at(vars(transformed_score, raw_score), mean, na.rm=TRUE) %>%
25 |   ungroup() %>%
26 |   mutate(score = normalise_score(transformed_score),
27 |          wt = wt_seq[position],
28 |          class = get_variant_class(wt, mut)) %>%
29 |   drop_na(wt, position) # Codons were also measured for the stop codon, which we don't want
30 | 
31 | # Save output
32 | standardise_study(dm_data, meta$study, meta$transform)
33 | 


--------------------------------------------------------------------------------
/data/studies/hartman_2018_cp/standardise_hartman_2018_cp.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Hartman et al. 2018 (CP) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the reference sequence split into single residues
 7 | meta <- read_yaml('data/studies/hartman_2018_cp/hartman_2018_cp.yaml')
 8 | wt_seq <- str_split(meta$seq, '')[[1]]
 9 | 
10 | # Load the score matrix ('Not Observed' read as missing) and reshape to one row per variant
11 | long_scores <- read_xlsx('data/studies/hartman_2018_cp/raw/41467_2018_3783_MOESM4_ESM.xlsx', skip = 1, na = 'Not Observed') %>%
12 |   rename(position = `Residue #`) %>%
13 |   pivot_longer(-position, names_to = 'mut', values_to = 'raw_score') %>%
14 |   drop_na(raw_score) # Not all variants measured
15 | 
16 | # Each step below depends on the one before it, so they are applied strictly in this order
17 | dm_data <- long_scores %>%
18 |   mutate(position = position + 1) %>% # NOTE(review): presumably the sheet numbers residues from the one after Met1 - confirm
19 |   mutate(wt = wt_seq[position],
20 |          class = get_variant_class(wt, mut)) %>%
21 |   mutate(raw_score = ifelse(raw_score == -4, -2.5, raw_score)) %>% # Set nulls to a value closer to rest of scale (they already set this arbitrarily)
22 |   mutate(transformed_score = raw_score/log10(2)) %>% # Transform base
23 |   mutate(transformed_score = transformed_score - mean(transformed_score[class == 'Synonymous'], na.rm=TRUE)) %>% # 'divide' (in log domain) by WT scores
24 |   mutate(score = normalise_score(transformed_score))
25 | 
26 | # Save output
27 | standardise_study(dm_data, meta$study, meta$transform)
28 | 


--------------------------------------------------------------------------------
/data/studies/starita_2015_brca1/standardise_starita_2015_brca1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Starita et al. 2015 (BRCA1) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the reference sequence split into single residues.
 7 | # Ref seq given by study has a mysterious, undocumented R at pos 175 where normal refs have K;
 8 | # K is used here since the change is not explained in the paper and appears erroneous
 9 | meta <- read_yaml('data/studies/starita_2015_brca1/starita_2015_brca1.yaml')
10 | wt_seq <- str_split(meta$seq, '')[[1]]
11 | 
12 | # Load the raw table ('NA' read as missing) with lower-cased headers
13 | raw_table <- read_xls('data/studies/starita_2015_brca1/raw/genetics.115.175802-6.xls', na = 'NA') %>%
14 |   rename_all(tolower) %>%
15 |   rename(position = pos)
16 | 
17 | # Annotate variants and discard the rows whose variant id is 'NA-NA' before scoring
18 | annotated <- raw_table %>%
19 |   mutate(wt = wt_seq[position],
20 |          class = get_variant_class(wt, mut)) %>%
21 |   filter(variant_id != 'NA-NA')
22 | 
23 | # Use E3 score - this is more general function based and empirically it seems to find
24 | # most of the same negative effects as the BARD1 binding assay
25 | dm_data <- annotated %>%
26 |   mutate(raw_score = e3_score,
27 |          transformed_score = transform_vamp_seq(raw_score),
28 |          score = normalise_score(transformed_score)) %>%
29 |   drop_na(score) # Not all measured
30 | 
31 | # Save output
32 | standardise_study(dm_data, meta$study, meta$transform)
33 | 


--------------------------------------------------------------------------------
/data/studies/bolognesi_2019_tdp43/bolognesi_2019_tdp43.yaml:
--------------------------------------------------------------------------------
 1 | study: 'bolognesi_2019_tdp43' 
 2 | gene: 'TDP43'
 3 | domain: 'PRD'
 4 | uniprot_id: 'Q13148'
 5 | gene_type: 'DNA Binding'
 6 | species: 'H. sapiens'
 7 | seq: "MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNPVSQCMRGVRLVEGI\
 8 |       LHAPDAGWGNLVYVVNYPKDNKRKMDETDASSAVKVKRAVQKTSDLIVLGLPWKTTEQDL\
 9 |       KEYFSTFGEVLMVQVKKDLKTGHSKGFGFVRFTEYETQVKVMSQRHMIDGRWCDCKLPNS\
10 |       KQSQDEPLRSRKVFVGRCTEDMTEDELREFFSQYGDVMDVFIPKPFRAFAFVTFADDQIA\
11 |       QSLCGEDLIIKGISVHISNAEPKHNSNRQLERSGRFGGNPGGFGNQGGFGNSRGGGAGLG\
12 |       NNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWGMMGMLASQQNQSGPSGNNQNQGNMQ\
13 |       REPNQAFGSGNNSYSGSNSGAAIGWGSASNAGSGSGFNGGFGSSMDSKSSGWGM"
14 | experiment: 'Growth'
15 | transform: '-x/ln(2)'
16 | authour: 'Bolognesi et al.'
17 | year: 2019
18 | title: 'The mutational landscape of a prion-like domain'
19 | lab: ['Lehner']
20 | doi: '10.1038/s41467-019-12101-z'
21 | pmid: '31519910'
22 | url: 'http://www.nature.com/articles/s41467-019-12101-z'
23 | notes: 'Score was in base e rather than 2 and inverted to make toxic positive'
24 | input_files:
25 |   - '41467_2019_12101_MOESM7_ESM.xlsx' 
26 | source: 'SI'
27 | qc:
28 |   filter: True
29 |   notes: 'Measure toxicity of a prion like protein - not really related to normal function'
30 | 


--------------------------------------------------------------------------------
/data/studies/olson_2014_proteing/standardise_olson_2014_proteing.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Olson et al. 2014 (Protein G) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and raw workbook location
 7 | meta <- read_yaml('data/studies/olson_2014_proteing/olson_2014_proteing.yaml')
 8 | xlsx_path <- 'data/studies/olson_2014_proteing/raw/1-s2.0-S0960982214012688-mmc2.xlsx'
 9 | 
10 | # Wild-type counts are read from cells U3:V4; their ratio is the wild-type enrichment
11 | wt_counts <- read_xlsx(xlsx_path, range = "U3:V4") %>%
12 |   rename(input_count = `Input Count`,
13 |          selection_count = `Selection Count`)
14 | E_wt <- wt_counts$selection_count/wt_counts$input_count
15 | 
16 | # Variant counts are read from row 3 downwards, columns 14 to 18
17 | variant_counts <- read_xlsx(xlsx_path, range = cell_limits(ul = c(3, 14), lr = c(NA, 18))) %>%
18 |   rename(wt = `WT amino acid`,
19 |          position = `Position`,
20 |          mut = `Mutation`,
21 |          input_count = `Input Count`,
22 |          selection_count = `Selection Count`)
23 | 
24 | # Enrichment relative to wild type; the smallest non-zero selection count is added to
25 | # every selection count as a pseudocount before the ratio is taken
26 | dm_data <- variant_counts %>%
27 |   mutate(raw_score = ((selection_count + min(selection_count[selection_count > 0], na.rm = TRUE))/input_count)/E_wt,
28 |          transformed_score = log2(raw_score),
29 |          score = normalise_score(transformed_score),
30 |          class = get_variant_class(wt, mut))
31 | 
32 | # Save output
33 | standardise_study(dm_data, meta$study, meta$transform)
34 | 


--------------------------------------------------------------------------------
/data/studies/heredia_2018_cxcr4/heredia_2018_cxcr4.yaml:
--------------------------------------------------------------------------------
 1 | study: 'heredia_2018_cxcr4' 
 2 | gene: 'CXCR4'
 3 | uniprot_id: 'P61073'
 4 | gene_type: 'GPCR'
 5 | species: 'H. sapiens'
 6 | seq: "MEGISIYTSDNYTEEMGSGDYDSMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVI\
 7 |       LVMGYQKKLRSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNL\
 8 |       YSSVLILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEA\
 9 |       DDRYICDRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSKGHQKRKALKT\
10 |       TVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPI\
11 |       LYAFLGAKFKTSAQHALTSVSRGSSLKILSKGKRGGHSSVSTESESSSFHSS" 
12 | experiment: 'Surface expression and HIV-1 blocking antibody 2D7 affinity via FACS'
13 | transform: 'None'
14 | authour: 'Heredia et al.'
15 | year: 2018
16 | title: 'Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning'
17 | lab: ['Procko']
18 | doi: '10.4049/jimmunol.1800343'
19 | pmid: '29678950'
20 | url: 'https://www.jimmunol.org/content/200/11/3825'
21 | notes: "Use mean of antibody binding experiment scores as method also incorporated \
22 |         surface expression."
23 | input_files:
24 |   - 'GSE100368_enrichment_ratios_CXCR4.xlsx' 
25 | source: 'NCBI GEO: GSE100368 - https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100368'
26 | qc:
27 |   filter: False
28 |   notes:
29 | 


--------------------------------------------------------------------------------
/data/studies/doud_2015_np/doud_2015_np.yaml:
--------------------------------------------------------------------------------
 1 | study: 'doud_2015_np'
 2 | gene: 'NP'
 3 | uniprot_id: 'I6TAH8'
 4 | gene_type: 'viral'
 5 | species: 'Influenza'
 6 | strain: 'Human adapted strain A/Aichi/2/1968, H3N2' 
 7 | seq: "MASQGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTELKLSDYEGRLIQNSLTIERMVLS\
 8 |       AFDERRNKYLEEHPSAGKDPKKTGGPIYKRVDRKWMRELVLYDKEEIRRIWRQANNGDDATAGLTHMMI\
 9 |       WHSNLNDTTYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTMVMELIRMIKRGINDRNFW\
10 |       RGENGRKTRSAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCL\
11 |       PACVYGPAVASGYDFEKEGYSLVGIDPFKLLQNSQVYSLIRPNENPAHKSQLVWMACNSAAFEDLRVLS\
12 |       FIRGTKVSPRGKLSTRGVQIASNENMDAMESSTLELRSRYWAIRTRSGGNTNQQRASAGQISVQPAFSV\
13 |       QRNLPFDKPTIMAAFTGNTEGRTSDMRAEIIRMMEGAKPEEMSFQGRGVFELSDERAANPIVPSFDMSN\
14 |       EGSYFFGDNAEEYDN" 
15 | experiment: 'Growth'
16 | transform: 'log2(x_mut_i/x_wt_i)' 
17 | authour: 'Doud et al.'
18 | year: 2015
19 | title: 'Site-Specific Amino Acid Preferences Are Mostly Conserved in Two Closely Related Protein Homologs'
20 | lab: ['Bloom']
21 | doi: '10.1093/molbev/msv167'
22 | pmid: '26226986'
23 | url: 'https://academic.oup.com/mbe/article/32/11/2944/982113'
24 | input_files:
25 |   - "Supp_file_2_mean_aichi1968_prefs.txt"
26 | source: 'SI - Supplementary data ZIP file'
27 | qc:
28 |   filter: False
29 |   notes:
30 | 


--------------------------------------------------------------------------------
/data/studies/findlay_2014_dbr1/findlay_2014_dbr1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'findlay_2014_dbr1' 
 2 | gene: 'DBR1'
 3 | domain: 'Exon 2'
 4 | uniprot_id: 'Q9UK59'
 5 | gene_type: 'Lariat Debranching'
 6 | species: 'H. sapiens'
 7 | seq: "MRVAVAGCCHGELDKIYETLALAERRGPGPVDLLLCCGDFQAVRNEADLRCMAVPPKYRH\
 8 |       MQTFYRYYSGEKKAPVLTLFIGGNHEASNHLQELPYGGWVAPNIYYLGLAGVVKYRGVRI\
 9 |       GGISGIFKSHDYRKGHFECPPYNSSTIRSIYHVRNIEVYKLKQLKQPIDIFLSHDWPRSI\
10 |       YHYGNKKQLLKTKSFFRQEVENNTLGSPAASELLEHLKPTYWFSAHLHVKFAALMQHQAK\
11 |       DKGQTARATKFLALDKCLPHRDFLQILEIEHDPSAPDYLEYDIEWLTILRATDDLINVTG\
12 |       RLWNMPENNGLHARWDYSATEEGMKEVLEKLNHDLKVPCNFSVTAACYDPSKPQTQMQLI\
13 |       HRINPQTTEFCAQLGIIDINVRLQKSKEEHHVCGEYEEQDDVESNDSGEDQSEYNTDTSA\
14 |       LSSINPDEIMLDEEEDEDSIVSAHSGMNTPSVEPSDQASEFSASFSDVRILPGSMIVSSD\
15 |       DTVDSTIDREGKPGGTVESGNGEDLTKVPLKRLSDEHEPEQRKKIKRRNQAIYAAVDDDD\
16 |       DDAA" 
17 | experiment: 'Growth'
18 | transform: 'None'
19 | authour: 'Findlay et al.'
20 | year: 2014
21 | title: 'Saturation editing of genomic regions by multiplex homology-directed repair'
22 | lab: ['Shendure']
23 | doi: '10.1038/nature13695'
24 | pmid: '25141179'
25 | url: 'https://www.nature.com/articles/nature13695'
26 | input_files:
27 |   - '41586_2014_BFnature13695_MOESM383_ESM.xlsx' 
28 | source: 'SI - Table S4'
29 | qc:
30 |   filter: False
31 |   notes:
32 | 


--------------------------------------------------------------------------------
/data/studies/doud_2015_np/standardise_doud_2015_np.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise the Doud et al. 2015 (NP) deep mutational scan
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Study metadata and the reference sequence split into single residues
 7 | meta <- read_yaml('data/studies/doud_2015_np/doud_2015_np.yaml')
 8 | wt_seq <- str_split(meta$seq, '')[[1]]
 9 | 
10 | # One preference column per amino acid, in the order they appear in the file
11 | amino_acids <- c('A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
12 |                  'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y')
13 | 
14 | # Load the preference table, replacing the file's header row with explicit names
15 | prefs <- read_table2('data/studies/doud_2015_np/raw/Supp_file_2_mean_aichi1968_prefs.txt', skip = 1,
16 |                      col_names = c('position', 'wt', 'entropy', amino_acids),
17 |                      col_types = cols(position=col_integer(), wt=col_character(), .default = col_double()))
18 | 
19 | dm_data <- prefs %>%
20 |   pivot_longer(A:Y, names_to = 'mut', values_to = 'raw_score') %>%
21 |   mutate(wt = wt_seq[position]) %>% # Same as the file in all but 334, where the data has '?', so replace with known WTs
22 |   group_by(position) %>%
23 |   mutate(transformed_score = log2(raw_score / raw_score[which(mut == first(wt))])) %>% # Normalise by the WT at that position
24 |   ungroup() %>%
25 |   mutate(score = normalise_score(transformed_score),
26 |          class = get_variant_class(wt, mut))
27 | 
28 | # Save output
29 | standardise_study(dm_data, meta$study, meta$transform)
30 | 


--------------------------------------------------------------------------------
/data/studies/melamed_2013_pab1/melamed_2013_pab1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'melamed_2013_pab1' 
 2 | gene: 'PAB1'
 3 | domain: 'RRM'
 4 | uniprot_id: 'P04147'
 5 | gene_type: 'RNA Binding'
 6 | species: 'S. cerevisiae'
 7 | seq: "MADITDKTAEQLENLNIQDDQKQAATGSESQSVENSSASLYVGDLEPSVSEAHL\
 8 |       YDIFSPIGSVSSIRVCRDAITKTSLGYAYVNFNDHEAGRKAIEQLNYTPIKGRL\
 9 |       CRIMWSQRDPSLRKKGSGNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDEN\
10 |       GKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAPHLSRKERDSQLEETKA\
11 |       HYTNLYVKNINSETTDEQFQELFAKFGPIVSASLEKDADGKLKGFGFVNYEKHE\
12 |       DAVKAVEALNDSELNGEKLYVGRAQKKNERMHVLKKQYEAYRLEKMAKYQGVNL\
13 |       FVKNLDDSVDDEKLEEEFAPYGTITSAKVMRTENGKSKGFGFVCFSTPEEATKA\
14 |       ITEKNQQIVAGKPLYVAIAQRKDVRRSQLAQQIQARNQMRYQQATAAAAAAAAG\
15 |       MPGQFMPPMFYGVMPPRGVPFNGPNPQQMNPMGGMPKNGMPPQFRNGPVYGVPP\
16 |       QGGFPRNANDNNQFYQQKQRQALGEQLYKKVSAKTSNEEAAGKITGMILDLPPQ\
17 |       EVFPLLESDELFEQHYKEASAAYESFKKEQEQQTEQA" 
18 | experiment: 'Growth'
19 | transform: 'None'
20 | authour: 'Melamed et al.'
21 | year: 2013
22 | title: 'Deep mutational scanning of an RRM domain of the Saccharomyces cerevisiae poly(A)-binding protein'
23 | lab: ['Fields']
24 | doi: '10.1261/rna.040709.113'
25 | pmid: '24064791'
26 | url: 'https://rnajournal.cshlp.org/content/19/11/1537'
27 | input_files:
28 |   - 'Supplementary_Table_2.xlsx' 
29 | source: 'SI'
30 | qc:
31 |   filter: False
32 |   notes:
33 | 


--------------------------------------------------------------------------------
/data/studies/brenan_2016_mapk1/brenan_2016_mapk1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'brenan_2016_mapk1' 
 2 | gene: 'MAPK1'
 3 | uniprot_id: 'P28482'
 4 | gene_type: 'Kinase'
 5 | species: 'H. sapiens'
 6 | seq: "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFE\
 7 |       HQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQH\
 8 |       LSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDH\
 9 |       TGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHI\
10 |       LGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHK\
11 |       RIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS" 
12 | experiment: 'Growth'
13 | transform: '-x'
14 | authour: 'Brenan et al.'
15 | year: 2016
16 | title: 'Phenotypic Characterization of a Comprehensive Set of MAPK1/ERK2 Missense Mutants'
17 | lab: ['Johannessen']
18 | doi: '10.1016/j.celrep.2016.09.061'
19 | pmid: '27760319'
20 | url: 'https://www.sciencedirect.com/science/article/pii/S2211124716313171'
21 | notes: 'Transform by -1 because the selection scheme they used favoured lof > wt > gof.'
22 | input_files:
23 |   - '1-s2.0-S2211124716313171-mmc2.xlsx' 
24 | source: 'SI - Table S1'
25 | qc:
26 |   filter: True 
27 |   notes: "Experimental setup leads to many substitutions with positive ER values, which \
28 |           are thought to be GOF variants. However, in real organisms many GOF changes \
29 |           are still deleterious"
30 | 


--------------------------------------------------------------------------------
/data/studies/lee_2018_ha/standardise_lee_2018_ha.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Lee et al. 2018 (HA)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/lee_2018_ha/lee_2018_ha.yaml')
 8 | 
 9 | # Extract overall protein position - is split into signal peptide and HA1/HA2 chains in original file
10 | get_position <- function(x){
11 |   if (grepl('\\-', x)){
12 |     return(as.numeric(x) + 17)
13 |   } else if (grepl('HA2', x)) {
14 |     x <- gsub('\\(HA2\\)', '', x)
15 |     return(as.numeric(x) + 345)
16 |   } else {
17 |     return(as.numeric(x) + 16)
18 |   }
19 | }
20 | 
21 | dm_data <- read_xlsx('data/studies/lee_2018_ha/raw/pnas.1806133115.sd03.xlsx', sheet = 'avg_prefs') %>%
22 |   rename(position = site) %>%
23 |   gather(key = 'mut', value = 'raw_score', -position, -entropy, -neffective) %>%
24 |   mutate(position = sapply(position, get_position),
25 |          wt = str_split(meta$seq, '')[[1]][position],
26 |          class = get_variant_class(wt, mut)) %>%
27 |   arrange(position, mut) %>%
28 |   group_by(position) %>%
29 |   mutate(transformed_score = log2(raw_score / raw_score[which(mut == first(wt))])) %>%
30 |   ungroup() %>%
31 |   mutate(score = normalise_score(transformed_score))
32 | 
33 | # Save output
34 | standardise_study(dm_data, meta$study, meta$transform)
35 | 


--------------------------------------------------------------------------------
/data/studies/sun_2018_cbs/sun_2018_cbs.yaml:
--------------------------------------------------------------------------------
 1 | study: 'sun_2018_cbs' 
 2 | gene: 'CBS'
 3 | uniprot_id: 'P35520'
 4 | gene_type: 'Metabolic'
 5 | species: 'H. sapiens'
 6 | seq: "MPSETPQAEVGPTGCPHRSGPHSAKGSLEKGSPEDKEAKEPLWIRPDAPSRCTWQLGRPA\
 7 |       SESPHHHTAPAKSPKILPDILKKIGDTPMVRINKIGKKFGLKCELLAKCEFFNAGGSVKD\
 8 |       RISLRMIEDAERDGTLKPGDTIIEPTSGNTGIGLALAAAVRGYRCIIVMPEKMSSEKVDV\
 9 |       LRALGAEIVRTPTNARFDSPESHVGVAWRLKNEIPNSHILDQYRNASNPLAHYDTTADEI\
10 |       LQQCDGKLDMLVASVGTGGTITGIARKLKEKCPGCRIIGVDPEGSILAEPEELNQTEQTT\
11 |       YEVEGIGYDFIPTVLDRTVVDKWFKSNDEEAFTFARMLIAQEGLLCGGSAGSTVAVAVKA\
12 |       AQELQEGQRCVVILPDSVRNYMTKFLSDRWMLQKGFLKEEDLTEKKPWWWHLRVQELGLS\
13 |       APLTVLPTITCGHTIEILREKGFDQAPVVDEAGVILGMVTLGNMLSSLLAGKVQPSDQVG\
14 |       KVIYKQFKQIRLTDTLGRLSHILEMDHFALVVHEQIQYHSTGKSSQRQMVFGVVTAIDLL\
15 |       NFVAAQERDQK" 
16 | experiment: 'Coupled growth'
17 | transform: 'VAMP-seq'
18 | authour: 'Sun et al.'
19 | year: 2018
20 | title: 'A proactive genotype-to-patient-phenotype map for cystathionine beta-synthase'
21 | lab: ['Roth']
22 | doi: '10.1101/473983'
23 | pmid: ''
24 | url: 'https://www.biorxiv.org/content/10.1101/473983v3'
25 | mavedb_urn: 'urn:mavedb:00000005-a'
26 | notes: 'Used low B6 dataset as they were similar but it produced stronger effects'
27 | input_files:
28 |   - 'urn_mavedb_00000005-a-4_scores.csv' 
29 | source: 'MaveDB'
30 | qc:
31 |   filter: False
32 |   notes:
33 | 


--------------------------------------------------------------------------------
/meta/fasta/m_musculus_ube4b.fa:
--------------------------------------------------------------------------------
 1 | >sp|Q9ES00|UBE4B_MOUSE Ubiquitin conjugation factor E4 B OS=Mus musculus OX=10090 GN=Ube4b PE=1 SV=3
 2 | MEELSADEIRRRRLARLAGGQTSQPTTPLTSPQRENPPGPPIAASAPGPSQSLGLNVHNM
 3 | TPATSPIGAAGVAHRSQSSEGVSSLSSSPSNSLETQSQSLSRSQSMDIDGVSCEKSMSQV
 4 | DVDSGIENMEVDENDRREKRSLSDKEPSSGPEVSEEQALQLVCKIFRVSWKDRDRDVIFL
 5 | SSLSAQFKQNPKEVFSDFKDLIGQILMEVLMMSTQTRDENPFASLTATSQPIATAARSPD
 6 | RNLMLNTGSSSGTSPMFCNMGSFSTSSLSSLGASGGASNWDSYSDHFTIETCKETDMLNY
 7 | LIECFDRVGIEEKKAPKMCSQPAVSQLLSNIRSQCISHTALVLQGSLTQPRSLQQPSFLV
 8 | PYMLCRNLPYGFIQELVRTTHQDEEVFKQIFIPILQGLALAAKECSLESDYFKYPLMALG
 9 | ELCETKFGKTHPMCNLVASLPLWLPKSLSPGSGRELQRLSYLGAFFSFSVFAEDDAKVVE
10 | KYFSGPAITLENTRVVSQSLQHYLELGRQELFKILHSILLNGETREAALSYMAALVNANM
11 | KKAQMQADDRLVSTDGFMLNLLWVLQQLSTKIKLETVDPTYIFHPRCRITLPNDETRINA
12 | TMEDVNERLTELYGDQPPFSEPKFPTECFFLTLHAHHLSILPSCRRYIRRLRAIRELNRT
13 | VEDLKNNESQWKDSPLATRHREMLKRCKTQLKKLVRCKACADAGLLDESFLRRCLNFYGL
14 | LIQLMLRILDPAYPDVTLPLNSEVPKVFAALPEFYVEDVAEFLFFIVQYSPQVLYEPCTQ
15 | DIVMFLVVMLCNQNYIRNPYLVAKLVEVMFMTNPSVQPRTQKFFEMIENHPLSTKLLVPS
16 | LMKFYTDVEHTGATSEFYDKFTIRYHISTIFKSLWQNIAHHGTFMEEFNSGKQFVRYINM
17 | LINDTTFLLDESLESLKRIHEVQEEMKNKEQWDQLPRDQQQARQSQLAQDERVSRSYLAL
18 | ATETVDMFHLLTKQVQKPFLRPELGPRLAAMLNFNLQQLCGPKCRDLKVENPEKYGFEPK
19 | KLLDQLTDIYLQLDCARFAKAIADDQRSYSKELFEEVISKMRKAGIKSTIAIEKFKLLAE
20 | KVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDP
21 | FNRQMLTESMLEPVPELKEQIQAWMREKQSSDH
22 | 
23 | 


--------------------------------------------------------------------------------
/data/studies/findlay_2014_dbr1/standardise_findlay_2014_dbr1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Findlay et al. 2014 (DBR1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/findlay_2014_dbr1/findlay_2014_dbr1.yaml')
 8 | dm_data <- read_xlsx('data/studies/findlay_2014_dbr1/raw/41586_2014_BFnature13695_MOESM383_ESM.xlsx', skip = 5, na = 'NA',
 9 |                      col_names = c('seq', 'log2_enrichment_score_day11_rep1', 'log2_enrichment_score_day11_rep2',
10 |                                    'position', 'wt', 'mut', 'mut_type')) %>%
11 |   mutate(position = as.integer(na_if(position, "3'SS")),
12 |          mut = ifelse(mut == 'WT', wt, mut),
13 |          raw_score = rowMeans(select(., log2_enrichment_score_day11_rep1, log2_enrichment_score_day11_rep2), na.rm = TRUE) %>% replace_na(NA)) %>%
14 |   group_by(position, wt, mut) %>%
15 |   summarise(raw_score = mean(raw_score, na.rm=TRUE)) %>%
16 |   ungroup() %>%
17 |   mutate(transformed_score = raw_score,
18 |          score = normalise_score(transformed_score),
19 |          class = get_variant_class(wt, mut)) %>%
20 |   drop_na(position, raw_score) %>%
21 |   filter(!mut == 'DEL') %>%
22 |   select(position, wt, mut, score, transformed_score, raw_score, class)
23 | 
24 | # Save output
25 | standardise_study(dm_data, meta$study, meta$transform)
26 | 


--------------------------------------------------------------------------------
/data/studies/araya_2012_yap1/araya_2012_yap1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'araya_2012_yap1' 
 2 | gene: 'YAP1'
 3 | domain: 'WW'
 4 | uniprot_id: 'P46937'
 5 | gene_type: 'TF'
 6 | species: 'H. sapiens'
 7 | seq: "MDPGQQPPPQPAPQGQGQPPSQPPQGQGPPSGPGQPAPAATQAAPQAPPAGHQI\
 8 |       VHVRGDSETDLEALFNAVMNPKTANVPQTVPMRLRKLPDSFFKPPEPKSHSRQA\
 9 |       STDAGTAGALTPQHVRAHSSPASLQLGAVSPGTLTPTGVVSGPAATPTAQHLRQ\
10 |       SSFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLSQMNVTAPT\
11 |       SPPVQQNMMNSASGPLPDGWEQAMTQDGEIYYINHKNKTTSWLDPRLDPRFAMN\
12 |       QRISQSAPVKQPPPLAPQSPQGGVMGGSNSNQQQQMRLQQLQMEKERLRLKQQE\
13 |       LLRQAMRNINPSTANSPKCQELALRSQLPTLEQDGGTQNPVSSPGMSQELRTMT\
14 |       TNSSDPFLNSGTYHSRDESTDSGLSMSSYSVPRTPDDFLNSVDEMDTGDTINQS\
15 |       TLPSQQNRFPDYLEAIPGTNVDLGTLEGDGMNIEGEELMPSLQEALSSDILNDM\
16 |       ESVLAATKLDKESFLTWL"
17 | experiment: 'ligand binding'
18 | transform: 'None'
19 | authour: 'Araya et al.'
20 | year: 2012
21 | title: 'A fundamental protein property, thermodynamic stability, revealed solely from large-scale measurements of protein function'
22 | lab: ['Fields', 'Fowler']
23 | doi: '10.1073/pnas.1209751109'
24 | pmid: '23035249'
25 | url: 'http://www.pnas.org/content/109/42/16858'
26 | notes: 'Positions from paper are offset by 9, so add 160 to reach same start of WW domain in sequence'
27 | input_files:
28 |   - 'urn_mavedb_00000002-a-2_scores.csv' 
29 | source: 'MaveDB'
30 | mavedb_urn: 'urn:mavedb:00000002-a-2'
31 | qc:
32 |   filter: False
33 |   notes:
34 | 


--------------------------------------------------------------------------------
/data/studies/hietpas_2011_hsp90/hietpas_2011_hsp90.yaml:
--------------------------------------------------------------------------------
 1 | study: 'hietpas_2011_hsp90'
 2 | gene: 'HSP90'
 3 | uniprot_id: 'P02829'
 4 | gene_type: 'Chaperone'
 5 | species: 'S. cerevisiae'
 6 | seq: "MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPK\
 7 |       QLETEPDLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEAL\
 8 |       SAGADVSMIGQFGVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDE\
 9 |       VNERIGRGTILRLFLKDDQLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKE\
10 |       VPIPEEEKKDEEKKDEEKKDEDDKKPKLEEVDEEEEKKPKTKKVKEEVQEIEEL\
11 |       NKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPLYVKHFSVEGQLEFRAILFIP\
12 |       KRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSFVKGVVDSEDLPLNL\
13 |       SREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSAFSKNIKLGVHED\
14 |       TQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLKAVEKSP\
15 |       FLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER\
16 |       EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMK\
17 |       AQALRDSSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYE\
18 |       TALLTSGFSLDEPTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPAD\
19 |       TEMEEVD"
20 | experiment: 'Growth'
21 | transform: 'None'
22 | authour: 'Hietpas et al.'
23 | year: 2011
24 | title: 'Experimental illumination of a fitness landscape'
25 | lab: ['Bolon']
26 | doi: '10.1073/pnas.1016024108'
27 | pmid: '21464309'
28 | url: 'http://www.pnas.org/content/108/19/7896'
29 | input_files:
30 |   - 'sd02.csv' 
31 | source: 'SI - Dataset S2'
32 | qc:
33 |   filter: False
34 |   notes:
35 | 


--------------------------------------------------------------------------------
/data/studies/wrenbeck_2017_amie/standardise_wrenbeck_2017_amie.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Wrenbeck et al. 2017 (amiE)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/wrenbeck_2017_amie/wrenbeck_2017_amie.yaml')
 8 | dm_data <- bind_rows(acetamide = read_tsv('data/studies/wrenbeck_2017_amie/raw/amiESelectionFitnessData_Acetamide.txt', na = c('NS', 'None')),
 9 |                      isobutyramide = read_tsv('data/studies/wrenbeck_2017_amie/raw/amiESelectionFitnessData_Isobutyramide.txt', na = c('NS', 'None')),
10 |                      propionamide = read_tsv('data/studies/wrenbeck_2017_amie/raw/amiESelectionFitnessData_Propionamide.txt', na = c('NS', 'None')),
11 |                      .id = 'condition') %>%
12 |   select(position = location, mut = mutation, raw_score = normalized_fitness, condition) %>%
13 |   pivot_wider(names_from = condition, values_from = raw_score) %>%
14 |   mutate(raw_score = rowMeans(select(., acetamide, isobutyramide, propionamide), na.rm = TRUE) %>% replace_na(NA),
15 |          transformed_score = raw_score,
16 |          score = normalise_score(transformed_score),
17 |          wt = str_split(meta$seq, '')[[1]][position],
18 |          class = get_variant_class(wt, mut)) %>%
19 |   drop_na(score) # Some variants not measured in any condition
20 | 
21 | # Save output
22 | standardise_study(dm_data, meta$study, meta$transform)
23 | 


--------------------------------------------------------------------------------
/data/studies/ahler_2019_src/ahler_2019_src.yaml:
--------------------------------------------------------------------------------
 1 | study: 'ahler_2019_src' 
 2 | gene: 'Src'
 3 | domain: 'Catalytic and SH4'
 4 | uniprot_id: 'P12931'
 5 | gene_type: 'Kinase'
 6 | species: 'H. sapiens'
 7 | seq: "MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADGHRGPSAAFAPAAAE\
 8 |       PKLFGGFNSSDTVTSPQRAGPLAGGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGD\
 9 |       WWLAHSLSTGQTGYIPSNYVAPSDSIQAEEWYFGKITRRESERLLLNAENPRGTFLVRES\
10 |       ETTKGAYCLSVSDFDNAKGLNVKHYKIRKLDSGGFYITSRTQFNSLQQLVAYYSKHADGL\
11 |       CHRLTTVCPTSKPQTQGLAKDAWEIPRESLRLEVKLGQGCFGEVWMGTWNGTTRVAIKTL\
12 |       KPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKY\
13 |       LRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYT\
14 |       ARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVER\
15 |       GYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFLEDYFTSTEPQYQPGENL" 
16 | experiment: 'Coupled yeast growth'
17 | transform: '-x'
18 | authour: 'Ahler et al.'
19 | year: 2019
20 | title: 'A Combined Approach Reveals a Regulatory Mechanism Coupling Src’s Kinase Activity, Localization, and Phosphotransferase-Independent Functions'
21 | lab: ['Fowler', 'Maly']
22 | doi: '10.1016/j.molcel.2019.02.003'
23 | pmid: '30956043'
24 | url: 'https://www.sciencedirect.com/science/article/pii/S1097276519300930'
25 | input_files:
26 |   - 'urn_mavedb_00000041-b-1_scores.csv'
27 |   - 'urn_mavedb_00000041-a-1_scores.csv'
28 | source: 'MaveDB'
29 | mavedb_urn: 'urn:mavedb:00000041'
30 | qc:
31 |   filter: False
32 |   notes:
33 | 


--------------------------------------------------------------------------------
/bin/data_processing/make_gene_fasta.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Generate a fasta file for a gene, given a set of study yamls
 4 | """
 5 | import argparse
 6 | from ruamel.yaml import YAML
 7 | 
 8 | from subtypes_utils import gene_to_filename
 9 | 
10 | FASTA_LINE_LENGTH = 80
11 | 
12 | def main(args):
13 |     """Main script"""
14 |     yaml = YAML(typ='safe')
15 |     seq = None
16 |     gene = None
17 |     for study_yaml in args.yaml:
18 |         with open(study_yaml, 'r') as yaml_file:
19 |             conf = yaml.load(yaml_file)
20 | 
21 |         if seq is None:
22 |             seq = conf['seq']
23 |             gene = conf['gene']
24 |         elif not gene == conf['gene']:
25 |             raise ValueError(f"Studies are for different genes")
26 |         elif not seq == conf['seq']:
27 |             raise ValueError(f"Studies have different sequences for {gene}")
28 | 
29 |     print(f">{gene_to_filename(gene)}")
30 |     for i in range(0, len(seq), FASTA_LINE_LENGTH):
31 |         print(seq[i:(i + FASTA_LINE_LENGTH)])
32 | 
33 | def parse_args():
34 |     """Process input arguments"""
35 |     parser = argparse.ArgumentParser(description=__doc__,
36 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
37 | 
38 |     parser.add_argument('yaml', metavar='Y', nargs='+', help="Input study config YAML file(s)")
39 | 
40 |     return parser.parse_args()
41 | 
42 | if __name__ == "__main__":
43 |     ARGS = parse_args()
44 |     main(ARGS)
45 | 


--------------------------------------------------------------------------------
/bin/data_processing/filter_pdb.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | Filter PDB file based on section list
 4 | """
 5 | import sys
 6 | import argparse
 7 | from pathlib import Path
 8 | 
 9 | from Bio.PDB import PDBParser
10 | from Bio.PDB.PDBIO import PDBIO
11 | 
12 | from subtypes_utils import SectionSelecter, import_sections
13 | 
14 | def main(args):
15 |     """Main script"""
16 |     pdb_name = Path(args.pdb).stem
17 |     # deal with FoldX repaired PDBs
18 |     if pdb_name.endswith('_Repair'):
19 |         pdb_name = pdb_name.replace('_Repair', '')
20 | 
21 |     pdb_parser = PDBParser()
22 |     structure = pdb_parser.get_structure(pdb_name, args.pdb)
23 | 
24 |     sections = import_sections(args.yaml, pdb_name)
25 | 
26 |     pdbio = PDBIO()
27 |     pdbio.set_structure(structure)
28 |     pdbio.save(sys.stdout, select=SectionSelecter(sections))
29 | 
30 | def parse_args():
31 |     """Process input arguments"""
32 |     parser = argparse.ArgumentParser(description=__doc__,
33 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
34 | 
35 |     parser.add_argument('pdb', metavar='P', help="Input PDB file")
36 | 
37 |     parser.add_argument('--yaml', '-y',
38 |                         help=("YAML file detailing regions to process for a set of genes or "
39 |                               "these sections in raw YAML strings"))
40 | 
41 |     return parser.parse_args()
42 | 
43 | if __name__ == "__main__":
44 |     ARGS = parse_args()
45 |     main(ARGS)
46 | 


--------------------------------------------------------------------------------
/data/studies/heredia_2018_ccr5/heredia_2018_ccr5.yaml:
--------------------------------------------------------------------------------
 1 | study: 'heredia_2018_ccr5' 
 2 | gene: 'CCR5'
 3 | uniprot_id: 'P51681'
 4 | gene_type: 'GPCR'
 5 | species: 'H. sapiens'
 6 | seq: "MDYQVSSPIYDINYYTSEPCQKINVKQIAARLLPPLYSLVFIFGFVGNMLVILILINCKR\
 7 |       LKSMTDIYLLNLAISDLFFLLTVPFWAHYAAAQWDFGNTMCQLLTGLYFIGFFSGIFFII\
 8 |       LLTIDRYLAVVHAVFALKARTVTFGVVTSVITWVVAVFASLPGIIFTRSQKEGLHYTCSS\
 9 |       HFPYSQYQFWKNFQTLKIVILGLVLPLLVMVICYSGILKTLLRCRNEKKRHRAVRLIFTI\
10 |       MIVYFLFWAPYNIVLLLNTFQEFFGLNNCSSSNRLDQAMQVTETLGMTHCCINPIIYAFV\
11 |       GEKFRNYLLVFFQKHIAKRFCKCCSIFQQEAPERASSVYTRSTGEQEISVGL" 
12 | experiment: 'Surface expression and HIV-1 blocking antibody 2D7 affinity via FACS'
13 | transform: 'None'
14 | authour: 'Heredia et al.'
15 | year: 2018
16 | title: 'Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning'
17 | lab: ['Procko']
18 | doi: '10.4049/jimmunol.1800343'
19 | pmid: '29678950'
20 | url: 'https://www.jimmunol.org/content/200/11/3825'
21 | notes: "Manually edited the raw XLSX file, moving Y14 in column 1 down one row to its \
22 |         proper place as it had been erroneously offset to be in the last row of the 13th \
23 |         AA data. Use mean of antibody binding experiment scores as method also incorporated \
24 |         surface expression."
25 | input_files:
26 |   - 'GSE100368_enrichment_ratios_CCR5.xlsx' 
27 | source: 'NCBI GEO: GSE100368 - https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100368'
28 | qc:
29 |   filter: False
30 |   notes:
31 | 


--------------------------------------------------------------------------------
/data/studies/lee_2018_ha/lee_2018_ha.yaml:
--------------------------------------------------------------------------------
 1 | study: 'lee_2018_ha' 
 2 | gene: 'HA'
 3 | uniprot_id: 'P03437'
 4 | gene_type: 'Viral'
 5 | species: 'Influenza'
 6 | strain: 'Human adapted A/Perth/16/2009, H3N2'
 7 | seq: "MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSSSTGEICDS\
 8 |       PHQILDGKNCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNES\
 9 |       FNWTGVTQNGTSSACIRRSKNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVLHPGTDKDQIFL\
10 |       YAQASGRITVSTKRSQQTVSPNIGSRPRVRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGK\
11 |       SSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNVPEKQTRGIFGA\
12 |       IAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEG\
13 |       RIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCD\
14 |       NACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNI\
15 |       RCNICI" 
16 | experiment: 'Growth'
17 | transform: 'log2(x_mut_i / x_wt_i)'
18 | authour: 'Lee et al.'
19 | year: 2018
20 | title: 'Deep mutational scanning of hemagglutinin helps predict evolutionary fates of human H3N2 influenza variants'
21 | lab: ['Bloom']
22 | doi: '10.1073/pnas.1806133115'
23 | pmid: '30104379'
24 | url: 'https://www.pnas.org/cgi/doi/10.1073/pnas.1806133115'
25 | notes: "Only gives per position frequencies, so cannot easily get comparable \
26 |         scores across the protein without re-running analysis."
27 | input_files:
28 |   - 'pnas.1806133115.sd03.xlsx' 
29 | source: 'SI - Table S3'
30 | qc:
31 |   filter: False
32 |   notes:
33 | 


--------------------------------------------------------------------------------
/data/studies/mishra_2016_hsp90/mishra_2016_hsp90.yaml:
--------------------------------------------------------------------------------
 1 | study: 'mishra_2016_hsp90' 
 2 | gene: 'HSP90'
 3 | uniprot_id: 'P02829'
 4 | gene_type: 'Chaperone'
 5 | species: 'S. cerevisiae'
 6 | seq: "MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPKQLETEP\
 7 |       DLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEALSAGADVSMIGQF\
 8 |       GVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDEVNERIGRGTILRLFLKDD\
 9 |       QLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKEVPIPEEEKKDEEKKDEEKKDEDDK\
10 |       KPKLEEVDEEEEKKPKTKKVKEEVQEIEELNKTKPLWTRNPSDITQEEYNAFYKSISNDW\
11 |       EDPLYVKHFSVEGQLEFRAILFIPKRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPE\
12 |       WLSFVKGVVDSEDLPLNLSREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSA\
13 |       FSKNIKLGVHEDTQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLK\
14 |       AVEKSPFLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER\
15 |       EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMKAQALRD\
16 |       SSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYETALLTSGFSLDE\
17 |       PTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPADTEMEEVD"
18 | experiment: 'Growth'
19 | transform: 'None'
20 | authour: 'Mishra et al.'
21 | year: 2016
22 | title: 'Systematic Mutant Analyses Elucidate General and Client-Specific Aspects of Hsp90 Function'
23 | lab: ['Bolon']
24 | doi: '10.1016/j.celrep.2016.03.046'
25 | pmid: '27068472'
26 | url: 'https://www.sciencedirect.com/science/article/pii/S2211124716303175'
27 | input_files:
28 |   - '1-s2.0-S2211124716303175-mmc2.xlsx' 
29 | source: 'SI - Table S1'
30 | qc:
31 |   filter: False
32 |   notes:
33 | 


--------------------------------------------------------------------------------
/data/studies/jones_2019_adrb2/standardise_jones_2019_adrb2.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Jones et al. 2019 (ADRB2)
 3 | 
 4 | source('src/config.R')
 5 | source('src/study_standardising.R')
 6 | 
 7 | # Determine WT like
 8 | data("BLOSUM62")
 9 | similar_aas <- as_tibble(BLOSUM62, rownames = 'aa1') %>%
10 |   pivot_longer(-aa1, names_to = 'aa2', values_to = 'blosum') %>%
11 |   filter(blosum > 2, !aa1 == aa2, !aa1 %in% c('B', 'J', 'Z', 'X', '*'), !aa2 %in% c('B', 'J', 'Z', 'X', '*'))
12 | similar_aas <- str_c(similar_aas$aa1, similar_aas$aa2)
13 |   
14 | # Import and process data
15 | meta <- read_yaml('data/studies/jones_2019_adrb2/jones_2019_adrb2.yaml')
16 | dm_data <- read_csv('data/studies/jones_2019_adrb2/raw/lib-med.csv') %>%
17 |   filter(Condition == 0.150) %>% # Select only EC50 measure (generally correlate between these)
18 |   select(position = Pos, mut = AA, raw_score = Median, Repeat) %>%
19 |   group_by(position, mut) %>%
20 |   summarise(raw_score = median(raw_score)) %>% # Average biological repeats
21 |   ungroup() %>%
22 |   mutate(wt = str_split(meta$seq, '')[[1]][position],
23 |          pair = str_c(wt, mut),
24 |          # Divide by average score of v. similar AAs (blosum62 > 2) as substitute for wt
25 |          transformed_score = log2(raw_score / mean(raw_score[pair %in% similar_aas], na.rm=TRUE)),
26 |          score = normalise_score(transformed_score), 
27 |          class = get_variant_class(wt, mut))
28 | 
29 | # Save output
30 | standardise_study(dm_data, meta$study, meta$transform)
31 | 
32 | 


--------------------------------------------------------------------------------
/data/studies/firnberg_2014_tem1/standardise_firnberg_2014_tem1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Firnberg et al. 2014 (TEM1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data: read per-codon fitness, renumber positions, average codons per (position, wt, mut)
 7 | meta <- read_yaml('data/studies/firnberg_2014_tem1/firnberg_2014_tem1.yaml')
 8 | dm_data <- read_xlsx('data/studies/firnberg_2014_tem1/raw/firnberg_2014_tem1.xlsx', skip = 1,
 9 |                      col_names = c('position', 'ref_codon', 'alt_codon', 'wt', 'mut', 'base_changes', 'seq_counts_0.25',
10 |                                    'seq_counts_0.5', 'seq_counts_1', 'seq_counts_2', 'seq_counts_4', 'seq_counts_8',
11 |                                    'seq_counts_16', 'seq_counts_32', 'seq_counts_64', 'seq_counts_128', 'seq_counts_256', 
12 |                                    'seq_counts_512', 'seq_counts_1024', 'total_seq_count', 'raw_score', 'fitness_err')) %>%
13 |   drop_na(position) %>%
14 |   filter(!wt == '*') %>%
15 |   mutate(position = rep(1:nchar(meta$seq), each=64)) %>% # Numbering seems broken - starts at 3 and then misses 237 & 251
16 |   group_by(position, wt, mut) %>%
17 |   summarise(transformed_score = mean(log2(raw_score), na.rm = TRUE), # Mean per-codon log2 fitness; computed first because summarise() lets later expressions see columns overwritten earlier in the same call
18 |             raw_score = mean(raw_score, na.rm = TRUE)) %>% # Mean fitness over codons
19 |   ungroup() %>% select(position, wt, mut, raw_score, transformed_score) %>% # Keep the original column order
20 |   mutate(score = normalise_score(transformed_score), 
21 |          class = get_variant_class(wt, mut)) %>%
22 |   drop_na(raw_score) # Some variants not measured in any codon
23 | 
24 | # Save output
25 | standardise_study(dm_data, meta$study, meta$transform)
26 | 


--------------------------------------------------------------------------------
/data/studies/jiang_2013_hsp90/jiang_2013_hsp90.yaml:
--------------------------------------------------------------------------------
 1 | study: 'jiang_2013_hsp90'
 2 | gene: 'HSP90'
 3 | domain: 'Putative substrate binding loop'
 4 | uniprot_id: 'P02829'
 5 | gene_type: 'Chaperone'
 6 | species: 'S. cerevisiae'
 7 | seq: "MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPK\
 8 |       QLETEPDLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEAL\
 9 |       SAGADVSMIGQFGVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDE\
10 |       VNERIGRGTILRLFLKDDQLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKE\
11 |       VPIPEEEKKDEEKKDEEKKDEDDKKPKLEEVDEEEEKKPKTKKVKEEVQEIEEL\
12 |       NKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPLYVKHFSVEGQLEFRAILFIP\
13 |       KRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSFVKGVVDSEDLPLNL\
14 |       SREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSAFSKNIKLGVHED\
15 |       TQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLKAVEKSP\
16 |       FLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER\
17 |       EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMK\
18 |       AQALRDSSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYE\
19 |       TALLTSGFSLDEPTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPAD\
20 |       TEMEEVD" 
21 | experiment: 'Complement'
22 | transform: 'log2(x)'
23 | authour: 'Jiang et al.'
24 | year: 2013
25 | title: 'Latent Effects of Hsp90 Mutants Revealed at Reduced Expression Levels'
26 | lab: ['Bolon']
27 | doi: '10.1371/journal.pgen.1003600'
28 | pmid: '23825969'
29 | url: 'https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003600'
30 | input_files:
31 |   - 'journal.pgen.1003600.s014.xlsx' 
32 | source: 'SI - Table S3'
33 | qc:
34 |   filter: False
35 |   notes:
36 | 


--------------------------------------------------------------------------------
/data/studies/giacomelli_2018_tp53/giacomelli_2018_tp53.yaml:
--------------------------------------------------------------------------------
 1 | study: 'giacomelli_2018_tp53' 
 2 | gene: 'TP53'
 3 | uniprot_id: 'P04637'
 4 | gene_type: 'TF'
 5 | species: 'H. sapiens'
 6 | seq: "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP\
 7 |       DEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK\
 8 |       SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE\
 9 |       RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS\
10 |       SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP\
11 |       PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG\
12 |       GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD" 
13 | experiment: 'Complement'
14 | transform: '-x'
15 | authour: 'Giacomelli et al.'
16 | year: 2018
17 | title: 'Mutational processes shape the landscape of TP53 mutations in human cancer'
18 | lab: ['Hahn', 'Root', 'Johannessen']
19 | doi: '10.1038/s41588-018-0204-y'
20 | pmid: '30224644'
21 | url: 'https://www.nature.com/articles/s41588-018-0204-y'
22 | notes: "Score is in Z-score format across 3 different conditions: \
23 |         wt p53 background and nutlin3 to select for dominant negatives; \
24 |         null p53 background and nutlin3 to select for LOF;
25 |         and null p53 background and etoposide to select for WT like. \
26 |         Here I only use p53 NULL, Etoposide because it aligns most \
27 |         directly with our objective of a functional protein. \
28 |         The WT for this paper differs by P72R from the Uniprot ID."
29 | input_files:
30 |   - '41588_2018_204_MOESM5_ESM.xlsx' 
31 | source: 'SI'
32 | qc:
33 |   filter: False
34 |   notes:
35 | 


--------------------------------------------------------------------------------
/meta/fasta/s_pyrogenes_cas9.fa:
--------------------------------------------------------------------------------
 1 | >sp|Q99ZW2|CAS9_STRP1 CRISPR-associated endonuclease Cas9/Csn1 OS=Streptococcus pyogenes serotype M1 OX=301447 GN=cas9 PE=1 SV=1
 2 | MDKKYSIGLDIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAE
 3 | ATRLKRTARRRYTRRKNRICYLQEIFSNEMAKVDDSFFHRLEESFLVEEDKKHERHPIFG
 4 | NIVDEVAYHEKYPTIYHLRKKLVDSTDKADLRLIYLALAHMIKFRGHFLIEGDLNPDNSD
 5 | VDKLFIQLVQTYNQLFEENPINASGVDAKAILSARLSKSRRLENLIAQLPGEKKNGLFGN
 6 | LIALSLGLTPNFKSNFDLAEDAKLQLSKDTYDDDLDNLLAQIGDQYADLFLAAKNLSDAI
 7 | LLSDILRVNTEITKAPLSASMIKRYDEHHQDLTLLKALVRQQLPEKYKEIFFDQSKNGYA
 8 | GYIDGGASQEEFYKFIKPILEKMDGTEELLVKLNREDLLRKQRTFDNGSIPHQIHLGELH
 9 | AILRRQEDFYPFLKDNREKIEKILTFRIPYYVGPLARGNSRFAWMTRKSEETITPWNFEE
10 | VVDKGASAQSFIERMTNFDKNLPNEKVLPKHSLLYEYFTVYNELTKVKYVTEGMRKPAFL
11 | SGEQKKAIVDLLFKTNRKVTVKQLKEDYFKKIECFDSVEISGVEDRFNASLGTYHDLLKI
12 | IKDKDFLDNEENEDILEDIVLTLTLFEDREMIEERLKTYAHLFDDKVMKQLKRRRYTGWG
13 | RLSRKLINGIRDKQSGKTILDFLKSDGFANRNFMQLIHDDSLTFKEDIQKAQVSGQGDSL
14 | HEHIANLAGSPAIKKGILQTVKVVDELVKVMGRHKPENIVIEMARENQTTQKGQKNSRER
15 | MKRIEEGIKELGSQILKEHPVENTQLQNEKLYLYYLQNGRDMYVDQELDINRLSDYDVDH
16 | IVPQSFLKDDSIDNKVLTRSDKNRGKSDNVPSEEVVKKMKNYWRQLLNAKLITQRKFDNL
17 | TKAERGGLSELDKAGFIKRQLVETRQITKHVAQILDSRMNTKYDENDKLIREVKVITLKS
18 | KLVSDFRKDFQFYKVREINNYHHAHDAYLNAVVGTALIKKYPKLESEFVYGDYKVYDVRK
19 | MIAKSEQEIGKATAKYFFYSNIMNFFKTEITLANGEIRKRPLIETNGETGEIVWDKGRDF
20 | ATVRKVLSMPQVNIVKKTEVQTGGFSKESILPKRNSDKLIARKKDWDPKKYGGFDSPTVA
21 | YSVLVVAKVEKGKSKKLKSVKELLGITIMERSSFEKNPIDFLEAKGYKEVKKDLIIKLPK
22 | YSLFELENGRKRMLASAGELQKGNELALPSKYVNFLYLASHYEKLKGSPEDNEQKQLFVE
23 | QHKHYLDEIIEQISEFSKRVILADANLDKVLSAYNKHRDKPIREQAENIIHLFTLTNLGA
24 | PAAFKYFDTTIDRKRYTSTKEVLDATLIHQSITGLYETRIDLSQLGGD
25 | 
26 | 


--------------------------------------------------------------------------------
/bin/figures/figureS3.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Produce figure S3 (Study Confounding)
 3 | source('src/config.R')
 4 | source('src/subtype_characterisation.R')
 5 | 
 6 | # Combined scan data, with each gene's ID looked up in UNIPROT_IDS
 7 | dms <- read_tsv('data/combined_mutational_scans.tsv') %>%
 8 |   mutate(uniprot_id = unname(UNIPROT_IDS[gene]))
 9 | 
10 | # Tick positions for the UMAP2 axis
11 | umap2_breaks <- c(-2.5, 0, 2.5)
12 | 
13 | # Alternative format study normalising newline and correcting Ras/Src capitals. Not used in paper, where RAS and SRC were incorrectly lowercase
14 | # x: study ID, which names both the directory and the YAML file under data/studies/
15 | # Returns the label "<authour> <year>\n(<gene>)" built from that YAML file
16 | alt_format_study <- function(x){
17 |   yaml <- read_yaml(str_c('data/studies/', x, '/', x, '.yaml'))
18 | 
19 |   gene <- ifelse(yaml$gene == "Ras", "RAS", ifelse(yaml$gene == "Src", "SRC", yaml$gene))
20 |   study <- str_c(yaml$authour, ' ', yaml$year, '\n(', gene, ')')
21 | 
22 |   return(study)
23 | }
24 | 
25 | # Lookup from study ID to label (sapply names its result by the character input)
26 | study_pretty <- sapply(unique(dms$study), alt_format_study)
27 | 
28 | # One facet per study label. The grey layer is drawn from the unlabelled dms, which
29 | # presumably repeats every position as a backdrop in each facet
30 | figure <- mutate(dms, study_pretty = study_pretty[study]) %>%
31 |   ggplot(aes(x = umap1, y = umap2, colour = study_pretty)) +
32 |   facet_wrap(~study_pretty, ncol = 4) +
33 |   geom_point(data = dms, colour = 'grey90', shape = 20, size = 0.8) +
34 |   geom_point(shape = 20, size = 0.8) +
35 |   scale_y_continuous(breaks = umap2_breaks) +
36 |   labs(x = 'UMAP1', y = 'UMAP2') + 
37 |   guides(colour = 'none') # 'none' replaces the deprecated guides(colour = FALSE)
38 | 
39 | # Same 183 x 270 mm page in every format; EPS goes through the cairo device
40 | ggsave('figures/4_figures/figureS3.pdf', figure, width = 183, height = 270, units = 'mm')
41 | ggsave('figures/4_figures/figureS3.png', figure, width = 183, height = 270, units = 'mm')
42 | ggsave('figures/4_figures/figureS3.tiff', figure, width = 183, height = 270, units = 'mm')
43 | ggsave('figures/4_figures/figureS3.eps', figure, width = 183, height = 270, units = 'mm', device=cairo_ps, fallback_resolution = 600)
44 | 


--------------------------------------------------------------------------------
/data/studies/ashenberg_2017_np/ashenberg_2017_np.yaml:
--------------------------------------------------------------------------------
 1 | study: 'ashenberg_2017_np'
 2 | gene: 'NP'
 3 | uniprot_id: 'I6TAH8'
 4 | gene_type: 'viral'
 5 | species: 'Influenza'
 6 | strain: 'Human adapted strain A/Aichi/2/1968, H3N2'
 7 | seq: "MASQGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTELKLSDYEGRLIQNSLTIERMVLS\
 8 |       AFDERRNKYLEEHPSAGKDPKKTGGPIYKRVDRKWMRELVLYDKEEIRRIWRQANNGDDATAGLTHMMI\
 9 |       WHSNLNDTTYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTMVMELIRMIKRGINDRNFW\
10 |       RGENGRKTRSAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCL\
11 |       PACVYGPAVASGYDFEKEGYSLVGIDPFKLLQNSQVYSLIRPNENPAHKSQLVWMACNSAAFEDLRVLS\
12 |       FIRGTKVSPRGKLSTRGVQIASNENMDAMESSTLELRSRYWAIRTRSGGNTNQQRASAGQISVQPAFSV\
13 |       QRNLPFDKPTIMAAFTGNTEGRTSDMRAEIIRMMEGAKPEEMSFQGRGVFELSDERAANPIVPSFDMSN\
14 |       EGSYFFGDNAEEYDN"
15 | experiment: 'Complement'
16 | transform: 'None'
17 | authour: 'Ashenberg et al.'
18 | year: 2017
19 | title: 'Deep mutational scanning identifies sites in influenza nucleoprotein that affect viral inhibition by MxA'
20 | lab: ['Bloom']
21 | doi: '10.1371/journal.ppat.1006288'
22 | pmid: '28346537'
23 | url: 'https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1006288'
24 | input_files:
25 |   - 'journal.ppat.1006288.s013.csv'
26 | source: 'SI - S3 File (https://doi.org/10.1371/journal.ppat.1006288.s013)'
27 | qc:
28 |   filter: True 
29 |   notes: "Only report relative selection when grown in cells with MxA and \
30 |           without, effectively normalising for most selective effects. \
31 |           Consequently the score doesn't align with full fitness (and \
32 |           wasn't meant to) as shown by no correlation with SIFT scores."
33 | 


--------------------------------------------------------------------------------
/data/studies/wagenaar_2014_braf/standardise_wagenaar_2014_braf.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Wagenaar et al. 2014 (BRAF)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/wagenaar_2014_braf/wagenaar_2014_braf.yaml')
 8 | wt_residues <- str_split(meta$seq, '')[[1]] # reference sequence, one residue per element
 9 | 
10 | # Skip the first three rows of the sheet and give the replicate columns (six per replicate,
11 | # the unlabelled ones imported as ...N) descriptive names
12 | enrichment <- read_xls('data/studies/wagenaar_2014_braf/raw/pcmr12171-sup-0011-TableS1.xls', skip = 3)
13 | enrichment <- rename(enrichment,
14 |                      position = Position,
15 |                      mut = acid,
16 |                      median_enrichment = Median,
17 |                      rep1_codon1 = `Replicate 1`,
18 |                      rep1_codon2 = ...5,
19 |                      rep1_codon3 = ...6,
20 |                      rep1_codon4 = ...7,
21 |                      rep1_codon5 = ...8,
22 |                      rep1_codon6 = ...9,
23 |                      rep2_codon1 = `Replicate 2`,
24 |                      rep2_codon2 = ...11,
25 |                      rep2_codon3 = ...12,
26 |                      rep2_codon4 = ...13,
27 |                      rep2_codon5 = ...14,
28 |                      rep2_codon6 = ...15,
29 |                      ic50_vs_brafV600E = BRAFV600E,
30 |                      individually_tested = `mutant?`,
31 |                      possible_by_single_sub = `substitution?`)
32 | 
33 | # Drop rows with no first measurement and rows that repeat the 'Replicate 1' header text,
34 | # then convert every column except the four annotation columns to numeric
35 | measured <- filter(enrichment, !is.na(rep1_codon1), rep1_codon1 != 'Replicate 1') %>%
36 |   mutate_at(vars(-mut, -individually_tested, -possible_by_single_sub, -ic50_vs_brafV600E), as.numeric)
37 | 
38 | # The score is the negated log2 of the median enrichment, normalised by normalise_score
39 | scored <- mutate(measured,
40 |                  wt = wt_residues[position],
41 |                  raw_score = median_enrichment,
42 |                  transformed_score = -log2(median_enrichment),
43 |                  score = normalise_score(transformed_score),
44 |                  class = get_variant_class(wt, mut))
45 | dm_data <- select(scored, position, wt, mut, score, transformed_score, raw_score, class)
46 | 
47 | # Save output
48 | standardise_study(dm_data, meta$study, meta$transform)


--------------------------------------------------------------------------------
/data/studies/sarkisyan_2016_gfp/standardise_sarkisyan_2016_gfp.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Sarkisyan et al. 2016 (GFP)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/sarkisyan_2016_gfp/sarkisyan_2016_gfp.yaml')
 8 | raw_data <- read_tsv('data/studies/sarkisyan_2016_gfp/raw/amino_acid_genotypes_to_brightness.tsv', skip = 1,
 9 |                     col_names = c('mut', 'barcodes', 'median_brightness', 'std'))
10 | 
11 | # The row with no mutation string supplies the WT reference brightness
12 | wt_brightness <- filter(raw_data, is.na(mut)) %>% pull(median_brightness)
13 | 
14 | # Largest number of substitutions per genotype that is used. A single limit is shared by the
15 | # filter, the column split and the fallback mean, which used to test n_mut <= 4 even though
16 | # nothing above 3 survives the filter
17 | max_muts <- 3
18 | 
19 | dm_data <- mutate(raw_data, n_mut = str_count(mut, ':') + 1) %>%
20 |   filter(n_mut <= max_muts) %>% # also drops the WT row, whose n_mut is NA
21 |   separate(mut, into = str_c('mut', 1:max_muts), sep = ':', fill = 'right') %>%
22 |   # One row per substitution, each carrying the brightness of the genotype it came from
23 |   pivot_longer(cols = starts_with('mut'), names_to = 'n', names_prefix = 'mut', values_to = 'mut') %>%
24 |   drop_na(mut) %>%
25 |   select(-n, -barcodes, -std) %>%
26 |   tidyr::extract(mut, into = c('wt', 'position', 'mut'), 'S([A-Z])([0-9]+)([A-Z*])', convert=TRUE, remove=FALSE) %>%
27 |   mutate(position = position + 2) %>% # Numbered from 3rd residue for some reason
28 |   arrange(position, mut) %>%
29 |   group_by(position, wt, mut) %>%
30 |   summarise(raw_score = if_else(1 %in% n_mut, mean(median_brightness[n_mut == 1], na.rm = TRUE), # Use value of single mut if available
31 |                                 mean(median_brightness[n_mut <= max_muts], na.rm = TRUE))) %>%
32 |   ungroup() %>%
33 |   # Brightness relative to WT on a log2 scale
34 |   mutate(transformed_score = log2(raw_score / wt_brightness),
35 |          score = normalise_score(transformed_score),
36 |          class = get_variant_class(wt, mut))
37 | 
38 | # Save output
39 | standardise_study(dm_data, meta$study, meta$transform)
40 | 


--------------------------------------------------------------------------------
/data/studies/wagenaar_2014_braf/wagenaar_2014_braf.yaml:
--------------------------------------------------------------------------------
 1 | study: 'wagenaar_2014_braf' 
 2 | gene: 'BRAF'
 3 | uniprot_id: 'P15056'
 4 | gene_type: 'Kinase'
 5 | species: 'H. sapiens'
 6 | seq: "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\
 7 |       IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\
 8 |       TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\
 9 |       LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\
10 |       TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\
11 |       PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\
12 |       DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\
13 |       GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\
14 |       AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\
15 |       LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATE\
16 |       KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\
17 |       NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\
18 |       LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" 
19 | experiment: 'Complement'
20 | transform: '-log2(x)'
21 | authour: 'Wagenaar et al.'
22 | year: 2014
23 | title: 'Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain'
24 | lab: ['Green', 'Bolon']
25 | doi: '10.1111/pcmr.12171'
26 | pmid: '24112705'
27 | url: 'https://onlinelibrary.wiley.com/doi/full/10.1111/pcmr.12171'
28 | notes: 'Only retained variants they deemed significantly different from wt'
29 | input_files:
30 |   - 'pcmr12171-sup-0011-TableS1.xls' 
31 | source: 'SI - Table S1'
32 | qc:
33 |   filter: True
34 |   notes: 'Selecting for drug resistance in oncogenic state - not exactly normal function'
35 | 


--------------------------------------------------------------------------------
/data/studies/dorrity_2018_ste12/dorrity_2018_ste12.yaml:
--------------------------------------------------------------------------------
 1 | study: 'dorrity_2018_ste12'
 2 | gene: 'STE12'
 3 | domain: 'DNA-binding' 
 4 | uniprot_id: 'P13574'
 5 | gene_type: 'Transcription Factor'
 6 | species: 'S. cerevisiae'
 7 | seq: "MKVQITNSRTEEILKVQANNENDEVSKATPGEVEESLRLIGDLKFFLATAPVNWQENQII\
 8 |       RRYYLNSGQGFVSCVFWNNLYYITGTDIVKCCLYRMQKFGREVVQKKKFEEGIFSDLRNL\
 9 |       KCGIDATLEQPKSEFLSFLFRNMCLKTQKKQKVFFWFSVAHDKLFADALERDLKRESLNQ\
10 |       PSTTKPVNEPALSFSYDSSSDKPLYDQLLQHLDSRRPSSTTKSDNSPPKLESENFKDNEL\
11 |       VTVTNQPLLGVGLMDDDAPESPSQINDFIPQKLIIEPNTLELNGLTEETPHDLPKNTAKG\
12 |       RDEEDFPLDYFPVSVEYPTEENAFDPFPPQAFTPAAPSMPISYDNVNERDSMPVNSLLNR\
13 |       YPYQLSVAPTFPVPPSSSRQHFMTNRDFYSSNNNKEKLVSPSDPTSYMKYDEPVMDFDES\
14 |       RPNENCTNAKSHNSGQQTKQHQLYSNNFQQSYPNGMVPGYYPKMPYNPMGGDPLLDQAFY\
15 |       GADDFFFPPEGCDNNMLYPQTATSWNVLPPQAMQPAPTYVGRPYTPNYRSTPGSAMFPYM\
16 |       QSSNSMQWNTAVSPYSSRAPSTTAKNYPPSTFYSQNINQYPRRRTVGMKSSQGNVPTGNK\
17 |       QSVGKSAKISKPLHIKTSAYQKQYKINLETKARPSAGDEDSAHPDKNKEISMPTPDSNTL\
18 |       VVQSEEGGAHSLEVDTNRRSDKNLPDAT"
19 | experiment: 'Mating and invasion efficiency, measured by proxy growth through plasmid count'
20 | transform: 'None'
21 | authour: 'Dorrity et al.'
22 | year: 2018
23 | title: 'Preferences in a trait decision determined by transcription factor variants'
24 | lab: ['Queitsch', 'Fields']
25 | doi: '10.1073/pnas.1805882115'
26 | pmid: '30068600'
27 | url: 'https://www.pnas.org/content/115/34/E7997'
28 | notes: "Use worst of two scores as their effects tend to be position exclusive - one \
29 |         domain governs invasion and one mating function"
30 | input_files:
31 |   - 'pnas.1805882115.sd01.xlsx'
32 |   - 'pnas.1805882115.sd02.xlsx'
33 | source: 'SI'
34 | qc:
35 |   filter: False
36 |   notes:
37 | 


--------------------------------------------------------------------------------
/data/studies/araya_2012_yap1/standardise_araya_2012_yap1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Araya et al. 2012 (YAP1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/araya_2012_yap1/araya_2012_yap1.yaml')
 8 | 
 9 | # Mean over single-variant measurements when there are any, otherwise over sequences
10 | # carrying at most two variants
11 | preferred_mean <- function(values, n_mut){
12 |   ifelse(1 %in% n_mut, mean(values[n_mut == 1], na.rm=TRUE), mean(values[n_mut <= 2], na.rm=TRUE))
13 | }
14 | 
15 | # Strip the wrapper around each hgvs_pro string (bracketed lists lose 3 leading characters and
16 | # the closing bracket, plain entries lose 2 leading characters) and count ';' separated variants
17 | variants <- read_csv('data/studies/araya_2012_yap1/raw/urn_mavedb_00000002-a-2_scores.csv', skip = 4) %>%
18 |   select(hgvs_pro, raw_score = score) %>%
19 |   mutate(hgvs_pro = if_else(str_ends(hgvs_pro, ']'), str_sub(hgvs_pro, start = 4, end = -2), str_sub(hgvs_pro, start = 3)),
20 |          n_mut = str_count(hgvs_pro, ';') + 1)
21 | max_muts <- max(variants$n_mut)
22 | 
23 | # One row per individual substitution, converted to one letter codes with positions shifted by 169
24 | per_substitution <- separate(variants, hgvs_pro, str_c('mut', 1:max_muts), sep = ';', fill = 'right') %>%
25 |   pivot_longer(cols = starts_with('mut'), values_to = 'mut') %>%
26 |   drop_na(mut) %>%
27 |   select(-name) %>%
28 |   tidyr::extract(mut, into = c('wt', 'position', 'mut'), "([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", convert = TRUE) %>%
29 |   mutate(wt = AA_THREE_2_ONE[wt],
30 |          mut = AA_THREE_2_ONE[mut],
31 |          position = position + 169,
32 |          transformed_score = raw_score)
33 | 
34 | dm_data <- group_by(per_substitution, position, wt, mut) %>%
35 |   summarise(transformed_score = preferred_mean(transformed_score, n_mut),
36 |             raw_score = preferred_mean(raw_score, n_mut)) %>%
37 |   ungroup() %>%
38 |   mutate(class = get_variant_class(wt, mut),
39 |          score = normalise_score(transformed_score)) %>%
40 |   drop_na(score) %>% # Some mutant arent found in seqs with <= 2 variants
41 |   arrange(position, mut)
42 | 
43 | # Save output
44 | standardise_study(dm_data, meta$study, meta$transform)
45 | 


--------------------------------------------------------------------------------
/bin/figures/figureS8_27.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Produce figure S8-27 (Subtype Characterisation)
 3 | source('src/config.R')
 4 | source('src/subtype_characterisation.R')
 5 | 
 6 | # Subtype assignment per position joined to the combined scan data
 7 | dms <- full_join(read_tsv('data/subtypes/final_subtypes.tsv'),
 8 |                  read_tsv('data/combined_mutational_scans.tsv'),
 9 |                  by = c('study', 'gene', 'position', 'wt')) %>%
10 |   arrange(study, position)
11 | 
12 | full_characterisation <- full_cluster_characterisation(dms)
13 | 
14 | # Take the 'overall' element of the characterisation plots for each wt group and name them
15 | # figureS8 to figureS27 (this assumes there are exactly 20 wt groups)
16 | figures <- group_by(dms, wt) %>%
17 |   group_map(~plot_full_characterisation(unique(.$cluster), full_characterisation, exclude_outliers = TRUE, global_scale = FALSE)) %>%
18 |   map(extract2, 'overall') %>%
19 |   set_names(str_c('figureS', 8:27))
20 | 
21 | # Save each figure separately in every format
22 | for (fmt in c('pdf', 'png', 'eps', 'tiff')){
23 |   save_plotlist(figures, 'figures/4_figures/', default_format = fmt)
24 | }
25 | 
26 | # Save single PDF version
27 | pdf('figures/4_figures/figureS8_27.pdf', onefile = TRUE, width = 11.7, height = 8.3)
28 | for (name in names(figures)){
29 |   # Strip only the 'figure' prefix so every caption reads 'Figure S<n>'. Taking the last two
30 |   # characters of the name gave 'S8' and 'S9' but lost the S for 10 to 27
31 |   figure_id <- str_remove(name, '^figure')
32 | 
33 |   # Blank unit-square canvas holding the figure, with the caption centred underneath
34 |   p <- ggplot() +
35 |     geom_blank() +
36 |     lims(x=c(0,1), y=c(0,1)) +
37 |     labs(caption = str_c('Figure ', figure_id)) +
38 |     annotation_custom(figures[[name]], xmin = 0, xmax = 1, ymin = 0, ymax = 1) +
39 |     theme(plot.caption = element_text(hjust = 0.5),
40 |           panel.grid.major.y = element_blank(),
41 |           axis.ticks = element_blank(),
42 |           axis.title = element_blank(),
43 |           axis.text = element_blank(),
44 |           plot.margin = unit(c(8, 1, 1, 1), 'mm'))
45 |   print(p)
46 | }
47 | dev.off()
48 | 


--------------------------------------------------------------------------------
/bin/analysis/2_subtypes/characterise_subtypes.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Characterise generated clusters
 3 | source('src/config.R')
 4 | source('src/subtype_characterisation.R')
 5 | library(argparser)
 6 | 
 7 | ### Parse args and setup ###
 8 | parser <- arg_parser(description = 'Characterise AA subtypes', name = 'AA Subtype Characerisation')
 9 | parser <- add_argument(parser, arg = 'subtypes', help = 'Root filename assigning positions to subtypes. Should have a tsv (subtype per position) and rds (list of cluster objects) file.')
10 | parser <- add_argument(parser, arg = '--dms', help = 'Path to DMS data', default = 'data/combined_mutational_scans.tsv')
11 | parser <- add_argument(parser, arg = '--figures', help = 'Directory to save figures', default = '.')
12 | args <- parse_args(parser)
13 | 
14 | # The subtypes root names two files: <root>.tsv and <root>.rds
15 | subtypes <- read_tsv(str_c(args$subtypes, '.tsv'))
16 | clusters <- readRDS(str_c(args$subtypes, '.rds'))
17 | 
18 | # Keep every row of the subtype table and attach the matching DMS data to it
19 | dms <- read_tsv(args$dms) %>%
20 |   left_join(subtypes, ., by = c('study', 'gene', 'position', 'wt'))
21 | 
22 | ### Calculate Profiles for all clusters ###
23 | full_characterisation <- full_cluster_characterisation(dms)
24 | 
25 | # Make profiles with permissive/outlier clusters excluded
26 | outlier_clusters <- filter(full_characterisation$profiles, str_detect(cluster, CLUSTER_PERMISSIVE_RE) | str_detect(cluster, CLUSTER_OUTLIER_RE)) %>%
27 |   pull(cluster) %>%
28 |   unique()
29 | selective_characterisation <- full_cluster_characterisation(filter(dms, !cluster %in% outlier_clusters))
30 | # Count the clusters left in the selective set. This was taken from full_characterisation,
31 | # which counts the excluded clusters too. The value is not used later in this script
32 | n_clusters_selective <- nrow(selective_characterisation$summary)
33 | 
34 | ### Plot all cluster characterisation ###
35 | plots <- plot_cluster_characterisation(full_characterisation, selective_characterisation, clusters)
36 | 
37 | ### Save Plots ###
38 | save_plotlist(plots, args$figures, verbose = 2)
39 | 


--------------------------------------------------------------------------------
/data/studies/kitzman_2015_gal4/kitzman_2015_gal4.yaml:
--------------------------------------------------------------------------------
 1 | study: 'kitzman_2015_gal4' 
 2 | gene: 'GAL4'
 3 | uniprot_id: 'P04386'
 4 | gene_type: 'TF'
 5 | species: 'S. cerevisiae'
 6 | seq: "MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKRSPLTRAHLTEVESR\
 7 |       LERLEQLFLLIFPREDLDMILKMDSLQDIKALLTGLFVQDNVNKDAVTDRLASVETDMPL\
 8 |       TLRQHRISATSSSEESSNKGQRQLTVSIDSAAHHDNSTIPLDFMPRDALHGFDWSEEDDM\
 9 |       SDGLPFLKTDPNNNGFFGDGSLLCILRSIGFKPENYTNSNVNRLPTMITDRYTLASRSTT\
10 |       SRLLQSYLNNFHPYCPIVHSPTLMMLYNNQIEIASKDQWQILFNCILAIGAWCIEGESTD\
11 |       IDVFYYQNAKSHLTSKVFESGSIILVTALHLLSRYTQWRQKTNTSYNFHSFSIRMAISLG\
12 |       LNRDLPSSFSDSSILEQRRRIWWSVYSWEIQLSLLYGRSIQLSQNTISFPSSVDDVQRTT\
13 |       TGPTIYHGIIETARLLQVFTKIYELDKTVTAEKSPICAKKCLMICNEIEEVSRQAPKFLQ\
14 |       MDISTTALTNLLKEHPWLSFTRFELKWKQLSLIIYVLRDFFTNFTQKKSQLEQDQNDHQS\
15 |       YEVKRCSIMLSDAAQRTVMSVSSYMDNHNVTPYFAWNCSYYLFNAVLVPIKTLLSNSKSN\
16 |       AENNETAQLLQQINTVLMLLKKLATFKIQTCEKYIQVLEEVCAPFLLSQCAIPLPHISYN\
17 |       NSNGSAIKNIVGSATIAQYPTLPEENVNNISVKYVSPGSVGPSPVPLKSGASFSDLVKLL\
18 |       SNRPPSRNSPVTIPRSTPSHRSVTPFLGQQQQLQSLVPLTPSALFGGANFNQSGNIADSS\
19 |       LSFTFTNSSNGPNLITTQTNSQALSQPIASSNVHDNFMNNEITASKIDDGNNSKPLSPGW\
20 |       TDQTAYNAFGITTGMFNTTTMDDVYNYLFDDEDTPPNPKKE" 
21 | experiment: 'Coupled growth'
22 | transform: 'log2(x)'
23 | authour: 'Kitzman et al.'
24 | year: 2015
25 | title: 'Massively parallel single amino-acid mutagenesis'
26 | lab: ['Shendure', 'Fields']
27 | doi: '10.1038/nmeth.3223'
28 | pmid: '25559584'
29 | url: 'https://www.nature.com/articles/nmeth.3223'
30 | notes: "Average of A (24h, 40h), B (40h) & C (40h, 64h) reps at different timepoints (since all timepoints correlate)"
31 | input_files:
32 |   - '41592_2015_BFnmeth3223_MOESM306_ESM.xlsx' 
33 | source: 'SI - Supplementary Data'
34 | qc:
35 |   filter: False
36 |   notes:
37 | 


--------------------------------------------------------------------------------
/bin/analysis/2_subtypes/compare_hclust_dynamic_deep_split.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Compare deepSplit values for dynamic tree cutting
 3 | source('src/config.R')
 4 | 
 5 | # Cluster assignments at deepSplit 0 and 1 side by side, joined to the scan data
 6 | dms <- left_join(rename(read_tsv('data/subtypes/hclust_pca_no_sig_dynamic_cos_deep_0_no_permissive.tsv'), cluster_ds0 = cluster),
 7 |                  rename(read_tsv('data/subtypes/hclust_pca_no_sig_dynamic_cos_deep_1_no_permissive.tsv'), cluster_ds1 = cluster),
 8 |                  by = c("study", "gene", "position", "wt")) %>%
 9 |   left_join(read_tsv('data/combined_mutational_scans.tsv'), by = c("study", "gene", "position", "wt")) %>%
10 |   select(cluster_ds0, cluster_ds1, everything())
11 | 
12 | # Get the mean correlation for profiles in a tibble
13 | # dms: grouped tibble with columns A:Y, one row per profile
14 | # method: correlation method passed to cor()
15 | # Returns a tibble with one row per group: cluster (the first grouping key) and mean_cor
16 | get_mean_cor <- function(dms, method='spearman'){
17 |   f <- function(x, ...){
18 |     # Transpose so that each profile is a column, giving a profile x profile correlation matrix
19 |     cors <- tibble_to_matrix(x, A:Y) %>%
20 |       t() %>%
21 |       cor(method = method)
22 | 
23 |     # Average the distinct pairs only. Taking mean() of the triangular matrix also counted the
24 |     # zeroed upper triangle and the unit diagonal, so the value depended on cluster size.
25 |     # A group with a single profile has no pairs and yields NaN
26 |     mean(cors[lower.tri(cors)])
27 |   }
28 |   
29 |   group_map(dms, f) %>%
30 |     unlist() %>%
31 |     tibble(cluster=group_keys(dms)[[1]], mean_cor=.)
32 | }
33 | 
34 | # Mean correlation per cluster at each deepSplit value, without outlier and permissive clusters
35 | mean_cors <- bind_rows(.id = 'deepSplit',
36 |                        "0"=group_by(dms, cluster_ds0) %>% get_mean_cor(),
37 |                        "1"=group_by(dms, cluster_ds1) %>% get_mean_cor()) %>%
38 |   mutate(wt = str_sub(cluster, end = 1)) %>% # first character of the cluster name
39 |   filter(!str_detect(cluster, CLUSTER_OUTLIER_RE), !str_detect(cluster, CLUSTER_PERMISSIVE_RE))
40 | 
41 | p_mean_cors <- ggplot(mean_cors, aes(x = deepSplit, y = mean_cor, colour = wt)) +
42 |   geom_boxplot() +
43 |   geom_point() +
44 |   facet_wrap(~wt) +
45 |   scale_color_manual(values = AA_COLOURS) +
46 |   guides(colour='none') + # 'none' replaces the deprecated guides(colour = FALSE)
47 |   labs(y = expression('Cluster Mean Spearmans'~rho))
48 | # Pass the plot explicitly rather than relying on last_plot()
49 | ggsave('figures/2_subtypes/hclust_dynamic_deep_split_mean_corelation.pdf', p_mean_cors, units = 'cm', width = 15, height = 15)


--------------------------------------------------------------------------------
/data/studies/heredia_2018_ccr5/standardise_heredia_2018_ccr5.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Heredia et al. 2018 (CCR5)
 3 | 
 4 | source('src/config.R')
 5 | source('src/study_standardising.R')
 6 | 
 7 | # Import and process data
 8 | meta <- read_yaml('data/studies/heredia_2018_ccr5/heredia_2018_ccr5.yaml')
 9 | 
10 | # Names for the sheet's columns, including one empty spacer column that is dropped after import
11 | sheet_columns <- c('wt', 'mut', 'reads_l1', 'surface_exp_fitc_l1_r1', 'surface_exp_fitc_l1_r2', 'binding_2d7_l1_r1', 'binding_2d7_l1_r2',
12 |                    'empty', 'reads_l2', 'surface_exp_alexa_l2_r1', 'surface_exp_alexa_l2_r2', 'binding_gp120_cd4_l2_r1', 'binding_gp120_cd4_l2_r2')
13 | 
14 | # Each non-missing wt label is repeated over 21 consecutive rows, then split into the
15 | # residue letter and its position
16 | enrichment <- read_xlsx('data/studies/heredia_2018_ccr5/raw/GSE100368_enrichment_ratios_CCR5.xlsx', skip = 7,
17 |                         col_names = sheet_columns) %>%
18 |   select(-empty) %>%
19 |   mutate(wt = rep(wt[!is.na(wt)], each = 21)) %>%
20 |   tidyr::extract(wt, into = c('wt', 'position'), '([A-Z])([0-9]+)', convert = TRUE)
21 | 
22 | # Average replicates for binding (which also incorporate surface expression)
23 | # replace_na(NA) presumably turns the NaN that rowMeans gives for all-missing rows into NA
24 | replicate_means <- mutate(enrichment,
25 |                           binding_2d7_l1 = rowMeans(select(enrichment, binding_2d7_l1_r1, binding_2d7_l1_r2), na.rm = TRUE) %>% replace_na(NA),
26 |                           binding_gp120_cd4_l2 = rowMeans(select(enrichment, binding_gp120_cd4_l2_r1, binding_gp120_cd4_l2_r2), na.rm = TRUE) %>% replace_na(NA))
27 | 
28 | # Average binding scores for two conditions (0.61 correlation)
29 | scored <- mutate(replicate_means,
30 |                  raw_score = rowMeans(select(replicate_means, binding_2d7_l1, binding_gp120_cd4_l2), na.rm = TRUE) %>% replace_na(NA),
31 |                  transformed_score = raw_score,
32 |                  score = normalise_score(transformed_score),
33 |                  class = get_variant_class(wt, mut))
34 | 
35 | dm_data <- select(scored, position, wt, mut, score, transformed_score, raw_score, class) %>%
36 |   drop_na(score) # not all measured
37 | 
38 | # Save output
39 | standardise_study(dm_data, meta$study, meta$transform)
40 | 


--------------------------------------------------------------------------------
/data/studies/heredia_2018_cxcr4/standardise_heredia_2018_cxcr4.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Heredia et al. 2018 (CXCR4)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/heredia_2018_cxcr4/heredia_2018_cxcr4.yaml')
 8 | 
 9 | # Names for the sheet's columns: read count, then two replicates each of two surface
10 | # expression and two binding measurements
11 | sheet_columns <- c('wt', 'mut', 'reads',
12 |                    'surface_exp_fitc_r1', 'surface_exp_fitc_r2',
13 |                    'binding_12g5_r1', 'binding_12g5_r2',
14 |                    'surface_exp_alexa_r1', 'surface_exp_alexa_r2',
15 |                    'binding_cxcl12_r1', 'binding_cxcl12_r2')
16 | 
17 | # Each non-missing wt label is repeated over 21 consecutive rows, then split into the
18 | # residue letter and its position
19 | enrichment <- read_xlsx('data/studies/heredia_2018_cxcr4/raw/GSE100368_enrichment_ratios_CXCR4.xlsx', skip = 6,
20 |                         col_names = sheet_columns) %>%
21 |   mutate(wt = rep(wt[!is.na(wt)], each = 21)) %>%
22 |   tidyr::extract(wt, into = c('wt', 'position'), '([A-Z])([0-9]+)', convert = TRUE)
23 | 
24 | # Average replicates for binding (which also incorporate surface expression)
25 | # replace_na(NA) presumably turns the NaN that rowMeans gives for all-missing rows into NA
26 | replicate_means <- mutate(enrichment,
27 |                           binding_12g5 = rowMeans(select(enrichment, binding_12g5_r1, binding_12g5_r2), na.rm = TRUE) %>% replace_na(NA),
28 |                           binding_cxcl12 = rowMeans(select(enrichment, binding_cxcl12_r1, binding_cxcl12_r2), na.rm = TRUE) %>% replace_na(NA))
29 | 
30 | # Average binding scores for two conditions (0.48 correlation)
31 | scored <- mutate(replicate_means,
32 |                  raw_score = rowMeans(select(replicate_means, binding_12g5, binding_cxcl12), na.rm = TRUE) %>% replace_na(NA),
33 |                  transformed_score = raw_score,
34 |                  score = normalise_score(transformed_score),
35 |                  class = get_variant_class(wt, mut))
36 | 
37 | dm_data <- select(scored, position, wt, mut, score, transformed_score, raw_score, class) %>%
38 |   drop_na(score) # not all measured
39 | 
40 | # Save output
41 | standardise_study(dm_data, meta$study, meta$transform)
42 | 


--------------------------------------------------------------------------------
/bin/analysis/2_subtypes/sequence_context.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Analyse subtype sequence context
 3 | library(Biostrings)
 4 | source('src/config.R')
 5 | library(ggseqlogo)
 6 | 
 7 | # Subtype assignment per position joined to the combined scan data
 8 | subtypes <- read_tsv('data/subtypes/final_subtypes.tsv')
 9 | scans <- read_tsv('data/combined_mutational_scans.tsv')
10 | dms <- full_join(subtypes, scans, by = c('study', 'gene', 'position', 'wt')) %>%
11 |   arrange(study, position)
12 | 
13 | # Read every file in the FASTA directory and concatenate the sequence sets
14 | # NOTE(review): the repository listing shows FASTA files under meta/fasta - confirm data/fasta exists
15 | fasta_files <- dir('data/fasta', full.names = TRUE)
16 | fasta <- reduce(map(fasta_files, readAAStringSet), c)
17 | 
18 | # Sequence window around every position of one sequence
19 | # seq: a single sequence; window: number of residues taken on each side
20 | # Returns a tibble with position, wt (the residue) and seq_context (2 * window + 1 characters,
21 | # padded with '-' beyond the sequence ends)
22 | extract_seq_context <- function(seq, window=10){
23 |   residues <- as.matrix(seq)[,1]
24 |   padded <- c(rep('-', window), residues, rep('-', window))
25 |   centres <- (window + 1):(length(padded) - window)
26 |   contexts <- map_chr(centres, function(i) str_c(padded[(i - window):(i + window)], collapse = ''))
27 |   tibble(position = 1:length(seq), wt = residues, seq_context = contexts)
28 | }
29 | 
30 | # Windows for every sequence, labelled by the sequence's name in the FASTA set
31 | windows <- bind_rows(lapply(fasta, extract_seq_context), .id = 'geneid')
32 | 
33 | # Consensus matrix of the sequence contexts in one group (used as a group_map callback)
34 | build_profiles <- function(tbl, ...){
35 |   contexts <- AAStringSet(tbl$seq_context)
36 |   consensusMatrix(contexts)
37 | }
38 | 
39 | # Attach a window to each clustered position, then build one profile per cluster
40 | grouped_contexts <- select(dms, cluster, study, gene, position, wt) %>%
41 |   mutate(geneid = gene_to_filename(gene)) %>%
42 |   left_join(windows, by = c('geneid', 'position', 'wt')) %>%
43 |   select(-geneid) %>%
44 |   group_by(cluster)
45 | cluster_contexts <- set_names(group_map(grouped_contexts, build_profiles),
46 |                               group_keys(grouped_contexts)$cluster)
47 | 
48 | # One logo plot per WT amino acid, covering the clusters whose names start with that letter
49 | wt_residues <- sort(unique(dms$wt))
50 | context_plots <- map(wt_residues, function(aa){
51 |   logos <- ggseqlogo(cluster_contexts[str_starts(names(cluster_contexts), aa)], method = 'probability')
52 |   labeled_plot(logos, unit = 'cm', height = 20, width = 30)
53 | })
54 | context_plots <- set_names(context_plots, wt_residues)
55 | 
56 | save_plotlist(context_plots, 'figures/2_subtypes/final_subtypes/sequence_contexts')
57 | 


--------------------------------------------------------------------------------
/data/studies/starita_2013_ube4b/standardise_starita_2013_ube4b.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Starita et al. 2013 (UBE4B)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/starita_2013_ube4b/starita_2013_ube4b.yaml')
 8 | 
 9 | # Mean over single-variant measurements when there are any, otherwise over sequences
10 | # carrying at most three variants
11 | preferred_mean <- function(values, n_mut){
12 |   ifelse(1 %in% n_mut, mean(values[n_mut == 1], na.rm=TRUE), mean(values[n_mut <= 3], na.rm=TRUE))
13 | }
14 | 
15 | # seqID has the form '<positions>-<mutations>', both comma separated lists
16 | variants <- read_xlsx('data/studies/starita_2013_ube4b/raw/sd01.xlsx', na = c('NA', '')) %>%
17 |   filter(!seqID == 'NA-NA') %>% # Filter WT
18 |   rename(raw_score = log2_ratio) %>%
19 |   separate(seqID, into = c('position', 'mut'), sep='-') %>%
20 |   select(-nscor_log2_ratio) %>%
21 |   mutate(n_mut = sapply(position, function(x){str_count(x, ',') + 1}))
22 | max_muts <- max(variants$n_mut)
23 | 
24 | # Spread the lists into numbered columns, lengthen both, and keep only the rows where the
25 | # position index matches the mutation index, giving one row per individual substitution
26 | per_substitution <- separate(variants, mut, str_c('mut', 1:max_muts), sep = ',', fill = 'right') %>%
27 |   separate(position, str_c('position', 1:max_muts), sep = ',', fill = 'right') %>%
28 |   pivot_longer(starts_with('position'), names_to = 'pos_num', names_prefix = 'position', values_to = 'position') %>%
29 |   drop_na(position) %>%
30 |   pivot_longer(starts_with('mut'), names_to = 'mut_num', names_prefix = 'mut', values_to = 'mut') %>%
31 |   drop_na(mut) %>%
32 |   filter(pos_num == mut_num) %>%
33 |   select(-pos_num, -mut_num)
34 | 
35 | dm_data <- group_by(per_substitution, position, mut) %>%
36 |   summarise(raw_score = preferred_mean(raw_score, n_mut)) %>%
37 |   ungroup() %>%
38 |   mutate(transformed_score = raw_score,
39 |          score = normalise_score(transformed_score), 
40 |          position = as.integer(position) + 1072, # tested region starts at +1072 according to Starita (slightly before uniprot UBOX) This does lead to ref seq aligning
41 |          wt = str_split(meta$seq, '')[[1]][position],
42 |          class = get_variant_class(wt, mut)) %>%
43 |   drop_na(score) # Not all variants present in seqs with <= 3 variants
44 | 
45 | # Save output
46 | standardise_study(dm_data, meta$study, meta$transform)
47 | 
48 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/summarise_standardised_data.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Summarise standardised dataset
 3 | source('src/config.R')
 4 | dms <- read_tsv('data/long_combined_mutational_scans.tsv')
 5 | # Histogram of imputed_score per study, filled by whether score is present (Experiment) or missing (Imputed)
 6 | p_score_dist <- ggplot(dms, aes(x=imputed_score, fill=ifelse(!is.na(score), 'Experiment', 'Imputed'))) +
 7 |   facet_wrap(~study, ncol = 4, labeller = as_labeller(sapply(unique(dms$study), format_study, max_width = 28)), scales = 'free') +
 8 |   geom_histogram(bins=30) +
 9 |   labs(x = 'Normalised ER', y='Count') +
10 |   scale_fill_manual(values = c(Experiment='cornflowerblue', Imputed='firebrick2')) +
11 |   guides(fill=guide_legend(title = ''))
12 | ggsave('figures/0_data/standardised_distributions.pdf', p_score_dist, units = 'cm', height = 35, width = 30)
13 | # Per position flags for each annotation source, then the number of positions passing each flag
14 | summary_tbl <- group_by(dms, study, position, wt) %>%
15 |   summarise(fx = sum(!is.na(total_energy)) == 19, # Exactly 19 non-missing total_energy values (presumably one per substitution - confirm)
16 |             sift = all(!is.na(sift)),
17 |             phi = all(!is.na(phi)),
18 |             psi = all(!is.na(psi)),
19 |             sa = all(!is.na(all_atom_abs))) %>%
20 |   ungroup() %>%
21 |   summarise(Total = n(),
22 |             `FoldX Results` = sum(fx),
23 |             `SIFT Results` = sum(sift),
24 |             Phi = sum(phi),
25 |             Psi = sum(psi),
26 |             `Surface Accessibility` = sum(sa)) %>%
27 |   pivot_longer(everything(), names_to = 'metric', values_to = 'Count') %>%
28 |   mutate(metric = factor(metric, levels = metric[order(Count, c(1, rep(0, length(metric) - 1)))])) # Second order arg ensures total is always first
29 | 
30 | p_position_summary <- ggplot(summary_tbl, aes(x = metric, y = Count, fill = metric)) + # Horizontal bar chart of the counts in summary_tbl
31 |   geom_col(width = 0.5) +
32 |   geom_text(aes(label = Count), nudge_y = 500) + # Print each count next to its bar
33 |   coord_flip() +
34 |   guides(fill='none') + # Hide the fill legend ('none' replaces the deprecated fill=FALSE form)
35 |   labs(x='', title = 'Summary of data collected after filtering') +
36 |   theme(panel.grid.major.y = element_blank(),
37 |         axis.ticks.y = element_blank())
38 | ggsave('figures/0_data/position_data_summary.pdf', p_position_summary, units = 'cm', height = 8, width = 15)
39 | 


--------------------------------------------------------------------------------
/data/studies/starita_2013_ube4b/starita_2013_ube4b.yaml:
--------------------------------------------------------------------------------
 1 | study: 'starita_2013_ube4b'
 2 | gene: 'UBE4B'
 3 | domain: 'UBOX'
 4 | uniprot_id: 'Q9ES00'
 5 | gene_type: 'E3 Ubiquitin Ligase'
 6 | species: 'M. musculus'
 7 | seq: "MEELSADEIRRRRLARLAGGQTSQPTTPLTSPQRENPPGPPIAASAPGPSQSLGLNVHNM\
 8 |       TPATSPIGAAGVAHRSQSSEGVSSLSSSPSNSLETQSQSLSRSQSMDIDGVSCEKSMSQV\
 9 |       DVDSGIENMEVDENDRREKRSLSDKEPSSGPEVSEEQALQLVCKIFRVSWKDRDRDVIFL\
10 |       SSLSAQFKQNPKEVFSDFKDLIGQILMEVLMMSTQTRDENPFASLTATSQPIATAARSPD\
11 |       RNLMLNTGSSSGTSPMFCNMGSFSTSSLSSLGASGGASNWDSYSDHFTIETCKETDMLNY\
12 |       LIECFDRVGIEEKKAPKMCSQPAVSQLLSNIRSQCISHTALVLQGSLTQPRSLQQPSFLV\
13 |       PYMLCRNLPYGFIQELVRTTHQDEEVFKQIFIPILQGLALAAKECSLESDYFKYPLMALG\
14 |       ELCETKFGKTHPMCNLVASLPLWLPKSLSPGSGRELQRLSYLGAFFSFSVFAEDDAKVVE\
15 |       KYFSGPAITLENTRVVSQSLQHYLELGRQELFKILHSILLNGETREAALSYMAALVNANM\
16 |       KKAQMQADDRLVSTDGFMLNLLWVLQQLSTKIKLETVDPTYIFHPRCRITLPNDETRINA\
17 |       TMEDVNERLTELYGDQPPFSEPKFPTECFFLTLHAHHLSILPSCRRYIRRLRAIRELNRT\
18 |       VEDLKNNESQWKDSPLATRHREMLKRCKTQLKKLVRCKACADAGLLDESFLRRCLNFYGL\
19 |       LIQLMLRILDPAYPDVTLPLNSEVPKVFAALPEFYVEDVAEFLFFIVQYSPQVLYEPCTQ\
20 |       DIVMFLVVMLCNQNYIRNPYLVAKLVEVMFMTNPSVQPRTQKFFEMIENHPLSTKLLVPS\
21 |       LMKFYTDVEHTGATSEFYDKFTIRYHISTIFKSLWQNIAHHGTFMEEFNSGKQFVRYINM\
22 |       LINDTTFLLDESLESLKRIHEVQEEMKNKEQWDQLPRDQQQARQSQLAQDERVSRSYLAL\
23 |       ATETVDMFHLLTKQVQKPFLRPELGPRLAAMLNFNLQQLCGPKCRDLKVENPEKYGFEPK\
24 |       KLLDQLTDIYLQLDCARFAKAIADDQRSYSKELFEEVISKMRKAGIKSTIAIEKFKLLAE\
25 |       KVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDP\
26 |       FNRQMLTESMLEPVPELKEQIQAWMREKQSSDH" 
27 | experiment: 'Activity'
28 | transform: 'None'
29 | authour: 'Starita et al.'
30 | year: 2013
31 | title: 'Activity-enhancing mutations in an E3 ubiquitin ligase identified by high-throughput mutagenesis'
32 | lab: ['Klevit', 'Fields', 'Shendure', 'Fowler']
33 | doi: '10.1073/pnas.1303309110'
34 | pmid: '23509263'
35 | url: 'http://www.pnas.org/content/110/14/E1263.long'
36 | input_files:
37 |   - 'sd01.xlsx' 
38 | source: 'SI - Dataset S1'
39 | qc:
40 |   filter: False
41 |   notes:
42 | 


--------------------------------------------------------------------------------
/meta/fasta/h_sapiens_brca1.fa:
--------------------------------------------------------------------------------
 1 | >sp|P38398|BRCA1_HUMAN Breast cancer type 1 susceptibility protein OS=Homo sapiens OX=9606 GN=BRCA1 PE=1 SV=2
 2 | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQ
 3 | CPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKD
 4 | EVSIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYI
 5 | ELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEISLDSAKKAACEFSETDVTNTEHHQ
 6 | PSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVE
 7 | KAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPC
 8 | SENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVD
 9 | EYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTEN
10 | LIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTE
11 | QNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNI
12 | HNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPV
13 | RHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKE
14 | FVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQ
15 | ESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHS
16 | RETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVT
17 | FECEQKEENQGKNESNIKPVQTVNITAGFPVVGQKDKPVDNAKCSIKGGSRFCLSSQFRG
18 | NETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNENIP
19 | STVSTISRNNIRENVFKEASSSNINEVGSSTNEVGSSINEIGSSDENIQAELGRNRGPKL
20 | NAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEYEEVVQTVNTDFSPYLISDNLEQPMGSS
21 | HASQVCSETPDDLLDDGEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTHTHLAQ
22 | GYRRGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPSQSTRHSTVATECLSKNTEENL
23 | LSLKNSLNDCSNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANTNTQDPFLIGS
24 | SKQMRHQSESQGVGLSDKELVSDDEERGTGLEENNQEEQSMDSNLGEAASGCESETSVSE
25 | DCSGLSSQSDILTTQQRDTMQHNLIKLQQEMAELEAVLEQHGSQPSNSYPSIISDSSALE
26 | DLRNPEQSTSEKAVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSKNKEPGVERSSPSK
27 | CPSLDDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEESGPHDLTETSYLPRQDLEG
28 | TPYLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTT
29 | DTAGYNAMEESVSREKPELTASTERVNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLI
30 | TEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDV
31 | VNGRNHQGPKRARESQDRKIFRGLEICCYGPFTNMPTDQLEWMVQLCGASVVKELSSFTL
32 | GTGVHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDTYLIPQIPH
33 | SHY
34 | 
35 | 


--------------------------------------------------------------------------------
/src/pymol_utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Library for visualising data in PyMOL, particularly projecting arbitrary values onto proteins.
 3 | """
 4 | from itertools import cycle
 5 | 
 6 | from colour_spectrum import ColourSpectrum
 7 | import subtypes_utils as su
 8 | 
 9 | def quick_highlight(cmd, dms, gene, factor='cluster_num', res=None, root='data/pdb'):
10 |     """
11 |     Load a new protein and highlight a given factor.
12 |     """
13 |     cmd.delete('all')
14 |     cmd.load(f'{root}/{su.gene_to_filename(gene)}.pdb')
15 | 
16 |     if res:
17 |         pdb_dms = dms[(dms.gene == gene) & (dms.wt == res)]
18 |     else:
19 |         pdb_dms = dms[dms.gene == gene]
20 |     pdb_dms = pdb_dms.dropna(subset=['pdb_position'])
21 | 
22 |     colourer = su.SubtypesColourMap.lookup_map(factor, dms[factor])
23 |     project_landscape(cmd, pdb_dms.pdb_chain, pdb_dms.pdb_position, pdb_dms[factor], colourer)
24 |     return pdb_dms
25 | 
26 | def project_landscape(cmd, chain, position, value, colourer=None, na_colour=None):
27 |     """
28 |     Colour specific residues according to a colourmap. colourer must return a Hexcode
29 |     when called with a value as well as have an 'na_colour' attribute if no na_colour
30 |     is specifically supplied. Chain can either be a single identifier (str) or an
31 |     iterable of identifiers
32 |     """
33 |     if colourer is None:
34 |         colourer = ColourSpectrum(min(value), max(value), colourmap='viridis')
35 | 
36 |     if isinstance(chain, str):
37 |         chain = cycle([chain])
38 | 
39 |     colour_residues(cmd, *zip(chain, position, [colourer(val) for val in value]),
40 |                     base_colour=na_colour or colourer.na_colour)
41 | 
42 | def colour_residues(cmd, *args, base_colour=None):
43 |     """
44 |     Colour multiple residues programatically. Each argument should be a
45 |     (chain, position index, hex code) tuple
46 |     """
47 |     if base_colour is not None:
48 |         cmd.color(base_colour, 'polymer')
49 | 
50 |     for chn, pos, col in args:
51 |         pos = int(pos)
52 |         pos = f'\{pos}' if pos < 0 else pos # Negative indices must be escaped
53 |         cmd.color(col, f'polymer and chain {chn} and resi {pos}')
54 | 


--------------------------------------------------------------------------------
/meta/README.md:
--------------------------------------------------------------------------------
 1 | # Meta Data folder
 2 | 
 3 | This folder contains various bits of metadata required to execute the project
 4 | and a few summary tables output by the pipeline.
 5 | 
 6 | Metadata:
 7 | * **structures.yaml** - details the SwissMODEL structures chosen for the studied proteins. 
 8 |   Each gene is linked to a SwissMODEL template ID, the type of structure it is (e.g. 
 9 |   x-ray, homology model etc.) and a list of sections to use. Each section is defined by
10 |   a chain and region, with an optional offset for sequence numbering compared to Uniprot.
11 |   This manual system could theoretically be replaced by automated model selection via the
12 |   SwissMODEL API, but the complexity of the choice and the small number of proteins made 
13 |   manual selection easier. This file is used downstream to select regions of the PDB to
14 |   process (e.g. with FoldX and Naccess) and to convert results to Uniprot sequence numbering.
15 |   If a new study is added for a new protein a model must be chosen and added here.
16 | * **study\_template.yaml** - Template YAML file for adding new studies to the project
17 | * **subtypes/** - folder containing YAML config files for the various clustering approaches 
18 |   to extracting amino acid subtypes.
19 | * **residue\_hydrophobicity.tsv** - Average amino acid hydrophobicity table, sourced from 
20 |   [Bandyopadhyay & Mehler (2008)](https://onlinelibrary.wiley.com/doi/full/10.1002/prot.21958)
21 | * **fasta/** - can contain .fa files of the form {species/strain}\_{gene}.fa that act as 
22 |   master copies when validating study config files. Not required for normal execution.
23 |   In general the fasta files are sourced from Uniprot, although some come from the 
24 |   studies directly or have manual edits based on the mutations used in a study.
25 | 
26 | Generated Files:
27 | * **overall\_summary** - Summary of the project overall, giving the total number of 
28 |   studies, genes, etc. processed.
29 | * **gene\_summary.tsv** - Table summarising the properties of the genes included in the
30 |   project, including stats such as the number of mutants and mutant coverage.
31 | * **study\_summary.tsv** - Table summarising the properties of the studies included in 
32 |   the project.
33 | 


--------------------------------------------------------------------------------
/docs/combined_mutational_scans_readme.txt:
--------------------------------------------------------------------------------
 1 | # Combined Mutational Scans Dataset EV2
 2 | 
 3 | Full deep landscape dataset, containing normalised ER scores and additional annotations for all positions
 4 | 
 5 | ## Columns
 6 | 
 7 | study: Study the position comes from
 8 | gene: Gene
 9 | position: Position in gene (relative to canonical Uniprot sequence)
10 | wt: Wild-type amino acid
11 | A-Y: Normalised ER score when the position is mutated to each amino acid
12 | log10_sift_A-Y: Log10 SIFT4G score for each substitution at this position
13 | mean_score: Mean normalised ER score across all substitutions
14 | mean_sift: Mean log10 SIFT4G score across all substitutions
15 | total_energy: Mean FoldX ddG prediction (kJ/mol) across all substitutions
16 | backbone_hbond-energy_ionisation: Mean prediction for each energy term in FoldX's force field model
17 | phi: Phi backbone angle
18 | psi: Psi backbone angle
19 | all_atom_abs: All atom absolute surface accessibility as calculated by Naccess 
20 | all_atom_rel: All atom relative surface accessibility as calculated by Naccess 
21 | side_chain_abs: Side chain absolute surface accessibility as calculated by Naccess 
22 | side_chain_rel: Side chain relative surface accessibility as calculated by Naccess 
23 | backbone_abs: Backbone atom absolute surface accessibility as calculated by Naccess 
24 | backbone_rel: Backbone atom relative surface accessibility as calculated by Naccess 
25 | non_polar_abs: Non-polar absolute surface accessibility as calculated by Naccess 
26 | non_polar_rel: Non-polar relative surface accessibility as calculated by Naccess 
27 | polar_abs: Polar absolute surface accessibility as calculated by Naccess 
28 | polar_rel: Polar relative  surface accessibility as calculated by Naccess 
29 | within_10_0_A-Y: Count of each amino acid within a 10 angstrom sphere centered at the position
30 | angstroms_to_A-Y: Distance to each amino acid, in angstroms
31 | ss_g-t: Porter5 predictions for each DSSP secondary structure class
32 | hydrophobicity: Hydrophobicity score from Bandyopadhyay and Mehler (2008)
33 | PC1-20: Principal component projection of the normalised ER score profile
34 | tSNE1-2: tSNE coordinates of the normalised ER score profile
35 | umap1-2: UMAP projection of the normalised ER score profile
36 | 


--------------------------------------------------------------------------------
/bin/utils/farm_sync.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # TODO: Convert to python script with more convenient behaviour
 3 | # Script to sync between a local and remote directory(s)
 4 | #
 5 | # Setup for a given project folder using the config below
 6 | #
 7 | # Will look for global and local .rsync_exclude/.rsync_include files
 8 | # .rsync_exclude in $HOME is assumed to be global prefs
 9 | # Manual include overrides all excludes (NOTE(review): the global exclude file is passed to rsync before the include file - confirm rsync's rule ordering gives this behaviour)
10 | #
11 | # Sync only desired folders based on args
12 | 
13 | ## Config ##
14 | project_name="Amino Acid Subtypes Project"
15 | local_dir=$HOME/phd/subtypes
16 | remote_dirs=( "ebi:/hps/research1/beltrao/ally/subtypes" )
17 | folders=( "data" "meta" "figures" "docs" "logs" )
18 | 
19 | ## Colours for printf ##
20 | green=$(tput setaf 2)
21 | magenta=$(tput setaf 5)
22 | bold=$(tput bold)
23 | normal=$(tput sgr0)
24 | 
25 | ## Check for presence of include/exclude files ##
26 | rsync_options=( -a -u -h )
27 | 
28 | if [ -e "$HOME/.rsync_exclude" ]; then
29 |    rsync_options+=( --exclude-from "$HOME/.rsync_exclude" )
30 | fi
31 | 
32 | if [ -e "$local_dir/.rsync_include" ]; then # A local include file takes precedence: the local exclude file is then not used
33 |    rsync_options+=( --include-from "$local_dir/.rsync_include" --exclude"="'*')
34 | elif [ -e "$local_dir/.rsync_exclude" ]; then
35 |    rsync_options+=( --exclude-from "$local_dir/.rsync_exclude" )
36 | fi
37 | 
38 | ## Sync function: show a dry run of $1 -> $2, then transfer only if the user answers y/Y ##
39 | syncr () {
40 |    rsync -v --dry-run "${rsync_options[@]}" "$1" "$2"
41 | 
42 |    read -p "Transfer? " -n 1 -r
43 |    echo
44 |    if [[ $REPLY =~ ^[Yy]$ ]]
45 |    then
46 |       rsync "${rsync_options[@]}" "$1" "$2"
47 |    fi
48 | }
49 | 
50 | ## Override folders if argument passed ##
51 | if [ $# -ne 0 ]; then
52 |    folders=( "$@" )
53 | fi
54 | 
55 | ## Perform sync: for each confirmed remote, sync every folder local -> remote and then remote -> local ##
56 | printf "%s\n" "${magenta}${bold}Rsyncing $project_name${normal}"
57 | for r in "${remote_dirs[@]}"; do
58 |    read -p "Sync to remote: $r? " -n 1 -r
59 |    echo
60 |    if [[ $REPLY =~ ^[Yy]$ ]]; then
61 |       for f in "${folders[@]}"; do
62 |          printf "\n%s\n%s\n" "${green}${bold}Folder: $f${normal}" "${green}Local -> Remote${normal}"
63 |          syncr "$local_dir/$f/" "$r/$f"
64 |          printf "\n%s\n" "${green}Local <- Remote${normal}"
65 |          syncr "$r/$f/" "$local_dir/$f"
66 |       done
67 |    fi
68 | done
69 | 
70 | 
71 | 


--------------------------------------------------------------------------------
/docs/foldx_eqn.md:
--------------------------------------------------------------------------------
 1 | # Force Field
 2 | ## Overall equation:
 3 | ΔG = Wvdw \* ΔGvdw + WsolvH \* ΔGsolvH +  WsolvP \* ΔGsolvP + ΔGwb + ΔGhbond + ΔGel + ΔGKon + Wmc \* T \* ΔSmc + Wsc \* T \* ΔSsc
 4 | 
 5 | Basically:
 6 | ΔG = ΔH - TΔS
 7 | 
 8 | ## Terms:
 9 | * ΔGvdw = Sum of van der Waals contributions of all atoms with respect to the same interactions with solvent
10 | * ΔGsolvH = Difference in solvation energy for apolar groups 
11 | * ΔGsolvP = Difference in solvation energy for polar groups
12 | * ΔGwb = Extra stabilising energy for H2O making 2 h-bonds to protein
13 | * ΔGhbond = Energy change from forming h-bonds within the protein rather than with solvent 
14 | * ΔGel = Electrostatic contribution of charged groups, including helix dipole
15 | * ΔGKon = Electrostatic effect on association constant k[on]
16 | * T = Temperature (K) 
17 | * ΔSmc = Entropy of fixing main chain in conformation
18 | * ΔSsc = Entropy of fixing side chains in conformation
19 | * Wxxx = Weighting terms, all 1 apart from Wvdw = 0.33
20 | 
21 | # BuildModel output terms
22 | From: http://foldxsuite.crg.eu/command/BuildModel
23 | 
24 | * Total Energy - predicted overall stability
25 | * Backbone Hbond - contribution of backbone Hbonds
26 | * Sidechain Hbond - contribution of sidechain-sidechain and sidechain-backbone Hbonds
27 | * Van der Waals - contribution of the VanderWaals
28 | * Electrostatics - electrostatic interactions
29 | * Solvation Polar - penalization for burying polar groups
30 | * Solvation Hydrophobic - contribution of hydrophobic groups
31 | * Van der Waals clashes - energy penalization due to interresidue VanderWaals’ clashes
32 | * Entropy Side Chain - entropy cost of fixing the side chain
33 | * Entropy Main Chain - entropy cost of fixing the main chain
34 | * Sloop Entropy - ONLY FOR ADVANCED USERS
35 | * Mloop Entropy - ONLY FOR ADVANCED USERS
36 | * Cis Bond - cost of having a cis peptide bond
37 | * Torsional Clash - intraresidue VanderWaals’ torsional clashes
38 | * Backbone Clash - backbone-backbone VanderWaals. *not considered in the total*
39 | * Helix Dipole - electrostatic contribution of helix dipole
40 | * Water Bridge - water bridges
41 | * Disulfide - disulfide bonds
42 | * Electrostatic Kon - electrostatic interaction between molecules in the precomplex
43 | * Partial Covalent Bonds - interactions with bound metals
44 | * Energy Ionisation - ionisation energy
45 | * Entropy Complex - entropy cost of forming a complex
46 | 


--------------------------------------------------------------------------------
/bin/analysis/2_subtypes/all_positions.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Perform clustering on all positions at once
 3 | source('src/config.R')
 4 | source('src/subtype_characterisation.R')
 5 | 
 6 | dms <- read_tsv('data/combined_mutational_scans.tsv')
 7 | 
 8 | ### Make Dynamic Hclust Clusters ###
 9 | clusters <- make_dynamic_hclust_clusters(dms, PC2:PC20, distance_method = 'cosine', treecut_args = list(deepSplit=0)) # Cluster on PC2:PC20 with cosine distance
10 | dms <- mutate(clusters$tbl,
11 |               cluster = str_c('X', cluster) %>% relabel_outlier_clusters(), # Prefix cluster IDs with 'X' before relabelling outliers
12 |               cluster = factor(cluster, levels = sort_clusters(unique(cluster)))) %>%
13 |   select(cluster, everything())
14 | 
15 | ### Analyse clusters ###
16 | n_clusters <- n_distinct(dms$cluster)
17 | plots <- list()
18 | # UMAP coordinates coloured by wild-type amino acid, one facet per cluster
19 | plots$umap <- (ggplot(dms, aes(x=umap1, y=umap2, colour=wt)) +
20 |                  facet_wrap(~cluster) +
21 |                  scale_colour_manual(values = AA_COLOURS) +
22 |                  geom_point() +
23 |                  labs(x = 'UMAP1', y = 'UMAP2')) %>%
24 |   labeled_plot(units='cm', height=25, width=25)
25 | # Same layout using the tSNE coordinates
26 | plots$tsne <- (ggplot(dms, aes(x=tSNE1, y=tSNE2, colour=wt)) +
27 |                  facet_wrap(~cluster) +
28 |                  geom_point() +
29 |                  scale_colour_manual(values = AA_COLOURS)) %>%
30 |   labeled_plot(units='cm', height=25, width=25)
31 | # Silhouette plot on the A:Y profiles; figure height grows with the number of clusters
32 | plots$silhouette <- labeled_plot(plot_silhouette(dms, A:Y, 'cosine'),
33 |                                  units='cm', height = n_clusters*0.33 + 2, width = 15, limitsize=FALSE)
34 | 
35 | # Count of each wild-type amino acid per cluster and its proportion within that cluster
36 | cluster_occupancy <- group_by(dms, cluster, wt) %>%
37 |   tally() %>%
38 |   mutate(rel_n = n / sum(n)) # tally() leaves the data grouped by cluster, so sum(n) is the cluster total
39 | plots$cluster_occupancy <- filter(cluster_occupancy, !str_ends(cluster, '0')) %>% # Drop clusters whose label ends in '0' (presumably the outlier clusters - confirm)
40 |   ggplot(aes(x = wt, y = cluster, fill = rel_n)) +
41 |   geom_raster() +
42 |   coord_equal() + 
43 |   guides(fill = guide_colourbar(title = 'Proportion')) +
44 |   scale_fill_distiller(type = 'seq', palette = 'Reds', direction = 1) +
45 |   theme(axis.ticks = element_blank(),
46 |         axis.title = element_blank(),
47 |         panel.grid.major.y = element_blank())
48 | 
49 | ### Save results (cluster assignments as TSV, full clustering object as RDS, then all plots) ###
50 | write_tsv(select(dms, cluster, study, gene, position, wt), 'data/subtypes/all_positions.tsv')
51 | saveRDS(clusters, 'data/subtypes/all_positions.rds')
52 | 
53 | root <- 'figures/2_subtypes/all_positions'
54 | dir.create(root, recursive = TRUE, showWarnings = FALSE) # Create missing parent directories and stay quiet when the directory already exists
55 | save_plotlist(plots, root)
56 | 


--------------------------------------------------------------------------------
/bin/figures/figureS29.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Figure S29 (SIFT subtypes)
 3 | source('src/config.R')
 4 | 
 5 | dms <- left_join(read_tsv('data/subtypes/final_subtypes.tsv'), # Final subtypes joined to the SIFT-based clusters and then to the per-position scan data
 6 |                   read_tsv('data/subtypes/sift_scores.tsv') %>% rename(sift_cluster = cluster),
 7 |                   by = c('study', 'gene', 'position', 'wt')) %>%
 8 |   left_join(read_tsv('data/combined_mutational_scans.tsv'), by = c('study', 'gene', 'position', 'wt'))
 9 | # Per wt amino acid: mean A:Y profile of each non-outlier cluster, then the row means of cosine_distance_matrix() over those profiles
10 | dms_dist <- filter(dms, !str_detect(cluster, CLUSTER_OUTLIER_RE)) %>%
11 |   group_by(wt, cluster) %>%
12 |   summarise_at(vars(A:Y), mean) %>%
13 |   group_map(~tibble_to_matrix(., A:Y) %>% 
14 |               cosine_distance_matrix() %>% 
15 |               rowMeans() %>% 
16 |               set_names(.x$cluster)) %>%
17 |   unlist()
18 | # Same calculation for the SIFT-based clusters, using the log10_sift columns as the profile
19 | sift_dist <- filter(dms, !str_detect(sift_cluster, CLUSTER_OUTLIER_RE)) %>%
20 |   group_by(wt, sift_cluster) %>%
21 |   summarise_at(vars(starts_with('log10_sift')), mean) %>%
22 |   group_map(~tibble_to_matrix(., starts_with('log10_sift')) %>% 
23 |               cosine_distance_matrix() %>% 
24 |               rowMeans() %>% 
25 |               set_names(.x$sift_cluster)) %>%
26 |   unlist()
27 | # Stack both sets of distances into one long table labelled by type ('dms' or 'sift')
28 | cosine <- tibble(type=c(rep('dms', length(dms_dist)), rep('sift', length(sift_dist))),
29 |                  cluster=c(names(dms_dist), names(sift_dist)),
30 |                  cosine=c(dms_dist, sift_dist) %>% unname())
31 | # Boxplot of the two distance sets with an unpaired stat_compare_means() comparison of dms vs sift
32 | figure <- ggplot(cosine, aes(x = type, y = cosine, fill = type)) +
33 |   geom_boxplot(show.legend = FALSE) +
34 |   stat_compare_means(paired = FALSE, comparisons = list(c('dms', 'sift'))) +
35 |   scale_x_discrete(name = 'Profiles used for clustering', labels = c(dms='ER', sift='log<sub>10</sub>SIFT4G')) +
36 |   scale_y_continuous(name = 'Mean Cosine Distance', limits = c(0, 0.6)) +
37 |   scale_fill_manual(values = c(dms='firebrick2', sift='cornflowerblue')) +
38 |   theme(axis.text.x = element_markdown(),
39 |         axis.ticks.x = element_blank())
40 | # Save the figure as pdf, png, tiff and eps
41 | ggsave('figures/4_figures/figureS29.pdf', figure, width = 120, height = 120, units = 'mm')
42 | ggsave('figures/4_figures/figureS29.png', figure, width = 120, height = 120, units = 'mm')
43 | ggsave('figures/4_figures/figureS29.tiff', figure, width = 120, height = 120, units = 'mm')
44 | ggsave('figures/4_figures/figureS29.eps', figure, width = 120, height = 120, units = 'mm', device=cairo_ps, fallback_resolution = 600)
45 | 
46 | 


--------------------------------------------------------------------------------
/data/studies/dorrity_2018_ste12/standardise_dorrity_2018_ste12.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Dorrity et al. 2018 (STE12)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import study metadata and the mating assay data (sd01)
 7 | meta <- read_yaml('data/studies/dorrity_2018_ste12/dorrity_2018_ste12.yaml')
 8 | mating_data <- read_xlsx('data/studies/dorrity_2018_ste12/raw/pnas.1805882115.sd01.xlsx') %>%
 9 |   mutate(mut = sapply(seqID, process_split_seqid, USE.NAMES = FALSE),
10 |          nmut = str_count(mut, ',') + 1, # Number of comma-separated mutations
11 |          mating_avg = rowMeans(select(., mating_30C_rep1, mating_30C_rep2, mating_30C_rep3), na.rm = TRUE) %>% replace_na(NA)) %>% # Mean of the three 30C replicates, ignoring NAs
12 |   select(mut, nmut, mating_avg, starts_with('mating_30C_'))
13 | # Same processing for the invasion assay data (sd02)
14 | invasion_data <- read_xlsx('data/studies/dorrity_2018_ste12/raw/pnas.1805882115.sd02.xlsx') %>%
15 |   mutate(mut = sapply(seqID, process_split_seqid, USE.NAMES = FALSE),
16 |          nmut = str_count(mut, ',') + 1,
17 |          invasion_avg = rowMeans(select(., invasion_30C_rep1, invasion_30C_rep2, invasion_30C_rep3), na.rm = TRUE) %>% replace_na(NA)) %>%
18 |   select(mut, nmut, invasion_avg, starts_with('invasion_30C_'))
19 | # Combine both assays and reshape to one row per substitution
20 | dm_data <- full_join(mating_data, invasion_data, by = c('mut', 'nmut')) %>%
21 |   
22 |   # Take worst of two scores (both are essential functions)
23 |   mutate(raw_score = pmin(mating_avg, invasion_avg, na.rm = TRUE)) %>%
24 |   select(mut, nmut, raw_score) %>%
25 |   separate(mut, into = str_c('mut', 1:max(.$nmut)), sep=',', fill = 'right') %>%
26 |   pivot_longer(starts_with('mut'), names_to = 'tmp', values_to = 'mut') %>%
27 |   select(-tmp) %>%
28 |   drop_na(mut) %>%
29 |   tidyr::extract(mut, into = c('position', 'mut'), '([0-9]*)([A-Z*])', convert = TRUE) %>% # Split e.g. '12A' into position 12 and mut 'A'
30 |   mutate(position = position + 140, # Offset of 140 (NOTE(review): presumably aligns positions to meta$seq numbering - confirm)
31 |          wt = str_split(meta$seq, '')[[1]][position]) %>%
32 | 
33 |   # Take value from a single variant if possible, average over multiple mutations otherwise (<=4 gives 97% coverage without using very heavily mutated seqs)
34 |   group_by(position, wt, mut) %>%
35 |   summarise(raw_score = ifelse(any(nmut == 1), mean(raw_score[nmut == 1], na.rm = TRUE), mean(raw_score[nmut <= 4], na.rm = TRUE))) %>%
36 |   ungroup() %>%
37 |   mutate(transformed_score = raw_score,
38 |          score = normalise_score(transformed_score),
39 |          class = get_variant_class(wt, mut)) %>%
40 |   drop_na(position, raw_score) # Some muts aren't found in seqs with <= 4 variants
41 | 
42 | # Save output
43 | standardise_study(dm_data, meta$study, meta$transform)
44 | 


--------------------------------------------------------------------------------
/data/studies/giacomelli_2018_tp53/standardise_giacomelli_2018_tp53.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Giacomelli et al. (TP53)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import study metadata and raw counts, then derive one fitness score per substitution
 7 | meta <- read_yaml('data/studies/giacomelli_2018_tp53/giacomelli_2018_tp53.yaml')
 8 | dm_data <- read_xlsx('data/studies/giacomelli_2018_tp53/raw/41588_2018_204_MOESM5_ESM.xlsx', skip=1) %>%
 9 |   rename_all(str_to_lower) %>%
10 |   rename(wt = wt_aa, mut = vt_aa) %>%
11 |   mutate(mut = if_else(mut == 'Z', '*', mut), # Recode 'Z' as '*' (presumably stop codons - confirm)
12 |          wt = if_else(wt == 'Z', '*', wt)) %>%
13 |   filter(a549_p53wt_early_time_point_experiment_1 > 0, # Require non-zero early time point counts in both p53wt experiments
14 |          a549_p53wt_early_time_point_experiment_2 > 0) %>%
15 |   
16 |   # Transform to frequencies with pseudocount (the smallest positive value in each column)
17 |   mutate_at(vars(starts_with('a549')), .funs = ~(. + min(.[. > 0], na.rm = TRUE))/sum(., na.rm = TRUE)) %>%
18 |   
19 |   # Extract columns for the combination of states
20 |   pivot_longer(starts_with('a549'), names_to = 'id', values_to = 'count') %>%
21 |   tidyr::extract(id, into = c('p53', 'state', 'experiment'), "a549_p53(wt|null)_(early_time_point|nutlin-3|etoposide)_experiment_([12])") %>%
22 |   pivot_wider(names_from = state, values_from = count) %>%
23 |   rename(nutlin3 = `nutlin-3`, initial_freq = early_time_point) %>%
24 |   pivot_longer(c('nutlin3', 'etoposide'), names_to = 'drug', values_to = 'freq') %>%
25 |   drop_na(freq) %>%
26 |   
27 |   # Determine ER and fitness for each combination
28 |   mutate(er = freq / initial_freq) %>%
29 |   select(position, wt, mut, wt_codon, vt_codon, variant_group, p53, experiment, drug, initial_freq, freq, er) %>%
30 |   group_by(p53, experiment, drug) %>%
31 |   mutate(fitness = log2(er / mean(er[variant_group == 'BackboneWt'], na.rm = TRUE))) %>% # log2 ER relative to the mean ER of 'BackboneWt' variants in the same group
32 |   
33 |   # Average codons and then experiments
34 |   group_by(position, wt, mut, p53, experiment, drug) %>%
35 |   summarise(raw_score = weighted.mean(fitness, initial_freq, na.rm = TRUE)) %>% # Codon-level fitness weighted by initial frequency
36 |   group_by(position, wt, mut, p53, drug) %>%
37 |   summarise(raw_score = mean(raw_score, na.rm = TRUE)) %>%
38 |   ungroup() %>%
39 |   
40 |   # Select appropriate experiment (p53 NULL, Etoposide selects for functional p53), others test other functions that could possibly be integrated carefully
41 |   filter(p53 == 'null', drug == 'etoposide') %>%
42 |   
43 |   mutate(transformed_score = raw_score,
44 |          score = normalise_score(transformed_score), 
45 |          class = get_variant_class(wt, mut)) %>%
46 |   select(position, wt, mut, score, transformed_score, raw_score, class)
47 |   
48 | 
49 | # Save output
50 | standardise_study(dm_data, meta$study, meta$transform)
51 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/sift_correlation.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Calculate the correlation between study scores and SIFT results
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | sift_dir <- 'data/sift/' # NOTE(review): not referenced again in this script - confirm whether import_sift() relies on it
 7 | study_dirs <- dir('data/studies', full.names = TRUE)
 8 | # Import every study directory (requesting the 'gene' field) and stack the results
 9 | dms <- lapply(study_dirs, import_study, fields = c('gene')) %>%
10 |   bind_rows()
11 | # Import SIFT results per gene; the list names become the gene column
12 | sift <- sapply(unique(dms$gene), import_sift, simplify = FALSE) %>%
13 |   bind_rows(.id = 'gene') 
14 | 
15 | dms <- left_join(dms, sift, by = c('gene', 'position', 'wt', 'mut'))
16 | # Kendall and Pearson correlation tests of score against log10_sift for each study
17 | sift_correlations <- bind_rows(group_by(dms, study) %>% 
18 |                                  do(tidy(cor.test(.$score, .$log10_sift, method = 'kendall'))),
19 |                                group_by(dms, study) %>% 
20 |                                  do(tidy(cor.test(.$score, .$log10_sift, method = 'pearson')))) %>%
21 |   mutate(study_pretty = sapply(study, format_study, USE.NAMES = FALSE),
22 |          p_cat = pretty_p_values(p.value, breaks = c(1e-48, 1e-12, 1e-06, 1e-3, 0.01, 0.05)))
23 | # Label colour per study: 'red' when the study YAML has qc$filter set, 'black' otherwise
24 | filtered <- sapply(unique(sift_correlations$study), function(x){
25 |   y <- read_yaml(str_c('data/studies/', x, '/', x, '.yaml'))
26 |   return(ifelse(y$qc$filter, 'red', 'black'))
27 | })
28 | names(filtered) <- sapply(names(filtered), format_study, USE.NAMES = FALSE)
29 | # Bar chart of the correlation estimates per study, one facet per method, filled by p-value category
30 | p_sift_cor <- ungroup(sift_correlations) %>%
31 |   mutate(study_pretty = add_markdown(study_pretty, colour = filtered)) %>%
32 |   ggplot(aes(x = study_pretty, y = estimate, fill = p_cat)) +
33 |   facet_wrap(~method, ncol = 1) +
34 |   geom_col(position = position_dodge()) +
35 |   geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=0.5, position = position_dodge(0.9)) +
36 |   geom_hline(yintercept = 0) +
37 |   ggtitle('Correlation between Normalised Score and log10(SIFT)') +
38 |   xlab('') +
39 |   ylab('Correlation') +
40 |   scale_fill_viridis_d(guide=guide_legend(title='p-value'), drop=FALSE) +
41 |   theme(axis.text.x = element_markdown(angle = 90, hjust = 1, vjust = 0.5))
42 | ggsave('figures/0_data/sift_score_correlation.pdf', p_sift_cor, width = 20, height = 20, units = 'cm')
43 | # Per-study score vs jittered log10_sift: density contours for Missense variants, points for all other classes
44 | p_sift_density <- ggplot(dms, aes(x = score, y = jitter(log10_sift, 0.1), colour = class)) +
45 |   facet_wrap(~study, labeller = as_labeller(sapply(unique(dms$study), format_study)), ncol = 6, scales = 'free_x') +
46 |   geom_density2d(data = filter(dms, class == 'Missense')) +
47 |   geom_point(data = filter(dms, !class == 'Missense')) +
48 |   scale_colour_manual(values = MUT_CLASS_COLOURS) +
49 |   labs(x = 'Score', y = 'log10(SIFT)')
50 | ggsave('figures/0_data/sift_score_density.pdf', p_sift_density, width = 35, height = 35, units = 'cm')
51 | 


--------------------------------------------------------------------------------
/bin/figures/figureS4.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Produce figure S4 (Consistency of positions in UMAP space)
 3 | source('src/config.R')
 4 | source('src/subtype_characterisation.R')
 5 | 
 6 | dms <- read_tsv('data/combined_mutational_scans.tsv')
 7 | dms <- left_join(dms, count(dms, gene, position), by = c('gene', 'position'))
 8 | 
 9 | repeated <- filter(dms, n > 1) %>%
10 |   select(study, gene, position, umap1, umap2) %>%
11 |   group_by(gene, position) %>%
12 |   summarise(umap1_1 = umap1[1],
13 |             umap1_2 = umap1[2],
14 |             umap2_1 = umap2[1],
15 |             umap2_2 = umap2[2])
16 | 
17 | p_umap <- ggplot() +
18 |   geom_point(data = dms, mapping = aes(x = umap1, y = umap2), colour = 'grey90', shape = 20, size = 0.8) +
19 |   geom_segment(data = repeated, aes(x = umap1_1, y = umap2_1, xend = umap1_2, yend = umap2_2)) +
20 |   geom_point(data = filter(dms, n > 1), mapping = aes(x = umap1, y = umap2, colour = gene)) +
21 |   scale_colour_brewer(type = 'qual', palette = 'Set1') +
22 |   guides(colour = guide_legend(title = '')) +
23 |   labs(x = 'UMAP1', y = 'UMAP2')
24 | 
25 | # Distribution of distances n > 1/n == 1
26 | distances <- mutate(dms, gene_pos = str_c(gene, '_', position)) %>%
27 |   tibble_to_matrix(umap1, umap2, row_names = 'gene_pos') %>%
28 |   dist() %>%
29 |   as.matrix()
30 | distances[upper.tri(distances, diag = TRUE)] <- NA
31 | distances <- as_tibble(distances, rownames = 'gene_pos1') %>%
32 |   pivot_longer(-gene_pos1, names_to = 'gene_pos2', values_to = 'dist') %>%
33 |   drop_na() %>%
34 |   mutate(rep = ifelse(gene_pos1 == gene_pos2, 'Repeated Position', 'Background'))
35 | 
36 | # t.test
37 | # wilcox.test(x = filter(distances, rep == 'Repeated Position')$dist, y = filter(distances, rep == 'Background')$dist, alternative = 'less')
38 | 
39 | p_dists <- ggplot(distances, aes(x = dist, y = ..scaled.., colour = rep)) +
40 |   stat_density(geom = 'line', position = 'identity') +
41 |   labs(x = 'UMAP Space Euclidean Distance', y = 'Scaled Density') + 
42 |   scale_colour_brewer(type = 'qual', palette = 'Dark2') +
43 |   guides(colour = guide_legend(title = ''))
44 |   
45 | figure <- multi_panel_figure(width = 183, height = c(89, 89), unit = 'mm', columns = 1) %>%
46 |   fill_panel(p_umap, row = 1, column = 1) %>%
47 |   fill_panel(p_dists, row = 2, column = 1)
48 | ggsave('figures/4_figures/figureS4.pdf', figure, width = 183, height = 185, units = 'mm')
49 | ggsave('figures/4_figures/figureS4.png', figure, width = 183, height = 185, units = 'mm')
50 | ggsave('figures/4_figures/figureS4.tiff', figure, width = 183, height = 185, units = 'mm')
51 | ggsave('figures/4_figures/figureS4.eps', figure, width = 183, height = 185, units = 'mm', device=cairo_ps, fallback_resolution = 600)
52 | 
53 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/validate_araya_2012_yap1.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Validate multi mutant combination method for Araya et al. 2012 (YAP1)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | dir.create('figures/0_data/per_study/araya_2012_yap1', showWarnings = FALSE, recursive = TRUE) # output directory for the ggsave at the end of this script
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/araya_2012_yap1/araya_2012_yap1.yaml')
 8 | dm_data <- read_csv('data/studies/araya_2012_yap1/raw/urn_mavedb_00000002-a-2_scores.csv', skip = 4) %>%
 9 |   select(hgvs_pro, raw_score = score) %>%
10 |   mutate(hgvs_pro = if_else(str_ends(hgvs_pro, ']'), str_sub(hgvs_pro, start = 4, end = -2), str_sub(hgvs_pro, start = 3)),
11 |          n_mut = str_count(hgvs_pro, ';') + 1) %>%
12 |   separate(hgvs_pro, str_c('mut', 1:max(.$n_mut)), sep = ';', fill = 'right') %>%
13 |   pivot_longer(cols = starts_with('mut'), values_to = 'mut') %>%
14 |   drop_na(mut) %>%
15 |   select(-name) %>%
16 |   tidyr::extract(mut, into = c('wt', 'position', 'mut'), "([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", convert = TRUE) %>%
17 |   mutate(wt = AA_THREE_2_ONE[wt], mut = AA_THREE_2_ONE[mut], position = position + 169) %>%
18 |   mutate(transformed_score = raw_score) %>%
19 |   group_by(position, wt, mut)
20 | 
21 | singles <- summarise(dm_data, single_score := ifelse(any(n_mut == 1), mean(transformed_score[n_mut == 1], na.rm = TRUE), NA)) %>%
22 |   ungroup()
23 | single_frac <- sum(!is.na(singles$single_score))/length(singles$single_score)
24 | 
25 | mut_count_data <- lapply(2:9, function(x){
26 |   summarise(dm_data, !!str_c('score_', x) := ifelse(any(n_mut <= x), mean(transformed_score[n_mut <= x], na.rm = TRUE), NA)) %>%
27 |     ungroup() %>%
28 |     select(!!str_c('score_', x))
29 | }) %>%
30 |   bind_cols(singles, .) %>%
31 |   pivot_longer(starts_with('score_'), names_to = 'n_mut', values_to = 'multi_score', names_prefix = 'score_', names_ptypes = list(n_mut=integer())) %>%
32 |   group_by(n_mut) %>%
33 |   mutate(frac = sum(!is.na(multi_score))/length(multi_score)) %>%
34 |   ungroup() %>%
35 |   mutate(n_mut = as.integer(n_mut))
36 | 
37 | p_sing_multi <- ggplot(mut_count_data, aes(x = single_score, y = multi_score)) +
38 |   facet_wrap(~n_mut, ncol = 5) +
39 |   geom_point(colour = 'cornflowerblue') + 
40 |   geom_abline(slope = 1, linetype='dashed') +
41 |   geom_text(x = -5, y = 1, aes(label = str_c('frac = ', signif(frac, digits = 4))), hjust = 0) +
42 |   labs(x = 'log2(Score) (Single Variant)',
43 |        y = 'log2(Score) (Mean Over Multiple Variants)',
44 |        title = 'Accuracy of multi-variant averaging for scoring in Araya et al. 2012 (YAP1)',
45 |        subtitle = str_c('Fraction of variants with individual measures: ', single_frac))
46 | ggsave('figures/0_data/per_study/araya_2012_yap1/multi_mut_validation.pdf', p_sing_multi, units = 'cm', height = 15, width = 25)
47 | 


--------------------------------------------------------------------------------
/src/dimensionality_reduction.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Functions for dimensionality reduction analysis
 3 | 
 4 | ### tSNE/UMAP Plots ###
 5 | tsne_umap_plots <- function(tbl, x, y, name){ # returns a named list of the five plots, names prefixed with `name`
 6 |   x_var <- enquo(x) # capture the axis columns so they can be forwarded with !!
 7 |   y_var <- enquo(y)
 8 |   plots <- list(study = plot_dim_red_study(tbl, !!x_var, !!y_var),
 9 |                 aa = plot_dim_red_aa(tbl, !!x_var, !!y_var),
10 |                 hydrophobicity = plot_dim_red_hydrophobicity(tbl, !!x_var, !!y_var),
11 |                 surface_accessibility = plot_dim_red_surface_accessibility(tbl, !!x_var, !!y_var),
12 |                 mean_er = plot_dim_red_mean_er(tbl, !!x_var, !!y_var))
13 |   names(plots) <- str_c(name, '_', names(plots))
14 |   plots
15 | }
16 | # Scatter of x against y, coloured by study with one facet per study (facet labels via format_study)
17 | plot_dim_red_study <- function(tbl, x, y){
18 |   x <- enquo(x)
19 |   y <- enquo(y)
20 |   (ggplot(tbl, aes(x=!!x, y=!!y, colour=study)) +
21 |       geom_point() +
22 |       facet_wrap(~study, labeller = as_labeller(sapply(unique(tbl$study), format_study, max_width=18)), nrow = 4) + # labels come from tbl itself rather than a global table
23 |       guides(colour='none')) %>%
24 |     labeled_plot(units='cm', height = 20, width = 32)
25 | }
26 | # Scatter of x against y coloured by wild-type amino acid, one facet per AA_REDUCED_HASH class
27 | plot_dim_red_aa <- function(tbl, x, y){
28 |   x_var <- enquo(x)
29 |   y_var <- enquo(y)
30 |   classed <- mutate(tbl, aa_class = AA_REDUCED_HASH[wt])
31 |   p <- ggplot(classed, aes(x=!!x_var, y=!!y_var, colour=wt)) +
32 |     geom_point() +
33 |     facet_wrap(~aa_class) +
34 |     scale_colour_manual(values = AA_COLOURS) +
35 |     guides(colour = guide_legend(title='AA')) +
36 |     theme(panel.grid.major.x = element_line(colour = 'gray', linetype = 'dotted'))
37 |   labeled_plot(p, units='cm', height = 25, width = 25)
38 | }
39 | # Scatter of x against y coloured by the hydrophobicity column on a diverging gradient
40 | plot_dim_red_hydrophobicity <- function(tbl, x, y){
41 |   x_var <- enquo(x)
42 |   y_var <- enquo(y)
43 |   p <- ggplot(tbl, aes(x=!!x_var, y=!!y_var, colour=hydrophobicity)) +
44 |     geom_point() +
45 |     scale_colour_gradient2() +
46 |     guides(colour = guide_colourbar(title = 'Hydrophobicity'))
47 |   labeled_plot(p, units='cm', height = 10, width = 15)
48 | }
49 | # Scatter of x against y coloured by all_atom_abs; rows with missing all_atom_abs are dropped first
50 | plot_dim_red_surface_accessibility <- function(tbl, x, y){
51 |   x_var <- enquo(x)
52 |   y_var <- enquo(y)
53 |   with_sa <- drop_na(tbl, all_atom_abs)
54 |   p <- ggplot(with_sa, aes(x=!!x_var, y=!!y_var, colour=all_atom_abs)) +
55 |     geom_point() +
56 |     scale_colour_viridis_c() +
57 |     guides(colour = guide_colourbar(title = 'Surface Accessibility\n(All Atom Abs)'))
58 |   labeled_plot(p, units='cm', height = 10, width = 15)
59 | }
60 | # Scatter of x against y coloured by mean_score. NOTE(review): rows are dropped on all_atom_abs, not mean_score - confirm this is intended
61 | plot_dim_red_mean_er <- function(tbl, x, y){
62 |   x_var <- enquo(x)
63 |   y_var <- enquo(y)
64 |   kept <- drop_na(tbl, all_atom_abs)
65 |   p <- ggplot(kept, aes(x=!!x_var, y=!!y_var, colour=mean_score)) +
66 |     geom_point() +
67 |     scale_colour_gradient2() +
68 |     guides(colour = guide_colourbar(title = 'Mean Norm. ER'))
69 |   labeled_plot(p, units='cm', height = 10, width = 15)
70 | }
71 | 
72 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/validate_sarkisyan_2016_gfp.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Validate the strategy used for Sarkisyan et al. 2016 (GFP)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | dir.create('figures/0_data/per_study/sarkisyan_2016_gfp')
 7 | 
 8 | # Import and process data
 9 | raw_data <- read_tsv('data/studies/sarkisyan_2016_gfp/raw/amino_acid_genotypes_to_brightness.tsv', skip = 1,
10 |                      col_names = c('mut', 'barcodes', 'median_brightness', 'std')) %>%
11 |             mutate(n_mut = str_count(mut, ':') + 1)
12 | wt_brightness <- filter(raw_data, is.na(mut)) %>% pull(median_brightness)
13 | 
14 | dm_data <- separate(raw_data, mut, into = str_c('mut', 1:15), sep = ':', fill = 'right') %>%
15 |     pivot_longer(cols = starts_with('mut'), names_to = 'n', names_prefix = 'mut', values_to = 'mut') %>%
16 |     drop_na(mut) %>%
17 |     select(-n, -barcodes, -std) %>%
18 |     tidyr::extract(mut, into = c('wt', 'position', 'mut'), 'S([A-Z])([0-9]+)([A-Z])', convert=TRUE) %>%
19 |     arrange(position, mut) %>%
20 |     mutate(single_score = ifelse(n_mut == 1, median_brightness, NA)) %>%
21 |     group_by(position, wt, mut)
22 | 
23 | singles <- summarise(dm_data, single_score := ifelse(any(n_mut == 1), mean(median_brightness[n_mut == 1], na.rm = TRUE), NA)) %>%
24 |   ungroup()
25 | single_frac <- sum(!is.na(singles$single_score))/length(singles$single_score)
26 | 
27 | mut_count_data <- lapply(2:15, function(x){
28 |   summarise(dm_data, !!str_c('score_', x) := ifelse(any(n_mut <= x), mean(median_brightness[n_mut <= x], na.rm = TRUE), NA)) %>%
29 |     ungroup() %>%
30 |     select(!!str_c('score_', x))
31 | }) %>%
32 |   bind_cols(singles, .) %>%
33 |   pivot_longer(starts_with('score_'), names_to = 'n_mut', values_to = 'multi_score', names_prefix = 'score_', names_ptypes = list(n_mut=integer())) %>%
34 |   group_by(n_mut) %>%
35 |   mutate(frac = sum(!is.na(multi_score))/length(multi_score)) %>%
36 |   ungroup() %>%
37 |   mutate(n_mut = as.integer(n_mut))
38 | 
39 | p_sing_multi <- ggplot(mut_count_data, aes(x = single_score, y = multi_score)) +
40 |   facet_wrap(~n_mut, ncol = 5) +
41 |   geom_point(colour = 'cornflowerblue') + 
42 |   geom_abline(slope = 1, linetype='dashed') +
43 |   geom_text(x = 1.5, y = 3.5, aes(label = str_c('frac = ', signif(frac, digits = 4))), hjust = 0) +
44 |   labs(x = 'Median Brightness (Single Variant)',
45 |        y = 'Median Brightness (Mean Over Multiple Variants)',
46 |        title = 'Accuracy of multi-variant averaging for scoring in Sarkisyan et al. 2016 (GFP)',
47 |        subtitle = str_c('Fraction of variants with individual measures: ', single_frac))
48 | ggsave('figures/0_data/per_study/sarkisyan_2016_gfp/multi_mut_validation.pdf', p_sing_multi, units = 'cm', height = 15, width = 25)
49 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/validate_starita_2013_ube4b.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Validate multiple mutation method for Starita et al. 2013 (UBE4B)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | dir.create('figures/0_data/per_study/starita_2013_ube4b/')
 7 | 
 8 | # Import and process data
 9 | meta <- read_yaml('data/studies/starita_2013_ube4b/starita_2013_ube4b.yaml')
10 | dm_data <- read_xlsx('data/studies/starita_2013_ube4b/raw/sd01.xlsx', na = c('NA', '')) %>%
11 |   filter(!seqID == 'NA-NA') %>% # Filter WT
12 |   rename(raw_score = log2_ratio) %>%
13 |   separate(seqID, into = c('position', 'mut'), sep='-') %>%
14 |   select(-nscor_log2_ratio) %>%
15 |   mutate(n_mut = sapply(position, function(x){str_count(x, ',') + 1})) %>%
16 |   separate(mut, str_c('mut', 1:max(.$n_mut)), sep = ',', fill = 'right') %>%
17 |   separate(position, str_c('position', 1:max(.$n_mut)), sep = ',', fill = 'right') %>%
18 |   pivot_longer(starts_with('position'), names_to = 'pos_num', names_prefix = 'position', values_to = 'position') %>%
19 |   drop_na(position) %>%
20 |   pivot_longer(starts_with('mut'), names_to = 'mut_num', names_prefix = 'mut', values_to = 'mut') %>%
21 |   drop_na(mut) %>%
22 |   filter(pos_num == mut_num) %>%
23 |   select(-pos_num, -mut_num) %>%
24 |   group_by(position, mut)
25 | 
26 | singles <- summarise(dm_data, single_score := ifelse(any(n_mut == 1), mean(raw_score[n_mut == 1], na.rm = TRUE), NA)) %>%
27 |   ungroup()
28 | single_frac <- sum(!is.na(singles$single_score))/length(singles$single_score)
29 | 
30 | mut_count_data <- lapply(2:9, function(x){
31 |   summarise(dm_data, !!str_c('score_', x) := ifelse(any(n_mut <= x), mean(raw_score[n_mut <= x], na.rm = TRUE), NA)) %>%
32 |     ungroup() %>%
33 |     select(!!str_c('score_', x))
34 | }) %>%
35 |   bind_cols(singles, .) %>%
36 |   pivot_longer(starts_with('score_'), names_to = 'n_mut', values_to = 'multi_score', names_prefix = 'score_', names_ptypes = list(n_mut=integer())) %>%
37 |   group_by(n_mut) %>%
38 |   mutate(frac = sum(!is.na(multi_score))/length(multi_score)) %>%
39 |   ungroup() %>%
40 |   mutate(n_mut = as.integer(n_mut))
41 | 
42 | p_sing_multi <- ggplot(mut_count_data, aes(x = single_score, y = multi_score)) +
43 |   facet_wrap(~n_mut, ncol = 5) +
44 |   geom_point(colour = 'cornflowerblue') + 
45 |   geom_abline(slope = 1, linetype='dashed') +
46 |   geom_text(x = -4, y = 3, aes(label = str_c('frac = ', signif(frac, digits = 4))), hjust = 0) +
47 |   labs(x = 'log2(Score) (Single Variant)',
48 |        y = 'log2(Score) (Mean Over Multiple Variants)',
49 |        title = 'Accuracy of multi-variant averaging for scoring in Starita et al. 2013 (UBE4B)',
50 |        subtitle = str_c('Fraction of variants with individual measures: ', single_frac))
51 | ggsave('figures/0_data/per_study/starita_2013_ube4b/multi_mut_validation.pdf', p_sing_multi, units = 'cm', height = 15, width = 25)
52 | 


--------------------------------------------------------------------------------
/bin/pipeline/sequence_statistics.smk:
--------------------------------------------------------------------------------
 1 | """
 2 | Rules for generating statistics from protein sequences, including the Porter5 and SIFT pipeline
 3 | 
 4 | Expects global variables:
 5 | - GENES: dict mapping genes to lists of the studies assessing them
 6 | """
 7 | 
 8 | rule make_gene_fasta:
 9 |     """
10 |     Generate a FASTA file for a gene from the sequence in the studies YAML configs
11 |     """
12 |     # marked ancient as seq shouldn't change when .yaml's do
13 |     # force re-run if necessary by deleting relevant .fa
14 |     input:
15 |         lambda wildcards: [ancient(f'data/studies/{i}/{i}.yaml') for i
16 |                            in GENES[wildcards.gene]]
17 | 
18 |     output:
19 |         "data/fasta/{gene}.fa"
20 | 
21 |     log:
22 |         "logs/make_gene_fasta/{gene}.log"
23 | 
24 |     shell:
25 |         "python bin/data_processing/make_gene_fasta.py {input} > {output} 2> {log}"
26 | 
27 | rule sift4g:
28 |     """
29 |     Run SIFT4G on a FASTA file, assessing all possible variants.
30 |     Note: for this project I have been using a modified version of SIFT4G that
31 |     outputs to 4.d.p rather than 2.
32 |     """
33 |     input:
34 |         fa = "data/fasta/{gene}.fa",
35 |         db = config['sift']['uniref90_fa_path']
36 | 
37 |     output:
38 |         "data/sift/{gene}.SIFTprediction"
39 | 
40 |     log:
41 |         'logs/sift4g/{gene}.log'
42 | 
43 |     resources:
44 |         mem_mb = 8000
45 | 
46 |     shell:
47 |         "sift4g -q {input.fa} -d {input.db} --out data/sift 2> {log}"
48 | 
49 | rule all_sift_predictions:
50 |     """
51 |     Produce SIFT predictions for all genes
52 |     """
53 |     input:
54 |         expand('data/sift/{gene}.SIFTprediction', gene=GENES.keys())
55 | 
56 | rule porter5:
57 |     """
58 |     Run Porter5 on a FASTA file, generating secondary structure predictions for each residue.
59 |     """
60 |     input:
61 |         "data/fasta/{gene}.fa"
62 | 
63 |     output:
64 |         ss3="data/porter5/{gene}.ss3",
65 |         ss8="data/porter5/{gene}.ss8"
66 | 
67 |     log:
68 |         "logs/porter5/{gene}.log"
69 | 
70 |     threads: 8
71 | 
72 |     resources:
73 |         mem_mb = 30000
74 |     # Only the first command truncates the log (&>); every later one appends (>>) so clean-up cannot wipe Porter5's output
75 |     shell:
76 |         f"""
77 |         python {config['porter5']['path']} -i {{input}} --cpu {{threads}} --tmp &> {{log}}
78 |         cat data/fasta/{{wildcards.gene}}.fa.log >> {{log}} 2>&1
79 |         mv data/fasta/{{wildcards.gene}}.fa.ss3 {{output.ss3}} >> {{log}} 2>&1
80 |         mv data/fasta/{{wildcards.gene}}.fa.ss8 {{output.ss8}} >> {{log}} 2>&1
81 |         rm data/fasta/{{wildcards.gene}}.fa.* >> {{log}} 2>&1
82 |         rm data/fasta/{{wildcards.gene}}.hhr >> {{log}} 2>&1
83 |         """
84 | 
85 | rule all_porter5_predictions:
86 |     """
87 |     Produce Porter5 predictions for all genes
88 |     """
89 |     input:
90 |         expand('data/porter5/{gene}.ss8', gene=GENES.keys()),
91 |         expand('data/porter5/{gene}.ss3', gene=GENES.keys())
92 | 


--------------------------------------------------------------------------------
/bin/analysis/0_data/check_normalisation.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Check the validity of the normalisation approach
 3 | source('src/config.R')
 4 | 
 5 | dms <- read_tsv('data/long_combined_mutational_scans.tsv')
 6 | plots <- list()
 7 | 
 8 | ### Check normalisation 10% with and without nonsense ###
 9 | quantiles <- filter(dms, !is.na(score)) %>%
10 |   group_by(study) %>%
11 |   summarise(with = quantile(transformed_score, 0.1, na.rm = TRUE),
12 |             without = quantile(transformed_score[!mut == "*"], 0.1, na.rm = TRUE),
13 |             nonsense = any(mut == "*"))
14 | 
15 | plots$norm_quantiles <- ggplot(filter(quantiles, nonsense), aes(x = with, y = without)) +
16 |   geom_point() +
17 |   geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
18 |   labs(x = "With Nonsense", y = "Without Nonsense")
19 | 
20 | ### Check distribution of predictor metrics for the bottom 10% of scores ###
21 | dms_scores <- select(dms, study, score, transformed_score, sift, log10_sift, total_energy) %>%
22 |   left_join(quantiles, by = "study") %>%
23 |   mutate(with10 = transformed_score < with,
24 |          without10 = transformed_score < without,
25 |          neutral = abs(score) < 0.3)
26 | 
27 | score_groups <- filter(dms_scores, nonsense, with10 | without10 | neutral) %>%
28 |   pivot_longer(c(log10_sift, total_energy), names_to = "metric", values_to = "value") %>%
29 |   pivot_longer(c(with10, without10, neutral), names_to = "class", values_to = "member") %>%
30 |   filter(member, !is.na(value))
31 | 
32 | plots$score_group_density <- ggplot(score_groups, aes(x = value, y = ..scaled.., colour = class)) +
33 |   stat_density(geom = "line", position = "identity") +
34 |   facet_wrap(~metric, nrow = 1, scales = "free_x")
35 | 
36 | plots$score_group_box <- ggplot(score_groups, aes(x = class, y = value, fill = class)) +
37 |   geom_boxplot() +
38 |   facet_wrap(~metric, nrow = 1, scales = "free_y") +
39 |   stat_compare_means(comparisons = list(c("with10", "without10"), c("with10", "neutral"), c("without10", "neutral")))
40 | 
41 | ### SIFT4G Scores across studies
42 | 
43 | plots$per_study_sift_dist <- ggplot(filter(dms_scores, with10), aes(x = study, y = log10_sift)) +
44 |   geom_boxplot(fill = "cornflowerblue") +
45 |   geom_hline(yintercept = log10(0.05)) +
46 |   coord_flip() +
47 |   labs(y = expression("log"[10]~"SIFT4G"), x = "") +
48 |   theme(panel.grid.major.y = element_blank(),
49 |         panel.grid.major.x = element_line(linetype = "dotted", colour = "grey"))
50 | 
51 | plots$per_study_foldx_dist <- ggplot(filter(dms_scores, with10), aes(x = study, y = total_energy)) +
52 |   geom_boxplot(fill = "cornflowerblue") +
53 |   geom_hline(yintercept = c(-1, 1)) +
54 |   coord_flip() +
55 |   labs(y = expression(Delta*Delta*"G"), x = "") +
56 |   theme(panel.grid.major.y = element_blank(),
57 |         panel.grid.major.x = element_line(linetype = "dotted", colour = "grey"))
58 | 
59 | ### Save plots ###
60 | save_plotlist(plots, "figures/0_data/normalisation", overwrite = "all")
61 | 


--------------------------------------------------------------------------------
/bin/data_processing/foldx_variants.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
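 3 | Generate every possible single amino acid substitution for the residues of a PDB file, printed one per line as <wt><chain><position><mut>;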
 4 | """
 5 | import sys
 6 | import argparse
 7 | from pathlib import Path
 8 | 
 9 | from Bio.PDB import PDBParser
10 | from Bio.SeqUtils import seq1
11 | 
12 | from subtypes_utils import import_sections
13 | 
14 | AA_ALPHABET = 'ACDEFGHIKLMNPQRSTVWY'
15 | 
16 | def main(args):
17 |     """Main script"""
18 |     pdb_parser = PDBParser()
19 | 
20 |     pdb_name = Path(args.pdb).stem
21 |     # deal with FoldX repaired PDBs
22 |     if pdb_name.endswith('_Repair'):
23 |         pdb_name = pdb_name.replace('_Repair', '')
24 | 
25 |     structure = pdb_parser.get_structure(pdb_name, args.pdb)
26 | 
27 |     sections = import_sections(args.yaml, pdb_name)
28 | 
29 |     variants = []
30 |     if sections is not None:
31 |         for section in sections:
32 |             filter_region = 'region' in section
33 |             for residue in structure[0][section['chain']]:
34 |                 if not residue.id[0] == ' ':
35 |                     continue # Filter HETATMs
36 | 
37 |                 position = int(residue.id[1])
38 |                 amino_acid = seq1(residue.get_resname())
39 | 
40 |                 if not amino_acid in AA_ALPHABET:
41 |                     # Filter non-standard AAs, required when processing
42 |                     # foldx repaired PDBs as they turn HETATMs to regular ATOMs
43 |                     # for regular proteins
44 |                     continue
45 | 
46 |                 if (filter_region and
47 |                         (position > section['region'][1] or
48 |                          position < section['region'][0])):
49 |                     continue
50 | 
51 |                 variants.extend([f"{amino_acid}{section['chain']}{position}{x}" for
52 |                                  x in AA_ALPHABET if not x == amino_acid])
53 |     else:
54 |         for chain in structure[0]:
55 |             for residue in chain:
56 |                 if not residue.id[0] == ' ':
57 |                     continue # Filter HETATMs
58 | 
59 |                 position = int(residue.id[1])
60 |                 amino_acid = seq1(residue.get_resname())
61 | 
62 |                 if not amino_acid in AA_ALPHABET:
63 |                     continue
64 | 
65 |                 variants.extend([f"{amino_acid}{chain.id}{position}{x}" for
66 |                                  x in AA_ALPHABET if not x == amino_acid])
67 | 
68 |     print(*variants, sep=';\n', end=';\n', file=sys.stdout)
69 | 
70 | def parse_args():
71 |     """Process input arguments"""
72 |     parser = argparse.ArgumentParser(description=__doc__,
73 |                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
74 | 
75 |     parser.add_argument('pdb', metavar='P', help="Input PDB")
76 | 
77 |     parser.add_argument('--yaml', '-y', help="YAML file/raw YAML defining PDB sections to consider")
78 | 
79 |     return parser.parse_args()
80 | 
81 | if __name__ == "__main__":
82 |     ARGS = parse_args()
83 |     main(ARGS)
84 | 


--------------------------------------------------------------------------------
/data/studies/melnikov_2014_aph3ii/standardise_melnikov_2014_aph3ii.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | # Standardise data from Melnikov et al. 2014 (APH(3')-II)
 3 | source('src/config.R')
 4 | source('src/study_standardising.R')
 5 | 
 6 | # Import and process data
 7 | meta <- read_yaml('data/studies/melnikov_2014_aph3ii/melnikov_2014_aph3ii.yaml')
 8 | 
 9 | count_files <- grep('\\.aacounts\\.txt', dir('data/studies/melnikov_2014_aph3ii/raw/'), value = TRUE)
10 | count_files <- count_files[!grepl('(S[12]\\_Ami|S3\\_Kan)', count_files)] # Discard bad runs - see 00README.txt from Melnikov et al. data
11 | 
12 | counts <- sapply(count_files, read_melnikov_table, simplify = FALSE) %>%
13 |   set_names(gsub('(KKA2\\_|\\.aacounts\\.txt)', '', names(.))) # Set names to drug
14 | bkg_counts <- counts[c('Bkg1', 'Bkg2')]
15 | counts <- counts[which(!names(counts) %in% c('Bkg1', 'Bkg2'))]
16 | 
17 | # Process data, see bin/0_data/validate_melnikov.R for details + comments
18 | dm_data <- mapply(melnikov_fitness, counts, names(counts), MoreArgs = list(bkg=bkg_counts), SIMPLIFY = FALSE) %>% # Calculate fitness for each count
19 |   bind_rows(.id = 'experiment') %>%
20 |   
21 |   # Extract experiment data, round and library contain the same information (plus round notes which needed a re-test at different MIC,
22 |   # which we already accounted for) -> discard round
23 |   separate(experiment, c('round', 'drug', 'library'), sep='_') %>%
24 |   select(position, wt, mut, score, drug, library) %>% 
25 |   mutate(rel_conc = 1/as.numeric(str_sub(drug, -1)), drug = str_sub(drug, 1, -3)) %>%
26 |   
27 |   # Process library pairs - discard datasets where libraries don't agree and filter outlier points, then average L1 & L2
28 |   pivot_wider(id_cols = c('position', 'wt', 'mut', 'drug', 'rel_conc'), names_from = library, values_from = score) %>%
29 |   mutate(diff = abs(L1 - L2)) %>%
30 |   filter(!(drug == 'Ami' & rel_conc == 0.25), !(drug %in% c('G418', 'Ami', 'Kan') & rel_conc == 0.125)) %>%
31 |   filter(diff < sd(diff, na.rm = TRUE) * 3) %>%
32 |   mutate(score = (L1 + L2)/2) %>%
33 |   drop_na(score) %>%
34 |   
35 |   # Select the best conc for each drug, based on ER distribution
36 |   filter((drug == 'Ami' & rel_conc == 0.5) |
37 |            (drug == 'G418' & rel_conc == 0.25) |
38 |            (drug == 'Kan' & rel_conc == 0.25) |
39 |            (drug == 'Neo' & rel_conc == 0.25) |
40 |            (drug == 'Paro' & rel_conc == 0.125) |
41 |            (drug == 'Ribo' & rel_conc == 0.125)) %>%
42 |   select(drug, position, wt, mut, score) %>%
43 |   
44 |   # Filter Ami as it doesn't correlate with other drugs, then average
45 |   filter(!drug == 'Ami') %>%
46 |   group_by(position, wt, mut) %>%
47 |   summarise(score = mean(score, na.rm=TRUE)) %>%
48 |   ungroup() %>%
49 |   mutate(raw_score = score,
50 |          transformed_score = raw_score,
51 |          score = normalise_score(transformed_score),
52 |          class = get_variant_class(wt, mut))
53 | 
54 | # Save output
55 | standardise_study(dm_data, meta$study, meta$transform)
56 | 


--------------------------------------------------------------------------------
/data/studies/findlay_2018_brca1/findlay_2018_brca1.yaml:
--------------------------------------------------------------------------------
 1 | study: 'findlay_2018_brca1' 
 2 | gene: 'BRCA1'
 3 | uniprot_id: 'P38398'
 4 | gene_type: 'E3 Ligase'
 5 | species: 'H. sapiens'
 6 | seq: "MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQ\
 7 |       CPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKD\
 8 |       EVSIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYI\
 9 |       ELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEISLDSAKKAACEFSETDVTNTEHHQ\
10 |       PSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVE\
11 |       KAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPC\
12 |       SENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVD\
13 |       EYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTEN\
14 |       LIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTE\
15 |       QNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNI\
16 |       HNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPV\
17 |       RHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKE\
18 |       FVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQ\
19 |       ESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHS\
20 |       RETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVT\
21 |       FECEQKEENQGKNESNIKPVQTVNITAGFPVVGQKDKPVDNAKCSIKGGSRFCLSSQFRG\
22 |       NETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNENIP\
23 |       STVSTISRNNIRENVFKEASSSNINEVGSSTNEVGSSINEIGSSDENIQAELGRNRGPKL\
24 |       NAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEYEEVVQTVNTDFSPYLISDNLEQPMGSS\
25 |       HASQVCSETPDDLLDDGEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTHTHLAQ\
26 |       GYRRGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPSQSTRHSTVATECLSKNTEENL\
27 |       LSLKNSLNDCSNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANTNTQDPFLIGS\
28 |       SKQMRHQSESQGVGLSDKELVSDDEERGTGLEENNQEEQSMDSNLGEAASGCESETSVSE\
29 |       DCSGLSSQSDILTTQQRDTMQHNLIKLQQEMAELEAVLEQHGSQPSNSYPSIISDSSALE\
30 |       DLRNPEQSTSEKAVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSKNKEPGVERSSPSK\
31 |       CPSLDDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEESGPHDLTETSYLPRQDLEG\
32 |       TPYLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTT\
33 |       DTAGYNAMEESVSREKPELTASTERVNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLI\
34 |       TEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDV\
35 |       VNGRNHQGPKRARESQDRKIFRGLEICCYGPFTNMPTDQLEWMVQLCGASVVKELSSFTL\
36 |       GTGVHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDTYLIPQIPH\
37 |       SHY" 
38 | experiment: 'Complement'
39 | transform: 'None'
40 | authour: 'Findlay et al.'
41 | year: 2018
42 | title: 'Accurate classification of BRCA1 variants with saturation genome editing'
43 | lab: ['Shendure', 'Starita']
44 | doi: '10.1038/s41586-018-0461-z'
45 | pmid: '30209399'
46 | url: 'https://www.nature.com/articles/s41586-018-0461-z'
47 | notes: "Mutated nucleotides exhaustively rather than AAs, so only has \
48 |         AAs that can be reached in a single substitution"
49 | input_files:
50 |   - '41586_2018_461_MOESM3_ESM.xlsx' 
51 | source: 'SI - Table S1'
52 | qc:
53 |   filter: True 
54 |   notes: "Low coverage (only made single nucleotide substitutions)"
55 | 


--------------------------------------------------------------------------------