├── .gitattributes ├── figures └── 4_figures │ ├── parts │ └── arrow.png │ ├── proteins │ ├── amie.png │ ├── cbs.png │ ├── ccr5.png │ ├── cp.png │ ├── dbr1.png │ ├── gal4.png │ ├── ha.png │ ├── infa.png │ ├── np.png │ ├── pab1.png │ ├── pten.png │ ├── ras.png │ ├── src.png │ ├── tem1.png │ ├── tp53.png │ ├── tpk1.png │ ├── tpmt.png │ ├── ubi.png │ ├── yap1.png │ ├── adrb2.png │ ├── aph3ii.png │ ├── brca1.png │ ├── calm1.png │ ├── cxcr4.png │ ├── hsp90.png │ ├── ste12.png │ ├── sumo1.png │ ├── ube2i.png │ ├── ube4b.png │ └── proteing.png │ └── position_examples │ ├── cbs_phe_pi.png │ ├── ccr5_domains.png │ ├── tem1_asp_sa.png │ ├── adrb2_domains.png │ ├── cbs_asp_ionic.png │ ├── cxcr4_domains.png │ ├── gal4_cys_zinc.png │ ├── np_cys_aromatic.png │ ├── ras_met_buried.png │ ├── tem1_asp_ligand.png │ ├── ccr5_cys_aromatic.png │ ├── pab1_arg_not_neg.png │ ├── ccr5_cys_disulphide.png │ ├── adrb2_ala_small_hydro.png │ ├── aph3ii_arg_not_proline.png │ └── ras_aliphatic_entropy.png ├── meta ├── fasta │ ├── streptococcus_proteing.fa │ ├── s_cerevisiae_ubi.fa │ ├── e_coli_infa.fa │ ├── e_coli_ccdb.fa │ ├── h_sapiens_sumo1.fa │ ├── bacteriophage_ms2_cp.fa │ ├── h_sapiens_calm1.fa │ ├── h_sapiens_ube2i.fa │ ├── h_sapiens_ras.fa │ ├── a_victoria_gfp.fa │ ├── h_sapiens_tpk1.fa │ ├── h_sapiens_tpmt.fa │ ├── b_subtilis_gdh.fa │ ├── e_coli_tem1.fa │ ├── e_coli_aph3ii.fa │ ├── s_cerevisiae_gcn4.fa │ ├── h_sapiens_ccr5.fa │ ├── h_sapiens_cxcr4.fa │ ├── h_sapiens_mapk1.fa │ ├── h_sapiens_adrb2.fa │ ├── h_sapiens_tdp43.fa │ ├── p_aeruginosa_amie.fa │ ├── h_sapiens_tp53.fa │ ├── h_sapiens_pten.fa │ ├── h_sapiens_yap1.fa │ ├── h_sapiens_dbr1.fa │ ├── h_sapiens_src.fa │ ├── h_sapiens_cbs.fa │ ├── H3N2_A_Aichi_2_1968_np.fa │ ├── strep_protein_g_precursor.fa │ ├── s_cerevisiae_pab1.fa │ ├── H3N2_A_Perth_16_2009_ha.fa │ ├── s_cerevisiae_hsc82.fa │ ├── s_cerevisiae_ste12.fa │ ├── s_cerevisiae_hsp90.fa │ ├── h_sapiens_braf.fa │ ├── s_cerevisiae_gal4.fa │ ├── m_musculus_ube4b.fa │ ├── 
s_pyrogenes_cas9.fa │ └── h_sapiens_brca1.fa ├── subtypes │ ├── kmeans_profile.yaml │ ├── hdbscan_pca.yaml │ ├── kmeans_pca.yaml │ ├── pam_profile_cos.yaml │ ├── pam_profile_man.yaml │ ├── hdbscan_profile.yaml │ ├── pam_pca_no_sig_cos.yaml │ ├── pam_pca_no_sig_man.yaml │ ├── hclust_pca_dynamic.yaml │ ├── hdbscan_pca_no_sig.yaml │ ├── kmeans_pca_no_sig.yaml │ ├── gmm_pca.yaml │ ├── gmm_profile.yaml │ ├── gmm_pca_no_sig.yaml │ ├── hclust_profile_dynamic.yaml │ ├── hclust_profile_dynamic_cos.yaml │ ├── hclust_pca_no_sig_dynamic_cos_deep_0.yaml │ ├── hclust_pca_no_sig_dynamic_cos_deep_1.yaml │ └── hclust_pca_no_sig_dynamic.yaml ├── study_template.yaml ├── final_subtypes.yaml ├── residue_hydrophobicity.tsv └── README.md ├── data ├── long_combined_mutational_scans.tsv ├── studies │ ├── weile_2017_tpk1 │ │ ├── standardise_weile_2017_tpk1.R │ │ └── weile_2017_tpk1.yaml │ ├── sun_2018_cbs │ │ ├── standardise_sun_2018_cbs.R │ │ └── sun_2018_cbs.yaml │ ├── weile_2017_calm1 │ │ ├── standardise_weile_2017_calm1.R │ │ └── weile_2017_calm1.yaml │ ├── weile_2017_sumo1 │ │ ├── standardise_weile_2017_sumo1.R │ │ └── weile_2017_sumo1.yaml │ ├── ahler_2019_src │ │ ├── standardise_ahler_2019_src.R │ │ └── ahler_2019_src.yaml │ ├── weile_2017_ube2i │ │ ├── standardise_weile_2017_ube2i.R │ │ └── weile_2017_ube2i.yaml │ ├── kelsic_2016_infa │ │ ├── kelsic_2016_infa.yaml │ │ └── standardise_kelsic_2016_infa.R │ ├── ashenberg_2017_np │ │ ├── standardise_ashenberg_2017_np.R │ │ └── ashenberg_2017_np.yaml │ ├── melamed_2013_pab1 │ │ ├── standardise_melamed_2013_pab1.R │ │ └── melamed_2013_pab1.yaml │ ├── roscoe_2013_ubi │ │ ├── roscoe_2013_ubi.yaml │ │ └── standardise_roscoe_2013_ubi.R │ ├── bandaru_2017_ras │ │ ├── bandaru_2017_ras.yaml │ │ └── standardise_bandaru_2017_ras.R │ ├── bolognesi_2019_tdp43 │ │ ├── standardise_bolognesi_2019_tdp43.R │ │ └── bolognesi_2019_tdp43.yaml │ ├── hietpas_2011_hsp90 │ │ ├── standardise_hietpas_2011_hsp90.R │ │ └── hietpas_2011_hsp90.yaml │ ├── 
steinberg_2016_tem1 │ │ ├── standardise_steinberg_2016_tem1.R │ │ └── steinberg_2016_tem1.yaml │ ├── matreyek_2018_pten │ │ ├── standardise_matreyek_2018_pten.R │ │ └── matreyek_2018_pten.yaml │ ├── matreyek_2018_tpmt │ │ ├── standardise_matreyek_2018_tpmt.R │ │ └── matreyek_2018_tpmt.yaml │ ├── jiang_2013_hsp90 │ │ ├── standardise_jiang_2013_hsp90.R │ │ └── jiang_2013_hsp90.yaml │ ├── olson_2014_proteing │ │ ├── olson_2014_proteing.yaml │ │ └── standardise_olson_2014_proteing.R │ ├── roscoe_2014_ubi │ │ ├── roscoe_2014_ubi.yaml │ │ └── standardise_roscoe_2014_ubi.R │ ├── firnberg_2014_tem1 │ │ ├── firnberg_2014_tem1.yaml │ │ └── standardise_firnberg_2014_tem1.R │ ├── mishra_2016_hsp90 │ │ ├── standardise_mishra_2016_hsp90.R │ │ └── mishra_2016_hsp90.yaml │ ├── kitzman_2015_gal4 │ │ ├── standardise_kitzman_2015_gal4.R │ │ └── kitzman_2015_gal4.yaml │ ├── sarkisyan_2016_gfp │ │ ├── sarkisyan_2016_gfp.yaml │ │ └── standardise_sarkisyan_2016_gfp.R │ ├── findlay_2018_brca1 │ │ ├── standardise_findlay_2018_brca1.R │ │ └── findlay_2018_brca1.yaml │ ├── hartman_2018_cp │ │ ├── hartman_2018_cp.yaml │ │ └── standardise_hartman_2018_cp.R │ ├── brenan_2016_mapk1 │ │ ├── standardise_brenan_2016_mapk1.R │ │ └── brenan_2016_mapk1.yaml │ ├── wrenbeck_2017_amie │ │ ├── wrenbeck_2017_amie.yaml │ │ └── standardise_wrenbeck_2017_amie.R │ ├── jones_2019_adrb2 │ │ ├── jones_2019_adrb2.yaml │ │ └── standardise_jones_2019_adrb2.R │ ├── starita_2015_brca1 │ │ └── standardise_starita_2015_brca1.R │ ├── heredia_2018_cxcr4 │ │ ├── heredia_2018_cxcr4.yaml │ │ └── standardise_heredia_2018_cxcr4.R │ ├── doud_2015_np │ │ ├── doud_2015_np.yaml │ │ └── standardise_doud_2015_np.R │ ├── findlay_2014_dbr1 │ │ ├── findlay_2014_dbr1.yaml │ │ └── standardise_findlay_2014_dbr1.R │ ├── lee_2018_ha │ │ ├── standardise_lee_2018_ha.R │ │ └── lee_2018_ha.yaml │ ├── araya_2012_yap1 │ │ ├── araya_2012_yap1.yaml │ │ └── standardise_araya_2012_yap1.R │ ├── heredia_2018_ccr5 │ │ ├── heredia_2018_ccr5.yaml │ │ └── 
standardise_heredia_2018_ccr5.R │ ├── giacomelli_2018_tp53 │ │ ├── giacomelli_2018_tp53.yaml │ │ └── standardise_giacomelli_2018_tp53.R │ ├── wagenaar_2014_braf │ │ ├── standardise_wagenaar_2014_braf.R │ │ └── wagenaar_2014_braf.yaml │ ├── dorrity_2018_ste12 │ │ ├── dorrity_2018_ste12.yaml │ │ └── standardise_dorrity_2018_ste12.R │ ├── starita_2013_ube4b │ │ ├── standardise_starita_2013_ube4b.R │ │ └── starita_2013_ube4b.yaml │ └── melnikov_2014_aph3ii │ │ └── standardise_melnikov_2014_aph3ii.R └── pdb │ └── README.md ├── cluster.yaml ├── snakemake.yaml ├── .rsync_exclude ├── docs ├── subtypes_readme.txt ├── combined_mutational_scans_readme.txt └── foldx_eqn.md ├── .gitignore ├── bin ├── utils │ ├── setup_study_folder.sh │ └── farm_sync.sh ├── data_processing │ ├── standardise_study_template.R │ ├── make_gene_fasta.py │ ├── filter_pdb.py │ └── foldx_variants.py ├── analysis │ ├── 0_data │ │ ├── validate_kitzman_2015_gal4.R │ │ ├── summarise_standardised_data.R │ │ ├── sift_correlation.R │ │ ├── validate_araya_2012_yap1.R │ │ ├── validate_sarkisyan_2016_gfp.R │ │ ├── validate_starita_2013_ube4b.R │ │ └── check_normalisation.R │ └── 2_subtypes │ │ ├── characterise_subtypes.R │ │ ├── compare_hclust_dynamic_deep_split.R │ │ ├── sequence_context.R │ │ └── all_positions.R ├── figures │ ├── figureS5.R │ ├── figureS3.R │ ├── figureS8_27.R │ ├── figureS29.R │ └── figureS4.R └── pipeline │ └── sequence_statistics.smk ├── LICENSE └── src ├── continuous.R ├── pymol_utils.py └── dimensionality_reduction.R /.gitattributes: -------------------------------------------------------------------------------- 1 | data/long_combined_mutational_scans.tsv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /figures/4_figures/parts/arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/parts/arrow.png 
-------------------------------------------------------------------------------- /figures/4_figures/proteins/amie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/amie.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/cbs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/cbs.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ccr5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ccr5.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/cp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/cp.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/dbr1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/dbr1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/gal4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/gal4.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ha.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ha.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/infa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/infa.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/np.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/np.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/pab1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/pab1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/pten.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/pten.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ras.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ras.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/src.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/src.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/tem1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tem1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/tp53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tp53.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/tpk1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tpk1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/tpmt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/tpmt.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ubi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ubi.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/yap1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/yap1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/adrb2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/adrb2.png 
-------------------------------------------------------------------------------- /figures/4_figures/proteins/aph3ii.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/aph3ii.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/brca1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/brca1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/calm1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/calm1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/cxcr4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/cxcr4.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/hsp90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/hsp90.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ste12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ste12.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/sumo1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/sumo1.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ube2i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ube2i.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/ube4b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/ube4b.png -------------------------------------------------------------------------------- /figures/4_figures/proteins/proteing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/proteins/proteing.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/cbs_phe_pi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/cbs_phe_pi.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/ccr5_domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ccr5_domains.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/tem1_asp_sa.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/tem1_asp_sa.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/adrb2_domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/adrb2_domains.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/cbs_asp_ionic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/cbs_asp_ionic.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/cxcr4_domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/cxcr4_domains.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/gal4_cys_zinc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/gal4_cys_zinc.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/np_cys_aromatic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/np_cys_aromatic.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/ras_met_buried.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ras_met_buried.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/tem1_asp_ligand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/tem1_asp_ligand.png -------------------------------------------------------------------------------- /meta/fasta/streptococcus_proteing.fa: -------------------------------------------------------------------------------- 1 | >streptococcal protein G; from wt seq given in Olson et al. 2014 2 | MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE 3 | -------------------------------------------------------------------------------- /meta/subtypes/kmeans_profile.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic K-means clustering" 2 | method: 'kmeans' 3 | columns: 'A:Y' 4 | args: 5 | k: 4 6 | min_size: 5 7 | nstart: 10 8 | -------------------------------------------------------------------------------- /figures/4_figures/position_examples/ccr5_cys_aromatic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ccr5_cys_aromatic.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/pab1_arg_not_neg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/pab1_arg_not_neg.png -------------------------------------------------------------------------------- /meta/subtypes/hdbscan_pca.yaml: 
-------------------------------------------------------------------------------- 1 | desc: "HDBSCAN clustering on PCs" 2 | method: 'hdbscan' 3 | columns: 'PC1:PC20' 4 | args: 5 | dist_method: 'manhattan' 6 | minPts: 4 7 | -------------------------------------------------------------------------------- /figures/4_figures/position_examples/ccr5_cys_disulphide.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ccr5_cys_disulphide.png -------------------------------------------------------------------------------- /meta/subtypes/kmeans_pca.yaml: -------------------------------------------------------------------------------- 1 | desc: "K-means clustering using all PCs" 2 | method: 'kmeans' 3 | columns: 'PC1:PC20' 4 | args: 5 | k: 4 6 | min_size: 5 7 | nstart: 10 8 | -------------------------------------------------------------------------------- /meta/subtypes/pam_profile_cos.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic PAM clustering" 2 | method: 'pam' 3 | columns: 'A:Y' 4 | args: 5 | k: 4 6 | min_size: 5 7 | distance_method: 'cosine' 8 | -------------------------------------------------------------------------------- /meta/subtypes/pam_profile_man.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic PAM clustering" 2 | method: 'pam' 3 | columns: 'A:Y' 4 | args: 5 | k: 4 6 | min_size: 5 7 | distance_method: 'manhattan' 8 | -------------------------------------------------------------------------------- /figures/4_figures/position_examples/adrb2_ala_small_hydro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/adrb2_ala_small_hydro.png 
-------------------------------------------------------------------------------- /figures/4_figures/position_examples/aph3ii_arg_not_proline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/aph3ii_arg_not_proline.png -------------------------------------------------------------------------------- /figures/4_figures/position_examples/ras_aliphatic_entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allydunham/aa_subtypes/HEAD/figures/4_figures/position_examples/ras_aliphatic_entropy.png -------------------------------------------------------------------------------- /meta/subtypes/hdbscan_profile.yaml: -------------------------------------------------------------------------------- 1 | desc: "HDBSCAN clustering on ER profiles" 2 | method: 'hdbscan' 3 | columns: 'A:Y' 4 | args: 5 | dist_method: 'manhattan' 6 | minPts: 3 7 | -------------------------------------------------------------------------------- /meta/subtypes/pam_pca_no_sig_cos.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic PAM clustering" 2 | method: 'pam' 3 | columns: 'PC2:PC20' 4 | args: 5 | k: 4 6 | min_size: 5 7 | distance_method: 'cosine' 8 | -------------------------------------------------------------------------------- /data/long_combined_mutational_scans.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8b015d600da366adea1437a0c13aab834ddabee5a6368c5d2fd12e5157d5c14c 3 | size 103160026 4 | -------------------------------------------------------------------------------- /meta/fasta/s_cerevisiae_ubi.fa: -------------------------------------------------------------------------------- 1 | >ubi modified from first repeat of UBI4|P0CG63 from 
Uniprot 2 | MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN 3 | IQKESTLHLVLRLRGG 4 | 5 | -------------------------------------------------------------------------------- /meta/subtypes/pam_pca_no_sig_man.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic PAM clustering" 2 | method: 'pam' 3 | columns: 'PC2:PC20' 4 | args: 5 | k: 4 6 | min_size: 5 7 | distance_method: 'manhattan' 8 | -------------------------------------------------------------------------------- /meta/subtypes/hclust_pca_dynamic.yaml: -------------------------------------------------------------------------------- 1 | desc: "Hierarchical clustering using PCs with dynamic cuts" 2 | method: 'hclust_dynamic' 3 | columns: 'PC1:PC20' 4 | args: 5 | distance_method: 'manhattan' 6 | 7 | -------------------------------------------------------------------------------- /cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | output: '{rule}.{wildcards}.%J' 3 | error: '{rule}.{wildcards}.%J' 4 | queue: 'research-rh74' 5 | name: 'Subtypes.{rule}.{wildcards}' 6 | memory: 4096 7 | 8 | -------------------------------------------------------------------------------- /meta/subtypes/hdbscan_pca_no_sig.yaml: -------------------------------------------------------------------------------- 1 | desc: "HDBSCAN clustering on PCs excluding positional significance" 2 | method: 'hdbscan' 3 | columns: 'PC2:PC20' 4 | args: 5 | dist_method: 'manhattan' 6 | minPts: 4 7 | -------------------------------------------------------------------------------- /meta/subtypes/kmeans_pca_no_sig.yaml: -------------------------------------------------------------------------------- 1 | desc: "K-means clustering using PCs excluding positional significance" 2 | method: 'kmeans' 3 | columns: 'PC2:PC20' 4 | args: 5 | k: 4 6 | min_size: 5 7 | nstart: 10 8 | 
-------------------------------------------------------------------------------- /meta/fasta/e_coli_infa.fa: -------------------------------------------------------------------------------- 1 | >sp|P69222|IF1_ECOLI Translation initiation factor IF-1 OS=Escherichia coli (strain K12) OX=83333 GN=infA PE=1 SV=2 2 | MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPY 3 | DLSKGRIVFRSR 4 | 5 | -------------------------------------------------------------------------------- /meta/fasta/e_coli_ccdb.fa: -------------------------------------------------------------------------------- 1 | >sp|P62554|CCDB_ECOLI Toxin CcdB OS=Escherichia coli (strain K12) OX=83333 GN=ccdB PE=1 SV=1 2 | MQFKVYTYKRESRYRLFVDVQSDIIDTPGRRMVIPLASARLLSDKVSRELYPVVHIGDES 3 | WRMMTTDMASVPVSVIGEEVADLSHRENDIKNAINLMFWGI 4 | 5 | -------------------------------------------------------------------------------- /meta/subtypes/gmm_pca.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic Gaussian Mixture Model, using BIC to select cluster number" 2 | method: 'gmm' 3 | columns: 'PC1:PC20' 4 | args: 5 | G: [ 2, 3, 4, 5 ] 6 | modelNames: [ 'VII', 'EVI', 'VEV', 'EVV', 'VVV' ] 7 | -------------------------------------------------------------------------------- /meta/subtypes/gmm_profile.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic Gaussian Mixture Model, using BIC to select cluster number" 2 | method: 'gmm' 3 | columns: 'A:Y' 4 | args: 5 | G: [ 2, 3, 4, 5 ] 6 | modelNames: [ 'VII', 'EVI', 'VEV', 'EVV', 'VVV' ] 7 | -------------------------------------------------------------------------------- /meta/subtypes/gmm_pca_no_sig.yaml: -------------------------------------------------------------------------------- 1 | desc: "Basic Gaussian Mixture Model, using BIC to select cluster number" 2 | method: 'gmm' 3 | columns: 'PC1:PC20' 4 | args: 5 | G: [ 2, 3, 4, 5 ] 6 | modelNames: [ 'VII', 'EVI', 'VEV', 'EVV', 
'VVV' ] 7 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_sumo1.fa: -------------------------------------------------------------------------------- 1 | >sp|P63165|SUMO1_HUMAN Small ubiquitin-related modifier 1 OS=Homo sapiens OX=9606 GN=SUMO1 PE=1 SV=1 2 | MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMN 3 | SLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV 4 | 5 | -------------------------------------------------------------------------------- /meta/subtypes/hclust_profile_dynamic.yaml: -------------------------------------------------------------------------------- 1 | desc: "Hierarchical clustering using ER profiles with dynamic cuts" 2 | method: 'hclust_dynamic' 3 | columns: 'A:Y' 4 | args: 5 | distance_method: 'manhattan' 6 | treecut_args: 7 | deepSplit: 2 8 | 9 | -------------------------------------------------------------------------------- /meta/subtypes/hclust_profile_dynamic_cos.yaml: -------------------------------------------------------------------------------- 1 | desc: "Hierarchical clustering using ER profiles with dynamic cuts" 2 | method: 'hclust_dynamic' 3 | columns: 'A:Y' 4 | args: 5 | distance_method: 'cosine' 6 | treecut_args: 7 | deepSplit: 2 8 | 9 | -------------------------------------------------------------------------------- /meta/subtypes/hclust_pca_no_sig_dynamic_cos_deep_0.yaml: -------------------------------------------------------------------------------- 1 | desc: "Hierarchical clustering using PCs excluding positional significance, with cosine distance and dynamic cuts (deepSplit 0)" 2 | method: 'hclust_dynamic' 3 | columns: 'PC2:PC20' 4 | args: 5 | distance_method: 'cosine' 6 | treecut_args: 7 | deepSplit: 0 8 | 9 | -------------------------------------------------------------------------------- /meta/subtypes/hclust_pca_no_sig_dynamic_cos_deep_1.yaml: -------------------------------------------------------------------------------- 1 | desc: "Hierarchical clustering using PCs excluding positional significance, with cosine distance and dynamic cuts (deepSplit 1)" 2 | method: 
'hclust_dynamic' 3 | columns: 'PC2:PC20' 4 | args: 5 | distance_method: 'cosine' 6 | treecut_args: 7 | deepSplit: 1 8 | 9 | -------------------------------------------------------------------------------- /meta/fasta/bacteriophage_ms2_cp.fa: -------------------------------------------------------------------------------- 1 | >sp|P03612|CAPSD_BPMS2 Capsid protein OS=Escherichia phage MS2 OX=329852 PE=1 SV=2 2 | MASNFTQFVLVDNGGTGDVTVAPSNFANGVAEWISSNSRSQAYKVTCSVRQSSAQNRKYT 3 | IKVEVPKVATQTVGGVELPVAAWRSYLNMELTIPIFATNSDCELIVKAMQGLLKDGNPIP 4 | SAIAANSGIY 5 | 6 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_calm1.fa: -------------------------------------------------------------------------------- 1 | >sp|P0DP23|CALM1_HUMAN Calmodulin-1 OS=Homo sapiens OX=9606 GN=CALM1 PE=1 SV=1 2 | MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADG 3 | NGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDE 4 | EVDEMIREADIDGDGQVNYEEFVQMMTAK 5 | 6 | -------------------------------------------------------------------------------- /meta/subtypes/hclust_pca_no_sig_dynamic.yaml: -------------------------------------------------------------------------------- 1 | desc: "Hierarchical clustering using PCs excluding positional significance, with dynamic cuts" 2 | method: 'hclust_dynamic' 3 | columns: 'PC2:PC20' 4 | args: 5 | distance_method: 'manhattan' 6 | treecut_args: 7 | deepSplit: 3 8 | 9 | -------------------------------------------------------------------------------- /meta/study_template.yaml: -------------------------------------------------------------------------------- 1 | study: 2 | gene: 3 | domain: 4 | uniprot_id: 5 | gene_type: 6 | species: 7 | strain: 8 | seq: 9 | experiment: 10 | transform: 11 | authour: 12 | year: 13 | title: 14 | doi: 15 | pmid: 16 | url: 17 | input_files: 18 | qc: 19 | filter: False 20 | notes: 21 | -------------------------------------------------------------------------------- 
/meta/fasta/h_sapiens_ube2i.fa: -------------------------------------------------------------------------------- 1 | >sp|P63279|UBC9_HUMAN SUMO-conjugating enzyme UBC9 OS=Homo sapiens OX=9606 GN=UBE2I PE=1 SV=1 2 | MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKL 3 | RMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELL 4 | NEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS 5 | 6 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_ras.fa: -------------------------------------------------------------------------------- 1 | >sp|P01112|RASH_HUMAN GTPase HRas OS=Homo sapiens OX=9606 GN=HRAS PE=1 SV=1 2 | MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG 3 | QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDL 4 | AARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPG 5 | CMSCKCVLS 6 | 7 | -------------------------------------------------------------------------------- /snakemake.yaml: -------------------------------------------------------------------------------- 1 | foldx: 2 | variants_per_run: 300 3 | 4 | sift: 5 | uniref90_fa_path: '/hps/research1/beltrao/ally/databases/uniref90/uniref90_2019_1.fasta' 6 | 7 | porter5: 8 | path: '/nfs/research1/beltrao/ally/software/packages/Porter5/Porter5.py' 9 | 10 | misc: 11 | fasta_line_length: 80 12 | 13 | -------------------------------------------------------------------------------- /.rsync_exclude: -------------------------------------------------------------------------------- 1 | studies/*/*.yaml 2 | studies/*/*.R 3 | study_template.yaml 4 | pdb/* 5 | fasta/** 6 | clustering/** 7 | structures.yaml 8 | final_subtypes.yaml 9 | README.md 10 | subtypes/*.yaml 11 | residue_hydrophobicity.tsv 12 | graveyard/*/*.yaml 13 | graveyard/*/*.R 14 | foldx_eqn.md 15 | subtype_descriptions.md 16 | uniprot_* 17 | 18 | -------------------------------------------------------------------------------- /meta/fasta/a_victoria_gfp.fa: 
-------------------------------------------------------------------------------- 1 | >sp|P42212|GFP_AEQVI Green fluorescent protein OS=Aequorea victoria OX=6100 GN=GFP PE=1 SV=1 2 | MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL 3 | VTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV 4 | NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD 5 | HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK 6 | 7 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_tpk1.fa: -------------------------------------------------------------------------------- 1 | >sp|Q9H3S4|TPK1_HUMAN Thiamin pyrophosphokinase 1 OS=Homo sapiens OX=9606 GN=TPK1 PE=1 SV=1 2 | MEHAFTPLEPLLSTGNLKYCLVILNQPLDNYFRHLWNKALLRACADGGANRLYDITEGER 3 | ESFLPEFINGDFDSIRPEVREYYATKGCELISTPDQDHTDFTKCLKMLQKKIEEKDLKVD 4 | VIVTLGGLAGRFDQIMASVNTLFQATHITPFPIIIIQEESLIYLLQPGKHRLHVDTGMEG 5 | DWCGLIPVGQPCMQVTTTGLKWNLTNDVLAFGTLVSTSNTYDGSGVVTVETDHPLLWTMA 6 | IKS 7 | 8 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_tpmt.fa: -------------------------------------------------------------------------------- 1 | >sp|P51580|TPMT_HUMAN Thiopurine S-methyltransferase OS=Homo sapiens OX=9606 GN=TPMT PE=1 SV=1 2 | MDGTRTSLDIEEYSDTEVQKNQVLTLEEWQDKWVNGKTAFHQEQGHQLLKKHLDTFLKGK 3 | SGLRVFFPLCGKAVEMKWFADRGHSVVGVEISELGIQEFFTEQNLSYSEEPITEIPGTKV 4 | FKSSSGNISLYCCSIFDLPRTNIGKFDMIWDRGALVAINPGDRKCYADTMFSLLGKKFQY 5 | LLCVLSYDPTKHPGPPFYVPHAEIERLFGKICNIRCLEKVDAFEERHKSWGIDCLFEKLY 6 | LLTEK 7 | 8 | -------------------------------------------------------------------------------- /meta/fasta/b_subtilis_gdh.fa: -------------------------------------------------------------------------------- 1 | >sp|P12310|DHG_BACSU Glucose 1-dehydrogenase OS=Bacillus subtilis (strain 168) OX=224308 GN=gdh PE=2 SV=2 2 | MYPDLKGKVVAITGAASGLGKAMAIRFGKEQAKVVINYYSNKQDPNEVKEEVIKAGGEAV 3 | 
VVQGDVTKEEDVKNIVQTAIKEFGTLDIMINNAGLENPVPSHEMPLKDWDKVIGTNLTGA 4 | FLGSREAIKYFVENDIKGNVINMSSVHEVIPWPLFVHYAASKGGIKLMTETLALEYAPKG 5 | IRVNNIGPGAINTPINAEKFADPKQKADVESMIPMGYIGEPEEIAAVAAWLASKEASYVT 6 | GITLFADGGMTQYPSFQAGRG 7 | 8 | -------------------------------------------------------------------------------- /meta/fasta/e_coli_tem1.fa: -------------------------------------------------------------------------------- 1 | >tr|Q6SJ61|Q6SJ61_ECOLX Beta-lactamase OS=Escherichia coli OX=562 GN=TEM-1 PE=3 SV=1 2 | MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRP 3 | EERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVREL 4 | CSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTM 5 | PAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS 6 | RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW 7 | 8 | -------------------------------------------------------------------------------- /meta/fasta/e_coli_aph3ii.fa: -------------------------------------------------------------------------------- 1 | >tr|Q08JA8|Q08JA8_ECOLX APH(3') family aminoglycoside O-phosphotransferase OS=Escherichia coli OX=562 GN=aph PE=3 SV=1 2 | MIEQDGLHAGSPAAWVERLFGYDWAQQTIGCSDAAVFRLSAQGRPVLFVKTDLSGALNEL 3 | QDEAARLSWLATTGVPCAAVLDVVTEAGRDWLLLGEVPGQDLLSSHLAPAEKVSIMADAM 4 | RRLHTLDPATCPFDHQAKHRIERARTRMEAGLVDQDDLDEEHQGLAPAELFARLKARMPD 5 | GEDLVVTHGDACLPNIMVENGRFSGFIDCGRLGVADRYQDIALATRDIAEELGGEWADRF 6 | LVLYGIAAPDSQRIAFYRLLDEFF 7 | 8 | -------------------------------------------------------------------------------- /docs/subtypes_readme.txt: -------------------------------------------------------------------------------- 1 | # Subtype assignments EV3 2 | 3 | Table of subtypes assigned to each position in the deep landscape dataset (EV2) by our algorithm 4 | 5 | ## Columns 6 | cluster: Assigned subtype, denoted Xi for subtype i of amino acid X, with i=O for outliers and i=P for permissive positions 7 | study: Study the position came from 8 | gene: Gene 9 | position: Position in the gene 
(relative to canonical Uniprot sequence) 10 | wt: Wild-type amino acid 11 | -------------------------------------------------------------------------------- /meta/fasta/s_cerevisiae_gcn4.fa: -------------------------------------------------------------------------------- 1 | >sp|P03069|GCN4_YEAST General control protein GCN4 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=GCN4 PE=1 SV=1 2 | MSEYQPSLFALNPMGFSPLDGSKSTNENVSASTSTAKPMVGQLIFDKFIKTEEDPIIKQD 3 | TPSNLDFDFALPQTATAPDAKTVLPIPELDDAVVESFFSSSTDSTPMFEYENLEDNSKEW 4 | TSLFDNDIPVTTDDVSLADKAIESTEEVSLVPSNLEVSTTSFLPTPVLEDAKLTQTRKVK 5 | KPNSVVKKSHHVGKDDESRLDHLGVVAYNRKQRSIPLSPIVPESSDPAALKRARNTEAAR 6 | RSRARKLQRMKQLEDKVEELLSKNYHLENEVARLKKLVGER 7 | 8 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_ccr5.fa: -------------------------------------------------------------------------------- 1 | >sp|P51681|CCR5_HUMAN C-C chemokine receptor type 5 OS=Homo sapiens OX=9606 GN=CCR5 PE=1 SV=1 2 | MDYQVSSPIYDINYYTSEPCQKINVKQIAARLLPPLYSLVFIFGFVGNMLVILILINCKR 3 | LKSMTDIYLLNLAISDLFFLLTVPFWAHYAAAQWDFGNTMCQLLTGLYFIGFFSGIFFII 4 | LLTIDRYLAVVHAVFALKARTVTFGVVTSVITWVVAVFASLPGIIFTRSQKEGLHYTCSS 5 | HFPYSQYQFWKNFQTLKIVILGLVLPLLVMVICYSGILKTLLRCRNEKKRHRAVRLIFTI 6 | MIVYFLFWAPYNIVLLLNTFQEFFGLNNCSSSNRLDQAMQVTETLGMTHCCINPIIYAFV 7 | GEKFRNYLLVFFQKHIAKRFCKCCSIFQQEAPERASSVYTRSTGEQEISVGL 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Unrecognized_molecules.txt 2 | rotabase.txt 3 | Rplots.pdf 4 | .snakemake/** 5 | /data/* 6 | !/data/studies/ 7 | /data/studies/*/** 8 | !/data/studies/*/*.R 9 | !/data/studies/*/*.yaml 10 | !/data/graveyard/ 11 | /data/graveyard/*/** 12 | !/data/graveyard/*/*.R 13 | !/data/graveyard/*/*.yaml 14 | /figures/** 15 | /meta/* 16 | !/meta/fasta/ 17 | !/meta/clustering/ 18 | /docs/dataset_ideas.md 19 | 
/docs/subtypes_draft.pdf 20 | /docs/MSB-2021-10305R-Data_Edited_Final.pdf 21 | .Rproj.user 22 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_cxcr4.fa: -------------------------------------------------------------------------------- 1 | >sp|P61073|CXCR4_HUMAN C-X-C chemokine receptor type 4 OS=Homo sapiens OX=9606 GN=CXCR4 PE=1 SV=1 2 | MEGISIYTSDNYTEEMGSGDYDSMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVI 3 | LVMGYQKKLRSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNL 4 | YSSVLILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEA 5 | DDRYICDRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSKGHQKRKALKT 6 | TVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPI 7 | LYAFLGAKFKTSAQHALTSVSRGSSLKILSKGKRGGHSSVSTESESSSFHSS 8 | 9 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_mapk1.fa: -------------------------------------------------------------------------------- 1 | >sp|P28482|MK01_HUMAN Mitogen-activated protein kinase 1 OS=Homo sapiens OX=9606 GN=MAPK1 PE=1 SV=3 2 | MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFE 3 | HQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQH 4 | LSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDH 5 | TGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHI 6 | LGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHK 7 | RIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS 8 | 9 | -------------------------------------------------------------------------------- /bin/utils/setup_study_folder.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Setup a folder for a study with the given ID 3 | 4 | # Config 5 | proj_root="$HOME/phd/subtypes" 6 | template_yaml="$proj_root/meta/study_template.yaml" 7 | template_script="$proj_root/bin/standardise_study_template.R" 8 | 9 | # Setup dir with yaml meta file and raw subdir 10 | mkdir "$1" || 
exit 11 | mkdir "$1/raw" 12 | cp "$template_yaml" "$1/$1.yaml" || echo "Couldn't copy template YAML" 13 | cp "$template_script" "$1/standardise_$1.R" || echo "Couldn't copy template R script" 14 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_adrb2.fa: -------------------------------------------------------------------------------- 1 | >sp|P07550|ADRB2_HUMAN Beta-2 adrenergic receptor OS=Homo sapiens OX=9606 GN=ADRB2 PE=1 SV=3 2 | MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAK 3 | FERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTAS 4 | IETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQE 5 | AINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRF 6 | HVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQD 7 | NLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNT 8 | GEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL 9 | 10 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_tdp43.fa: -------------------------------------------------------------------------------- 1 | >sp|Q13148|TADBP_HUMAN TAR DNA-binding protein 43 OS=Homo sapiens OX=9606 GN=TARDBP PE=1 SV=1 2 | MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNPVSQCMRGVRLVEGI 3 | LHAPDAGWGNLVYVVNYPKDNKRKMDETDASSAVKVKRAVQKTSDLIVLGLPWKTTEQDL 4 | KEYFSTFGEVLMVQVKKDLKTGHSKGFGFVRFTEYETQVKVMSQRHMIDGRWCDCKLPNS 5 | KQSQDEPLRSRKVFVGRCTEDMTEDELREFFSQYGDVMDVFIPKPFRAFAFVTFADDQIA 6 | QSLCGEDLIIKGISVHISNAEPKHNSNRQLERSGRFGGNPGGFGNQGGFGNSRGGGAGLG 7 | NNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWGMMGMLASQQNQSGPSGNNQNQGNMQ 8 | REPNQAFGSGNNSYSGSNSGAAIGWGSASNAGSGSGFNGGFGSSMDSKSSGWGM 9 | 10 | -------------------------------------------------------------------------------- /meta/fasta/p_aeruginosa_amie.fa: -------------------------------------------------------------------------------- 1 | >sp|P11436|AMIE_PSEAE Aliphatic amidase OS=Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 
104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1) OX=208964 GN=amiE PE=1 SV=2 2 | MRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEY 3 | SLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLV 4 | LIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAM 5 | KGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDG 6 | RTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAEC 7 | PFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA 8 | 9 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_tp53.fa: -------------------------------------------------------------------------------- 1 | >sp|P04637|P53_HUMAN Cellular tumor antigen p53 OS=Homo sapiens OX=9606 GN=TP53 PE=1 SV=4 NOTE: change P72R to match Giacomelli et al. 2018 seq 2 | MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP 3 | DEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK 4 | SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE 5 | RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS 6 | SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP 7 | PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG 8 | GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD 9 | 10 | -------------------------------------------------------------------------------- /data/studies/weile_2017_tpk1/standardise_weile_2017_tpk1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Weile et al. 
2017 (TPK1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) 7 | 8 | # Import and process data 9 | meta <- read_yaml('data/studies/weile_2017_tpk1/weile_2017_tpk1.yaml') 10 | dm_data <- read_mavedb('data/studies/weile_2017_tpk1/raw/urn_mavedb_00000001-d-2_scores.csv', score_transform = transform_vamp_seq) 11 | 12 | # Save output 13 | standardise_study(dm_data, meta$study, meta$transform) 14 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_pten.fa: -------------------------------------------------------------------------------- 1 | >sp|P60484|PTEN_HUMAN Phosphatidylinositol 3,4,5-trisphosphate 3-phosphatase and dual-specificity protein phosphatase PTEN OS=Homo sapiens OX=9606 GN=PTEN PE=1 SV=1 2 | MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVVRFLDSK 3 | HKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDLDQWLSEDDNHVA 4 | AIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRDKKGVTIPSQRRYVYYYSY 5 | LLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFVVCQLKVKIYSSNSGPTRREDKFMY 6 | FEFPQPLPVCGDIKVEFFHKQNKMLKKDKMFHFWVNTFFIPGPEETSEKVENGSLCDQEI 7 | DSICSIERADNDKEYLVLTLTKNDLDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEAS 8 | SSTSVTPDVSDNEPDHYRYSDTTDSDPENEPFDEDQHTQITKV 9 | 10 | -------------------------------------------------------------------------------- /data/studies/sun_2018_cbs/standardise_sun_2018_cbs.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Sun et al. 
2018 (CBS) (Preprint) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/sun_2018_cbs/sun_2018_cbs.yaml') 8 | dm_data <- read_mavedb('data/studies/sun_2018_cbs/raw/urn_mavedb_00000005-a-4_scores.csv', score_transform = function(x){log2(x + min(x[x > 0], na.rm = TRUE))}) %>% 9 | drop_na(position, score) # Drop WT position and variants with no fitness 10 | 11 | # Save output 12 | standardise_study(dm_data, meta$study, meta$transform) 13 | -------------------------------------------------------------------------------- /data/studies/weile_2017_calm1/standardise_weile_2017_calm1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Weile et al. 2017 (CALM1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) 7 | 8 | # Import and process data 9 | meta <- read_yaml('data/studies/weile_2017_calm1/weile_2017_calm1.yaml') 10 | dm_data <- read_mavedb('data/studies/weile_2017_calm1/raw/urn_mavedb_00000001-c-2_scores.csv', score_transform = transform_vamp_seq) 11 | 12 | # Save output 13 | standardise_study(dm_data, meta$study, meta$transform) 14 | -------------------------------------------------------------------------------- /data/studies/weile_2017_sumo1/standardise_weile_2017_sumo1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Weile et al. 
2017 (SUMO1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) 7 | 8 | # Import and process data 9 | meta <- read_yaml('data/studies/weile_2017_sumo1/weile_2017_sumo1.yaml') 10 | dm_data <- read_mavedb('data/studies/weile_2017_sumo1/raw/urn_mavedb_00000001-b-1_scores.csv', score_transform = transform_vamp_seq) 11 | 12 | # Save output 13 | standardise_study(dm_data, meta$study, meta$transform) 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020 EMBL - European Bioinformatics Institute 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | 15 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_yap1.fa: -------------------------------------------------------------------------------- 1 | >sp|P46937|YAP1_HUMAN Transcriptional coactivator YAP1 OS=Homo sapiens OX=9606 GN=YAP1 PE=1 SV=2 2 | MDPGQQPPPQPAPQGQGQPPSQPPQGQGPPSGPGQPAPAATQAAPQAPPAGHQIVHVRGD 3 | SETDLEALFNAVMNPKTANVPQTVPMRLRKLPDSFFKPPEPKSHSRQASTDAGTAGALTP 4 | QHVRAHSSPASLQLGAVSPGTLTPTGVVSGPAATPTAQHLRQSSFEIPDDVPLPAGWEMA 5 | KTSSGQRYFLNHIDQTTTWQDPRKAMLSQMNVTAPTSPPVQQNMMNSASGPLPDGWEQAM 6 | TQDGEIYYINHKNKTTSWLDPRLDPRFAMNQRISQSAPVKQPPPLAPQSPQGGVMGGSNS 7 | NQQQQMRLQQLQMEKERLRLKQQELLRQAMRNINPSTANSPKCQELALRSQLPTLEQDGG 8 | TQNPVSSPGMSQELRTMTTNSSDPFLNSGTYHSRDESTDSGLSMSSYSVPRTPDDFLNSV 9 | DEMDTGDTINQSTLPSQQNRFPDYLEAIPGTNVDLGTLEGDGMNIEGEELMPSLQEALSS 10 | DILNDMESVLAATKLDKESFLTWL 11 | -------------------------------------------------------------------------------- /data/studies/ahler_2019_src/standardise_ahler_2019_src.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Ahler et al. 
2019 (Src) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/ahler_2019_src/ahler_2019_src.yaml') 8 | dm_data <- bind_rows(read_mavedb('data/studies/ahler_2019_src/raw/urn_mavedb_00000041-b-1_scores.csv', position_offset = 1, score_col = activity_score), 9 | read_mavedb('data/studies/ahler_2019_src/raw/urn_mavedb_00000041-a-1_scores.csv', position_offset = 269, score_col = activity_score)) 10 | 11 | # Save output 12 | standardise_study(dm_data, meta$study, meta$transform) 13 | -------------------------------------------------------------------------------- /data/studies/weile_2017_ube2i/standardise_weile_2017_ube2i.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Weile et al. 2017 (UBE2I) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | aa_code <- structure(names(Biostrings::AMINO_ACID_CODE), names=Biostrings::AMINO_ACID_CODE) 7 | 8 | # Import and process data 9 | meta <- read_yaml('data/studies/weile_2017_ube2i/weile_2017_ube2i.yaml') 10 | dm_data <- read_mavedb('data/studies/weile_2017_ube2i/raw/urn_mavedb_00000001-a-1_scores.csv', score_transform = transform_vamp_seq) %>% 11 | filter(!position == 159) # Drop position not in WT protein 12 | 13 | # Save output 14 | standardise_study(dm_data, meta$study, meta$transform) 15 | -------------------------------------------------------------------------------- /data/studies/kelsic_2016_infa/kelsic_2016_infa.yaml: -------------------------------------------------------------------------------- 1 | study: 'kelsic_2016_infa' 2 | gene: 'infA' 3 | uniprot_id: 'P69222' 4 | gene_type: 'Translation' 5 | species: 'E. coli' 6 | seq: "MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPY\ 7 | DLSKGRIVFRSR" 8 | experiment: 'Growth' 9 | transform: 'VAMP-seq' 10 | authour: 'Kelsic et al.' 
11 | year: 2016 12 | title: 'RNA Structural Determinants of Optimal Codons Revealed by MAGE-Seq' 13 | lab: ['Kishony'] 14 | doi: '10.1016/j.cels.2016.11.004' 15 | pmid: '28009265' 16 | url: 'https://www.sciencedirect.com/science/article/pii/S2405471216303684' 17 | input_files: 18 | - 'cels_206_mmc5.csv' 19 | source: 'SI' 20 | qc: 21 | filter: False 22 | notes: 23 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_dbr1.fa: -------------------------------------------------------------------------------- 1 | >sp|Q9UK59|DBR1_HUMAN Lariat debranching enzyme OS=Homo sapiens OX=9606 GN=DBR1 PE=1 SV=2 2 | MRVAVAGCCHGELDKIYETLALAERRGPGPVDLLLCCGDFQAVRNEADLRCMAVPPKYRH 3 | MQTFYRYYSGEKKAPVLTLFIGGNHEASNHLQELPYGGWVAPNIYYLGLAGVVKYRGVRI 4 | GGISGIFKSHDYRKGHFECPPYNSSTIRSIYHVRNIEVYKLKQLKQPIDIFLSHDWPRSI 5 | YHYGNKKQLLKTKSFFRQEVENNTLGSPAASELLEHLKPTYWFSAHLHVKFAALMQHQAK 6 | DKGQTARATKFLALDKCLPHRDFLQILEIEHDPSAPDYLEYDIEWLTILRATDDLINVTG 7 | RLWNMPENNGLHARWDYSATEEGMKEVLEKLNHDLKVPCNFSVTAACYDPSKPQTQMQLI 8 | HRINPQTTEFCAQLGIIDINVRLQKSKEEHHVCGEYEEQDDVESNDSGEDQSEYNTDTSA 9 | LSSINPDEIMLDEEEDEDSIVSAHSGMNTPSVEPSDQASEFSASFSDVRILPGSMIVSSD 10 | DTVDSTIDREGKPGGTVESGNGEDLTKVPLKRLSDEHEPEQRKKIKRRNQAIYAAVDDDD 11 | DDAA 12 | 13 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_src.fa: -------------------------------------------------------------------------------- 1 | >sp|P12931|SRC_HUMAN Proto-oncogene tyrosine-protein kinase Src OS=Homo sapiens OX=9606 GN=SRC PE=1 SV=3 2 | MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADGHRGPSAAFAPAAAE 3 | PKLFGGFNSSDTVTSPQRAGPLAGGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGD 4 | WWLAHSLSTGQTGYIPSNYVAPSDSIQAEEWYFGKITRRESERLLLNAENPRGTFLVRES 5 | ETTKGAYCLSVSDFDNAKGLNVKHYKIRKLDSGGFYITSRTQFNSLQQLVAYYSKHADGL 6 | CHRLTTVCPTSKPQTQGLAKDAWEIPRESLRLEVKLGQGCFGEVWMGTWNGTTRVAIKTL 7 | KPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKY 8 | 
LRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYT 9 | ARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVER 10 | GYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFLEDYFTSTEPQYQPGENL 11 | 12 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_cbs.fa: -------------------------------------------------------------------------------- 1 | >sp|P35520|CBS_HUMAN Cystathionine beta-synthase OS=Homo sapiens OX=9606 GN=CBS PE=1 SV=2 2 | MPSETPQAEVGPTGCPHRSGPHSAKGSLEKGSPEDKEAKEPLWIRPDAPSRCTWQLGRPA 3 | SESPHHHTAPAKSPKILPDILKKIGDTPMVRINKIGKKFGLKCELLAKCEFFNAGGSVKD 4 | RISLRMIEDAERDGTLKPGDTIIEPTSGNTGIGLALAAAVRGYRCIIVMPEKMSSEKVDV 5 | LRALGAEIVRTPTNARFDSPESHVGVAWRLKNEIPNSHILDQYRNASNPLAHYDTTADEI 6 | LQQCDGKLDMLVASVGTGGTITGIARKLKEKCPGCRIIGVDPEGSILAEPEELNQTEQTT 7 | YEVEGIGYDFIPTVLDRTVVDKWFKSNDEEAFTFARMLIAQEGLLCGGSAGSTVAVAVKA 8 | AQELQEGQRCVVILPDSVRNYMTKFLSDRWMLQKGFLKEEDLTEKKPWWWHLRVQELGLS 9 | APLTVLPTITCGHTIEILREKGFDQAPVVDEAGVILGMVTLGNMLSSLLAGKVQPSDQVG 10 | KVIYKQFKQIRLTDTLGRLSHILEMDHFALVVHEQIQYHSTGKSSQRQMVFGVVTAIDLL 11 | NFVAAQERDQK 12 | 13 | -------------------------------------------------------------------------------- /meta/final_subtypes.yaml: -------------------------------------------------------------------------------- 1 | desc: "Final subtypes assembly \ 2 | The final subtypes set is assembled by combining two runs, \ 3 | both using hybrid dynamic treecutting on the dendrogram produced \ 4 | by hierarchical clustering. The cosine distance is used along with \ 5 | filtering positions where all |ER| < 0.4. Two runs are combined with \ 6 | deepSplit treecutting parameters chosen varying for each AA, as below." 
7 | deepSplit: 8 | "A": 0 9 | "C": 0 10 | "D": 1 11 | "E": 0 12 | "F": 1 13 | "G": 1 14 | "H": 0 15 | "I": 0 16 | "K": 1 17 | "L": 0 18 | "M": 0 19 | "N": 0 20 | "P": 1 21 | "Q": 1 22 | "R": 1 23 | "S": 0 24 | "T": 1 25 | "V": 0 26 | "W": 0 27 | "Y": 1 28 | -------------------------------------------------------------------------------- /data/pdb/README.md: -------------------------------------------------------------------------------- 1 | # Structures and Models from SwissMODEL 2 | 3 | These protein structure PDB files were sourced from [SwissMODEL](https://swissmodel.expasy.org). 4 | The model or structure chosen for each protein is documented in `meta/structures.yaml`, 5 | which gives the template ID chosen for each protein and the type of structure model (x-ray 6 | crystallography, homology model etc.). The Uniprot ID of each protein is available in 7 | the study YAML config. Those proteins denoted as "model" are homology models from 8 | SwissMODEL ([CC BY-SA 4.0 Creative Commons Attribution-ShareAlike 4.0 International License](https://swissmodel.expasy.org/docs/terms_of_use)) 9 | while the others are experimental structures from the PDB, as cited in the references. 10 | -------------------------------------------------------------------------------- /data/studies/ashenberg_2017_np/standardise_ashenberg_2017_np.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Ashenberg et al. 
2017 (Flu NP) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/ashenberg_2017_np/ashenberg_2017_np.yaml') 8 | dm_data <- read_csv('data/studies/ashenberg_2017_np/raw/journal.ppat.1006288.s013.csv') %>% 9 | rename(position = site, raw_score = diffsel) %>% 10 | mutate(transformed_score = raw_score, 11 | score = normalise_score(transformed_score), 12 | class = get_variant_class(wt, mut)) %>% 13 | drop_na(score) # some muts just aren't measured 14 | 15 | # Save output 16 | standardise_study(dm_data, meta$study, meta$transform) 17 | -------------------------------------------------------------------------------- /meta/fasta/H3N2_A_Aichi_2_1968_np.fa: -------------------------------------------------------------------------------- 1 | >gb:CY121120|ncbiId:AFM71861.1|UniProtKB:I6TAH8|Organism:Influenza A virus (A/Aichi/2/1968(H3N2))|Strain Name:A/Aichi/2/1968|Protein Name:NP Nucleoprotein|Gene Symbol:NP|Segment:5|Subtype:H3N2|Host:Human 2 | MASQGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTELKLSDYEGRLIQNSLTIERMVLSA 3 | FDERRNKYLEEHPSAGKDPKKTGGPIYKRVDRKWMRELVLYDKEEIRRIWRQANNGDDATAGLTHMMIWH 4 | SNLNDTTYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTMVMELIRMIKRGINDRNFWRGE 5 | NGRKTRSAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCLPACV 6 | YGPAVASGYDFEKEGYSLVGIDPFKLLQNSQVYSLIRPNENPAHKSQLVWMACNSAAFEDLRVLSFIRGT 7 | KVSPRGKLSTRGVQIASNENMDAMESSTLELRSRYWAIRTRSGGNTNQQRASAGQISVQPAFSVQRNLPF 8 | DKPTIMAAFTGNTEGRTSDMRAEIIRMMEGAKPEEMSFQGRGVFELSDERAANPIVPSFDMSNEGSYFFG 9 | DNAEEYDN 10 | 11 | -------------------------------------------------------------------------------- /data/studies/weile_2017_sumo1/weile_2017_sumo1.yaml: -------------------------------------------------------------------------------- 1 | study: 'weile_2017_sumo1' 2 | gene: 'SUMO1' 3 | uniprot_id: 'P63165' 4 | gene_type: 'PTM' 5 | species: 'H. 
sapiens' 6 | seq: "MSDQEAKPSTEDLGDKKEGEYIKLKVIGQDSSEIHFKVKMTTHLKKLKESYCQRQGVPMN\ 7 | SLRFLFEGQRIADNHTPKELGMEEEDVIEVYQEQTGGHSTV" 8 | experiment: 'Complement' 9 | transform: 'VAMP-seq' 10 | authour: 'Weile et al.' 11 | year: 2017 12 | title: 'A framework for exhaustively mapping functional missense variants' 13 | lab: ['Roth', 'Fowler'] 14 | doi: '10.15252/msb.20177908' 15 | pmid: '29269382' 16 | url: 'http://msb.embopress.org/content/13/12/957' 17 | input_files: 18 | - 'urn_mavedb_00000001-b-1_scores.csv' 19 | source: 'MaveDB' 20 | mavedb_urn: 'urn:mavedb:00000001-b' 21 | qc: 22 | filter: False 23 | notes: 24 | -------------------------------------------------------------------------------- /meta/fasta/strep_protein_g_precursor.fa: -------------------------------------------------------------------------------- 1 | >sp|P19909|SPG2_STRSG Immunoglobulin G-binding protein G OS=Streptococcus sp. group G OX=1320 GN=spg PE=1 SV=1 2 | MEKEKKVKYFLRKSAFGLASVSAAFLVGSTVFAVDSPIEDTPIIRNGGELTNLLGNSETT 3 | LALRNEESATADLTAAAVADTVAAAAAENAGAAAWEAAAAADALAKAKADALKEFNKYGV 4 | SDYYKNLINNAKTVEGVKDLQAQVVESAKKARISEATDGLSDFLKSQTPAEDTVKSIELA 5 | EAKVLANRELDKYGVSDYHKNLINNAKTVEGVKDLQAQVVESAKKARISEATDGLSDFLK 6 | SQTPAEDTVKSIELAEAKVLANRELDKYGVSDYYKNLINNAKTVEGVKALIDEILAALPK 7 | TDTYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEKPE 8 | VIDASELTPAVTTYKLVINGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDAT 9 | KTFTVTEKPEVIDASELTPAVTTYKLVINGKTLKGETTTKAVDAETAEKAFKQYANDNGV 10 | DGVWTYDDATKTFTVTEMVTEVPGDAPTEPEKPEASIPLVPLTPATPIAKDDAKKDDTKK 11 | EDAKKPEAKKEDAKKAETLPTTGEGSNPFFTAAALAVMAGAGALAVASKRKED 12 | -------------------------------------------------------------------------------- /data/studies/melamed_2013_pab1/standardise_melamed_2013_pab1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Melamed et al. 
2013 (PAB1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/melamed_2013_pab1/melamed_2013_pab1.yaml') 8 | dm_data <- read_xlsx('data/studies/melamed_2013_pab1/raw/Supplementary_Table_2.xlsx') %>% 9 | rename(wt = WT_aa) %>% 10 | gather(key = 'mut', value = 'raw_score', -position, -wt) %>% 11 | mutate(transformed_score = raw_score, 12 | score = normalise_score(transformed_score), 13 | class = get_variant_class(wt, mut)) %>% 14 | drop_na(score) # Not all measured 15 | 16 | # Save output 17 | standardise_study(dm_data, meta$study, meta$transform) 18 | 19 | -------------------------------------------------------------------------------- /meta/fasta/s_cerevisiae_pab1.fa: -------------------------------------------------------------------------------- 1 | >sp|P04147|PABP_YEAST Polyadenylate-binding protein, cytoplasmic and nuclear OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=PAB1 PE=1 SV=4 2 | MADITDKTAEQLENLNIQDDQKQAATGSESQSVENSSASLYVGDLEPSVSEAHLYDIFSP 3 | IGSVSSIRVCRDAITKTSLGYAYVNFNDHEAGRKAIEQLNYTPIKGRLCRIMWSQRDPSL 4 | RKKGSGNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAK 5 | EAIDALNGMLLNGQEIYVAPHLSRKERDSQLEETKAHYTNLYVKNINSETTDEQFQELFA 6 | KFGPIVSASLEKDADGKLKGFGFVNYEKHEDAVKAVEALNDSELNGEKLYVGRAQKKNER 7 | MHVLKKQYEAYRLEKMAKYQGVNLFVKNLDDSVDDEKLEEEFAPYGTITSAKVMRTENGK 8 | SKGFGFVCFSTPEEATKAITEKNQQIVAGKPLYVAIAQRKDVRRSQLAQQIQARNQMRYQ 9 | QATAAAAAAAAGMPGQFMPPMFYGVMPPRGVPFNGPNPQQMNPMGGMPKNGMPPQFRNGP 10 | VYGVPPQGGFPRNANDNNQFYQQKQRQALGEQLYKKVSAKTSNEEAAGKITGMILDLPPQ 11 | EVFPLLESDELFEQHYKEASAAYESFKKEQEQQTEQA 12 | 13 | -------------------------------------------------------------------------------- /bin/data_processing/standardise_study_template.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from STUDY 3 | 4 | ## README - delete in real script versions 5 | # Template script 
for standardising study data 6 | # Saves a standardised tsv /data/studies/{study}/{study}.tsv 7 | # With columns: position, wt, mut, score (which is the normalised score, all this processing is done here) 8 | # Transform score such that -1 = NULL, 0 = WT, +ve = beneficial 9 | # Tibble passed to standardise_study() must have at least columns position, wt, mut, score, raw_score 10 | 11 | source('src/config.R') 12 | source('src/study_standardising.R') 13 | 14 | # Import and process data 15 | meta <- read_yaml('IMPORT META YAML') 16 | dm_data <- read_csv('IMPORT STUDY DATA') 17 | 18 | # Save output 19 | standardise_study(dm_data, meta$study, meta$transform) -------------------------------------------------------------------------------- /data/studies/roscoe_2013_ubi/roscoe_2013_ubi.yaml: -------------------------------------------------------------------------------- 1 | study: 'roscoe_2013_ubi' 2 | gene: 'UBI' 3 | uniprot_id: 'P0CG63' 4 | gene_type: 'PTM' 5 | species: 'S. cerevisiae' 6 | seq: "MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN\ 7 | IQKESTLHLVLRLRGG" 8 | experiment: 'Growth' 9 | transform: 'None' 10 | authour: 'Roscoe et al.' 
11 | year: 2013 12 | title: 'Analyses of the effects of all ubiquitin point mutants on yeast growth rate' 13 | lab: ['Bolon'] 14 | doi: '10.1016/j.jmb.2013.01.032' 15 | pmid: '23376099' 16 | url: 'https://www.sciencedirect.com/science/article/pii/S0022283613000636' 17 | notes: 'Uniprot ID is for UBI4, took first repeat of sequence from here as very conserved' 18 | input_files: 19 | - '1-s2.0-S0022283613000636-mmc3.xlsx' 20 | source: 'SI - Table S2' 21 | qc: 22 | filter: False 23 | notes: 24 | -------------------------------------------------------------------------------- /meta/fasta/H3N2_A_Perth_16_2009_ha.fa: -------------------------------------------------------------------------------- 1 | >gb:KJ609206|ncbiId:AHX37629.1|Organism:Influenza A virus (A/Perth/16/2009(H3N2))|Strain Name:A/Perth/16/2009|Protein Name:HA Hemagglutinin|Gene Symbol:HA|Segment:4|Subtype:H3N2|Host:Human 2 | MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSSSTGEICDS 3 | PHQILDGKNCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNES 4 | FNWTGVTQNGTSSACIRRSKNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVLHPGTDKDQIFL 5 | YAQASGRITVSTKRSQQTVSPNIGSRPRVRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGK 6 | SSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNVPEKQTRGIFGA 7 | IAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEG 8 | RIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCD 9 | NACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNI 10 | RCNICI 11 | 12 | -------------------------------------------------------------------------------- /data/studies/weile_2017_calm1/weile_2017_calm1.yaml: -------------------------------------------------------------------------------- 1 | study: 'weile_2017_calm1' 2 | gene: 'CALM1' 3 | uniprot_id: 'P0DP23' 4 | gene_type: 'Regulatory' 5 | species: 'H. 
sapiens' 6 | seq: "MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADG\ 7 | NGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDE\ 8 | EVDEMIREADIDGDGQVNYEEFVQMMTAK" 9 | experiment: 'Complement' 10 | transform: 'VAMP-seq' 11 | authour: 'Weile et al.' 12 | year: 2017 13 | title: 'A framework for exhaustively mapping functional missense variants' 14 | lab: ['Roth', 'Fowler'] 15 | doi: '10.15252/msb.20177908' 16 | pmid: '29269382' 17 | url: 'http://msb.embopress.org/content/13/12/957' 18 | input_files: 19 | - 'urn_mavedb_00000001-c-2_scores.csv' 20 | source: 'MaveDB' 21 | mavedb_urn: 'urn:mavedb:00000001-c-2' 22 | qc: 23 | filter: False 24 | notes: 25 | -------------------------------------------------------------------------------- /data/studies/bandaru_2017_ras/bandaru_2017_ras.yaml: -------------------------------------------------------------------------------- 1 | study: 'bandaru_2017_ras' 2 | gene: 'Ras' 3 | uniprot_id: 'P01112' 4 | gene_type: 'GTPase' 5 | species: 'H. sapiens' 6 | seq: "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAG\ 7 | QEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHQYREQIKRVKDSDDVPMVLVGNKCDL\ 8 | AARTVESRQAQDLARSYGIPYIETSAKTRQGVEDAFYTLVREIRQHKLRKLNPPDESGPG\ 9 | CMSCKCVLS" 10 | experiment: 'Coupled growth' 11 | transform: 'x/ln(2)' 12 | authour: 'Bandaru et al.' 
13 | year: 2017 14 | title: 'Deconstruction of the Ras switching cycle through saturation mutagenesis' 15 | lab: ['Kuriyan', 'Valencia'] 16 | doi: '10.7554/eLife.27810' 17 | pmid: '28686159' 18 | url: 'https://elifesciences.org/articles/27810' 19 | input_files: 20 | - 'elife-27810-supp1-v2.xlsx' 21 | source: 'SI' 22 | qc: 23 | filter: False 24 | notes: 25 | -------------------------------------------------------------------------------- /data/studies/weile_2017_ube2i/weile_2017_ube2i.yaml: -------------------------------------------------------------------------------- 1 | study: 'weile_2017_ube2i' 2 | gene: 'UBE2I' 3 | uniprot_id: 'P63279' 4 | gene_type: 'E2 Conjugase' 5 | species: 'H. sapiens' 6 | seq: "MSGIALSRLAQERKAWRKDHPFGFVAVPTKNPDGTMNLMNWECAIPGKKGTPWEGGLFKL\ 7 | RMLFKDDYPSSPPKCKFEPPLFHPNVYPSGTVCLSILEEDKDWRPAITIKQILLGIQELL\ 8 | NEPNIQDPAQAEAYTIYCQNRVEYEKRVRAQAKKFAPS" 9 | experiment: 'Complement' 10 | transform: 'VAMP-seq' 11 | authour: 'Weile et al.' 12 | year: 2017 13 | title: 'A framework for exhaustively mapping functional missense variants' 14 | lab: ['Roth', 'Fowler'] 15 | doi: '10.15252/msb.20177908' 16 | pmid: '29269382' 17 | url: 'http://msb.embopress.org/content/13/12/957' 18 | input_files: 19 | - 'urn_mavedb_00000001-a-1_scores.csv' 20 | source: 'MaveDB' 21 | mavedb_urn: 'urn:mavedb:00000001-a' 22 | qc: 23 | filter: False 24 | notes: 25 | -------------------------------------------------------------------------------- /data/studies/bolognesi_2019_tdp43/standardise_bolognesi_2019_tdp43.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Bolognesi et al. 
#!/usr/bin/env Rscript
# Validate the selection experiment combination method for Kitzman et al. 2015 (GAL4)
# Draws a pairs plot of the per-selection log2 enrichment columns so their
# agreement can be inspected before they are averaged in the standardising script
source('src/config.R')
source('src/study_standardising.R')

# Create the output directory, including any missing parents, and stay quiet if it
# already exists (a plain dir.create() only warns and leaves ggsave() to fail later)
fig_dir <- 'figures/0_data/per_study/kitzman_2015_gal4'
dir.create(fig_dir, recursive = TRUE, showWarnings = FALSE)

# Import and process data
# Each sheet is read with read_kitzman_sheet(), stacked, and spread so that every
# value of 'label' becomes its own log2 enrichment column
path <- 'data/studies/kitzman_2015_gal4/raw/41592_2015_BFnmeth3223_MOESM306_ESM.xlsx'
dm_data <- lapply(excel_sheets(path), read_kitzman_sheet, path = path) %>%
  bind_rows(.) %>%
  spread(key = 'label', value = 'log2_enrichment') %>%
  filter(!mut == 'delInFrame') # In-frame deletions are excluded from the comparison

# Plot variants
p <- ggpairs(dm_data, columns = c('NONSEL_24h', 'SEL_A_24h', 'SEL_A_40h', 'SEL_B_40h', 'SEL_C_40h', 'SEL_C_64h'))
ggsave('figures/0_data/per_study/kitzman_2015_gal4/validate_selection_combination.pdf', p, units = 'cm', width = 25, height = 25)
EVDEEEEEKKPKTKKVKEEVQELEELNKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPL 7 | YVKHFSVEGQLEFRAILFIPKRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSF 8 | VKGVVDSEDLPLNLSREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFDKFYSAFAKN 9 | IKLGVHEDTQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGEFLKAVEK 10 | SPFLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAEREKEI 11 | KEYEPLTKALKDILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMKAQALRDSSMS 12 | SYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTNLLFETALLTSGFSLEEPTSF 13 | ASRINRLISLGLNIDEDEETETAPEASTEAPVEEVPADTEMEEVD 14 | -------------------------------------------------------------------------------- /meta/fasta/s_cerevisiae_ste12.fa: -------------------------------------------------------------------------------- 1 | >sp|P13574|STE12_YEAST Protein STE12 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=STE12 PE=1 SV=1 2 | MKVQITNSRTEEILKVQANNENDEVSKATPGEVEESLRLIGDLKFFLATAPVNWQENQII 3 | RRYYLNSGQGFVSCVFWNNLYYITGTDIVKCCLYRMQKFGREVVQKKKFEEGIFSDLRNL 4 | KCGIDATLEQPKSEFLSFLFRNMCLKTQKKQKVFFWFSVAHDKLFADALERDLKRESLNQ 5 | PSTTKPVNEPALSFSYDSSSDKPLYDQLLQHLDSRRPSSTTKSDNSPPKLESENFKDNEL 6 | VTVTNQPLLGVGLMDDDAPESPSQINDFIPQKLIIEPNTLELNGLTEETPHDLPKNTAKG 7 | RDEEDFPLDYFPVSVEYPTEENAFDPFPPQAFTPAAPSMPISYDNVNERDSMPVNSLLNR 8 | YPYQLSVAPTFPVPPSSSRQHFMTNRDFYSSNNNKEKLVSPSDPTSYMKYDEPVMDFDES 9 | RPNENCTNAKSHNSGQQTKQHQLYSNNFQQSYPNGMVPGYYPKMPYNPMGGDPLLDQAFY 10 | GADDFFFPPEGCDNNMLYPQTATSWNVLPPQAMQPAPTYVGRPYTPNYRSTPGSAMFPYM 11 | QSSNSMQWNTAVSPYSSRAPSTTAKNYPPSTFYSQNINQYPRRRTVGMKSSQGNVPTGNK 12 | QSVGKSAKISKPLHIKTSAYQKQYKINLETKARPSAGDEDSAHPDKNKEISMPTPDSNTL 13 | VVQSEEGGAHSLEVDTNRRSDKNLPDAT 14 | 15 | -------------------------------------------------------------------------------- /data/studies/bandaru_2017_ras/standardise_bandaru_2017_ras.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Bandaru et al. 
#!/usr/bin/env Rscript
# Functions for characterising Deep Mutational Scanning positions based on continuous ER gradient

# Process the various factors

# Classify factor names into broad factor types.
#
# x: character vector of factor names
# Returns a vector the same length as x containing 'SA' (x is 'all_atom_abs'),
# 'FoldX' (x is a name of FOLDX_TERMS), 'DSSP' (x starts with 'ss_'),
# 'Chem. Env.' (x starts with 'within_10_0') or NA when nothing matches.
# Later rules overwrite earlier ones if a name matches more than one.
get_factor_type <- function(x){
  out <- rep(NA_character_, length(x))
  out[x == 'all_atom_abs'] <- 'SA'
  out[x %in% names(FOLDX_TERMS)] <- 'FoldX'
  out[str_starts(x, 'ss_')] <- 'DSSP'
  out[str_starts(x, 'within_10_0')] <- 'Chem. Env.'

  return(out)
}

# Convert factor names into display labels.
#
# x: character vector of factor names, as accepted by get_factor_type()
# Returns a vector the same length as x: 'All Atom Abs.' for the SA factor, the
# FOLDX_TERMS entry for FoldX factors, the DSSP_CLASSES_STR entry keyed by the text
# after the 'ss_' prefix for DSSP factors, and the final character of the name for
# chemical environment factors. Unrecognised factors give NA.
pretty_factors <- function(x){
  out <- rep(NA_character_, length(x))
  typ <- get_factor_type(x)

  # The labels tested here must match those emitted by get_factor_type().
  # which() discards NA comparisons (unrecognised factors), which would otherwise
  # raise "NAs are not allowed in subscripted assignments" in the multi-value
  # assignments below.
  sa_idx <- which(typ == 'SA')
  foldx_idx <- which(typ == 'FoldX')
  dssp_idx <- which(typ == 'DSSP')
  chem_idx <- which(typ == 'Chem. Env.')

  out[sa_idx] <- 'All Atom Abs.'
  out[foldx_idx] <- FOLDX_TERMS[x[foldx_idx]]
  out[dssp_idx] <- DSSP_CLASSES_STR[str_sub(x[dssp_idx], start = 4)] # Drop the 'ss_' prefix
  out[chem_idx] <- str_sub(x[chem_idx], start = -1) # Keep only the last character

  return(out)
}
#!/usr/bin/env Rscript
# Standardise data from Matreyek et al. 2018 (TPMT)

source('src/config.R')
source('src/study_standardising.R')

# Load study metadata
meta <- read_yaml('data/studies/matreyek_2018_tpmt/matreyek_2018_tpmt.yaml')

# Read every column as text except the integer position and the numeric score
tpmt_col_types <- cols(.default = col_character(), position = col_integer(), score = col_double())
raw_tbl <- read_csv('data/studies/matreyek_2018_tpmt/raw/TPMT.csv',
                    col_types = tpmt_col_types)

# Drop the X1 column and move to the standard column names
raw_tbl <- select(raw_tbl, -X1)
raw_tbl <- rename(raw_tbl, wt = start, mut = end, raw_score = score)

# Recode 'X' mutants as '*', title-case the class labels supplied in the file,
# then apply the VAMP-seq transform and normalise
dm_data <- mutate(raw_tbl,
                  mut = if_else(mut == 'X', '*', mut),
                  class = str_to_title(class),
                  transformed_score = transform_vamp_seq(raw_score),
                  score = normalise_score(transformed_score))
dm_data <- drop_na(dm_data, score) # not all measured

# Save output
standardise_study(dm_data, meta$study, meta$transform)
#!/usr/bin/env Rscript
# Standardise data from Jiang et al. 2013 (HSP90)
source('src/config.R')
source('src/study_standardising.R')

# Load study metadata and split the reference sequence into single residues
meta <- read_yaml('data/studies/jiang_2013_hsp90/jiang_2013_hsp90.yaml')
wt_residues <- str_split(meta$seq, '')[[1]]

# Read the supplementary table, skipping its first two rows
raw_tbl <- read_xlsx('data/studies/jiang_2013_hsp90/raw/journal.pgen.1003600.s014.xlsx', skip = 2)

# Remove the auto-named 10th column, lower-case the headers and shorten the long names
raw_tbl <- select(raw_tbl, -...10)
raw_tbl <- rename_all(raw_tbl, tolower)
raw_tbl <- rename(raw_tbl,
                  mut = `amino acid`,
                  sd = `standard deviation`)

dm_data <- mutate(raw_tbl,
                  # Strip '<' so that entries such as <0.034 are set to 0.034
                  raw_score = as.numeric(str_remove(average, '<')),
                  transformed_score = log2(raw_score),
                  score = normalise_score(transformed_score),
                  # WT residue is looked up by position in the metadata sequence
                  wt = wt_residues[position],
                  class = get_variant_class(wt, mut))

# Save output
standardise_study(dm_data, meta$study, meta$transform)
-------------------------------------------------------------------------------- /data/studies/olson_2014_proteing/olson_2014_proteing.yaml: -------------------------------------------------------------------------------- 1 | study: 'olson_2014_proteing' 2 | gene: 'Protein G' 3 | domain: 'GB1' 4 | uniprot_id: 'P19909' 5 | gene_type: 'Immune' 6 | species: 'Streptococcus' 7 | seq: "MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE" 8 | experiment: 'Ligand binding' 9 | transform: 'None (Processed from counts)' 10 | authour: 'Olson et al.' 11 | year: 2014 12 | title: 'A Comprehensive Biophysical Description of Pairwise Epistasis throughout an Entire Protein Domain' 13 | lab: ['Sun'] 14 | doi: '10.1016/j.cub.2014.09.072' 15 | pmid: '25455030' 16 | url: 'https://www.sciencedirect.com/science/article/pii/S0960982214012688' 17 | notes: "Uniprot ID is for the Protein G precursor which has a slightly different sequence. \ 18 | The sequence given here is just for the GB1 domain, as given in the paper." 19 | input_files: 20 | - '1-s2.0-S0960982214012688-mmc2.xlsx' 21 | source: 'SI - Table S2' 22 | qc: 23 | filter: False 24 | notes: 25 | -------------------------------------------------------------------------------- /data/studies/roscoe_2014_ubi/roscoe_2014_ubi.yaml: -------------------------------------------------------------------------------- 1 | study: 'roscoe_2014_ubi' 2 | gene: 'UBI' 3 | uniprot_id: 'P0CG63' 4 | gene_type: 'PTM' 5 | species: 'S. 
cerevisiae' 6 | seq: "MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYN\ 7 | IQKESTLHLVLRLRGG" 8 | experiment: 'Reactivity' 9 | transform: 'None' 10 | authour: 'Roscoe & Bolon' 11 | year: 2014 12 | title: 'Systematic Exploration of Ubiquitin Sequence, E1 Activation Efficiency, and Experimental Fitness in Yeast' 13 | lab: ['Bolon'] 14 | doi: '10.1016/j.jmb.2014.05.019' 15 | pmid: '24862281' 16 | url: 'https://www.sciencedirect.com/science/article/pii/S0022283614002587' 17 | notes: "Uniprot ID is for UBI4, took first repeat of sequence from here as very conserved. 18 | Also have data for excess E1, but for 18 rather than 75 positions, so I only use 19 | the limiting case" 20 | input_files: 21 | - '1-s2.0-S0022283614002587-mmc3.xlsx' 22 | source: 'SI - Table S2' 23 | qc: 24 | filter: False 25 | notes: 26 | -------------------------------------------------------------------------------- /data/studies/matreyek_2018_tpmt/matreyek_2018_tpmt.yaml: -------------------------------------------------------------------------------- 1 | study: 'matreyek_2018_tpmt' 2 | gene: 'TPMT' 3 | uniprot_id: 'P51580' 4 | gene_type: 'Methyltransferase' 5 | species: 'H. sapiens' 6 | seq: "MDGTRTSLDIEEYSDTEVQKNQVLTLEEWQDKWVNGKTAFHQEQGHQLLKKHLD\ 7 | TFLKGKSGLRVFFPLCGKAVEMKWFADRGHSVVGVEISELGIQEFFTEQNLSYS\ 8 | EEPITEIPGTKVFKSSSGNISLYCCSIFDLPRTNIGKFDMIWDRGALVAINPGD\ 9 | RKCYADTMFSLLGKKFQYLLCVLSYDPTKHPGPPFYVPHAEIERLFGKICNIRC\ 10 | LEKVDAFEERHKSWGIDCLFEKLYLLTEK" 11 | experiment: 'VAMP-seq' 12 | transform: "VAMP-seq" 13 | authour: 'Matreyek et al.' 
14 | year: 2018 15 | title: 'Multiplex assessment of protein variant abundance by massively parallel sequencing' 16 | lab: ['Fowler', 'Shendure'] 17 | doi: '10.1038/s41588-018-0122-z' 18 | pmid: '29785012' 19 | url: 'https://www.nature.com/articles/s41588-018-0122-z' 20 | input_files: 21 | - 'TPMT.csv' 22 | source: 'Fowler Lab website: https://abundance.gs.washington.edu/shiny/stability/' 23 | qc: 24 | filter: False 25 | notes: 26 | -------------------------------------------------------------------------------- /data/studies/steinberg_2016_tem1/steinberg_2016_tem1.yaml: -------------------------------------------------------------------------------- 1 | study: 'steinberg_2016_tem1' 2 | gene: 'TEM1' 3 | uniprot_id: 'Q6SJ61' 4 | gene_type: 'Metabolic' 5 | species: 'E. coli' 6 | seq: "MSIQHFRVALIPFFAAFCLPVFAHPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRP\ 7 | EERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVREL\ 8 | CSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTM\ 9 | PAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGS\ 10 | RGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW" 11 | experiment: 'Concentration of beta-lactam' 12 | transform: 'log2(x)' 13 | authour: 'Steinberg & Ostermeier' 14 | year: 2016 15 | title: 'Shifting Fitness and Epistatic Landscapes Reflect Trade-offs along an Evolutionary Pathway' 16 | lab: ['Ostermeier'] 17 | doi: '10.1016/j.jmb.2016.04.033' 18 | pmid: '27173379' 19 | url: 'https://www.sciencedirect.com/science/article/pii/S0022283616301450' 20 | input_files: 21 | - '1-s2.0-S0022283616301450-mmc2.xlsx' 22 | source: 'SI' 23 | qc: 24 | filter: False 25 | notes: 26 | -------------------------------------------------------------------------------- /data/studies/firnberg_2014_tem1/firnberg_2014_tem1.yaml: -------------------------------------------------------------------------------- 1 | study: 'firnberg_2014_tem1' 2 | gene: 'TEM1' 3 | uniprot_id: 'Q6SJ61' 4 | gene_type: 'Metabolic' 5 | species: 'E. 
#!/usr/bin/env Rscript
# Standardise data from Mishra et al. 2016 (HSP90)
source('src/config.R')
source('src/study_standardising.R')

# Load study metadata and split the reference sequence into single residues
meta <- read_yaml('data/studies/mishra_2016_hsp90/mishra_2016_hsp90.yaml')
wt_residues <- str_split(meta$seq, '')[[1]]

# Read every sheet of the workbook with read_mishra_sheet() and stack the results
path <- 'data/studies/mishra_2016_hsp90/raw/1-s2.0-S2211124716303175-mmc2.xlsx'
sheet_tbls <- lapply(excel_sheets(path), read_mishra_sheet, path = path)
all_sheets <- bind_rows(sheet_tbls)

# Keep only the columns needed, under the standard names
scores <- select(all_sheets, position, mut=aa, raw_score=avg)

scores <- mutate(scores,
                 wt = wt_residues[position],
                 raw_score = na_if(raw_score, -999), # -999 is converted to NA
                 transformed_score = raw_score,
                 score = normalise_score(transformed_score),
                 class = get_variant_class(wt, mut))

# Fix the column order, sort by position then substitution, and drop unmeasured variants
scores <- select(scores, position, wt, mut, transformed_score, raw_score, score, class)
scores <- arrange(scores, position, mut)
dm_data <- drop_na(scores, score) # Some not measured

# Save output
standardise_study(dm_data, meta$study, meta$transform)
#!/usr/bin/env Rscript
# Standardise data from Kitzman et al. 2015 (GAL4)
source('src/config.R')
source('src/study_standardising.R')

# Import and process data
meta <- read_yaml('data/studies/kitzman_2015_gal4/kitzman_2015_gal4.yaml')
path <- 'data/studies/kitzman_2015_gal4/raw/41592_2015_BFnmeth3223_MOESM306_ESM.xlsx'

# Read each sheet with read_kitzman_sheet(), stack them, and spread so that every
# value of 'label' becomes its own log2 enrichment column
sheet_tbls <- lapply(excel_sheets(path), read_kitzman_sheet, path = path)
enrichment <- spread(bind_rows(sheet_tbls), key = 'label', value = 'log2_enrichment')

# Average over replicates (diff times found to still correlate well)
selection_cols <- select(enrichment, SEL_A_24h, SEL_A_40h, SEL_B_40h, SEL_C_40h, SEL_C_64h)
replicate_mean <- rowMeans(selection_cols, na.rm = TRUE)

# NOTE(review): replace_na(..., NA) presumably turns the NaN produced for rows with no
# measurements into NA - confirm against the installed tidyr version
dm_data <- mutate(enrichment,
                  raw_score = replace_na(replicate_mean, NA),
                  transformed_score = raw_score,
                  score = normalise_score(transformed_score),
                  class = get_variant_class(wt, mut))

# In-frame deletions are removed only after scoring, so they are still present when
# normalise_score() runs; this order is kept deliberately
dm_data <- filter(dm_data, mut != 'delInFrame')
dm_data <- drop_na(dm_data, score) # Some variants not measured

# Save output
standardise_study(dm_data, meta$study, meta$transform)
1 | study: 'sarkisyan_2016_gfp' 2 | gene: 'GFP' 3 | uniprot_id: 'P42212' 4 | gene_type: 'Flourescent Protein' 5 | species: 'A. victoria' 6 | seq: "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTL\ 7 | VTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLV\ 8 | NRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLAD\ 9 | HYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK" 10 | experiment: 'Flourescence level' 11 | transform: 'log2(x/F_wt)' 12 | authour: 'Sarkisyan et al.' 13 | year: 2016 14 | title: 'Local fitness landscape of the green fluorescent protein' 15 | lab: ['Kondrashov'] 16 | doi: '10.1038/nature17995' 17 | pmid: '27193686' 18 | url: 'https://www.nature.com/articles/nature17995' 19 | notes: "Used a GFP with F64L, which is reflected in the sequence we use" 20 | input_files: 21 | - 'amino_acid_genotypes_to_brightness.tsv' 22 | source: 'SI - Figshare (https://figshare.com/articles/Local_fitness_landscape_of_the_green_fluorescent_protein/3102154)' 23 | qc: 24 | filter: True 25 | notes: "Low coverage" 26 | -------------------------------------------------------------------------------- /meta/fasta/s_cerevisiae_gal4.fa: -------------------------------------------------------------------------------- 1 | >sp|P04386|GAL4_YEAST Regulatory protein GAL4 OS=Saccharomyces cerevisiae (strain ATCC 204508 / S288c) OX=559292 GN=GAL4 PE=1 SV=2 2 | MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKRSPLTRAHLTEVESR 3 | LERLEQLFLLIFPREDLDMILKMDSLQDIKALLTGLFVQDNVNKDAVTDRLASVETDMPL 4 | TLRQHRISATSSSEESSNKGQRQLTVSIDSAAHHDNSTIPLDFMPRDALHGFDWSEEDDM 5 | SDGLPFLKTDPNNNGFFGDGSLLCILRSIGFKPENYTNSNVNRLPTMITDRYTLASRSTT 6 | SRLLQSYLNNFHPYCPIVHSPTLMMLYNNQIEIASKDQWQILFNCILAIGAWCIEGESTD 7 | IDVFYYQNAKSHLTSKVFESGSIILVTALHLLSRYTQWRQKTNTSYNFHSFSIRMAISLG 8 | LNRDLPSSFSDSSILEQRRRIWWSVYSWEIQLSLLYGRSIQLSQNTISFPSSVDDVQRTT 9 | TGPTIYHGIIETARLLQVFTKIYELDKTVTAEKSPICAKKCLMICNEIEEVSRQAPKFLQ 10 | MDISTTALTNLLKEHPWLSFTRFELKWKQLSLIIYVLRDFFTNFTQKKSQLEQDQNDHQS 11 | 
YEVKRCSIMLSDAAQRTVMSVSSYMDNHNVTPYFAWNCSYYLFNAVLVPIKTLLSNSKSN 12 | AENNETAQLLQQINTVLMLLKKLATFKIQTCEKYIQVLEEVCAPFLLSQCAIPLPHISYN 13 | NSNGSAIKNIVGSATIAQYPTLPEENVNNISVKYVSPGSVGPSPVPLKSGASFSDLVKLL 14 | SNRPPSRNSPVTIPRSTPSHRSVTPFLGQQQQLQSLVPLTPSALFGGANFNQSGNIADSS 15 | LSFTFTNSSNGPNLITTQTNSQALSQPIASSNVHDNFMNNEITASKIDDGNNSKPLSPGW 16 | TDQTAYNAFGITTGMFNTTTMDDVYNYLFDDEDTPPNPKKE 17 | 18 | -------------------------------------------------------------------------------- /data/studies/findlay_2018_brca1/standardise_findlay_2018_brca1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Findlay et al. 2018 (BRCA1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/findlay_2018_brca1/findlay_2018_brca1.yaml') 8 | dm_data <- read_xlsx('data/studies/findlay_2018_brca1/raw/41586_2018_461_MOESM3_ESM.xlsx', skip = 2, na = 'NA') %>% 9 | rename_all(list( ~ gsub('[\\/ \\(\\)]+', '_', .))) %>% 10 | rename(wt_nuc = reference, 11 | mut_nuc = alt, 12 | wt = aa_ref, 13 | mut = aa_alt, 14 | position = aa_pos) %>% 15 | drop_na(position) %>% 16 | group_by(position, wt, mut) %>% 17 | summarise(raw_score = mean(function.score.mean, na.rm=TRUE)) %>% 18 | ungroup() %>% 19 | mutate(transformed_score = raw_score, 20 | score = normalise_score(transformed_score), 21 | class = get_variant_class(wt, mut)) %>% 22 | select(position, wt, mut, score, transformed_score, raw_score, class) 23 | 24 | # Save output 25 | standardise_study(dm_data, meta$study, meta$transform) 26 | -------------------------------------------------------------------------------- /data/studies/hartman_2018_cp/hartman_2018_cp.yaml: -------------------------------------------------------------------------------- 1 | study: 'hartman_2018_cp' 2 | gene: 'CP' 3 | uniprot_id: 'P03612' 4 | gene_type: 'Viral Coat' 5 | species: 'Bacteriophage MS2' 6 | seq: 
"MASNFTQFVLVDNGGTGDVTVAPSNFANGVAEWISSNSRSQAYKVTCSVRQSSAQNRKYT\ 7 | IKVEVPKVATQTVGGVELPVAAWRSYLNMELTIPIFATNSDCELIVKAMQGLLKDGNPIP\ 8 | SAIAANSGIY" 9 | experiment: 'Viral coat assembly' 10 | transform: '(x - mean(x_wt))/log10(2)' 11 | authour: 'Hartman et al.' 12 | year: 2018 13 | title: 'Quantitative characterization of all single amino acid variants of a viral capsid-based drug delivery vehicle' 14 | lab: ['Tullman-Ercek'] 15 | doi: '10.1038/s41467-018-03783-y' 16 | pmid: '29643335' 17 | url: 'https://www.nature.com/articles/s41467-018-03783-y' 18 | notes: "In their method they arbitrarily set variants with 0 reads to a certain score, \ 19 | but this is much lower than the rest of the scale and skews everything. Here \ 20 | we set it (again rather arbitrarily, but less so than them) to just below the \ 21 | lowest measured score (-2.5)" 22 | input_files: 23 | - '41467_2018_3783_MOESM4_ESM.xlsx' 24 | source: 'SI' 25 | qc: 26 | filter: False 27 | notes: 28 | -------------------------------------------------------------------------------- /data/studies/brenan_2016_mapk1/standardise_brenan_2016_mapk1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Brenan et al. 
2016 (MAPK1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/brenan_2016_mapk1/brenan_2016_mapk1.yaml') 8 | dm_data <- read_xlsx('data/studies/brenan_2016_mapk1/raw/1-s2.0-S2211124716313171-mmc2.xlsx', sheet = 'Supplemental_Table_1') %>% 9 | rename_all(list( ~ gsub(' ', '_', tolower(.)))) %>% 10 | rename(wt = wt_aa, mut = mutant_aa, position = erk2_residue) %>% 11 | mutate(raw_score = `lfc_(etp_vs._dox)`, # Only general condition, other two are for specific drugs 12 | transformed_score = -raw_score, # The selection scheme they used favoured lof > wt > gof 13 | score = normalise_score(transformed_score), 14 | class = get_variant_class(wt, mut)) %>% 15 | mutate_at(vars(nuc_acid_changes, dox_rank, sch_rank, vrt_rank, vrt_specific_allele, sch_specific_allele), as.integer) %>% 16 | select(position, wt, mut, score, transformed_score, raw_score, class) 17 | 18 | # Save output 19 | standardise_study(dm_data, meta$study, meta$transform) 20 | -------------------------------------------------------------------------------- /data/studies/wrenbeck_2017_amie/wrenbeck_2017_amie.yaml: -------------------------------------------------------------------------------- 1 | study: 'wrenbeck_2017_amie' 2 | gene: 'amiE' 3 | uniprot_id: 'P11436' 4 | gene_type: 'Metabolic' 5 | species: 'P. aeruginosa' 6 | seq: "MRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEY\ 7 | SLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLV\ 8 | LIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAM\ 9 | KGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDG\ 10 | RTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAEC\ 11 | PFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA" 12 | experiment: 'Coupled growth' 13 | transform: 'None' 14 | authour: 'Wrenbeck et al.' 
15 | year: 2017 16 | title: 'Single-mutation fitness landscapes for an enzyme on multiple substrates reveal specificity is globally encoded' 17 | lab: ['Whitehead'] 18 | doi: '10.1038/ncomms15695' 19 | pmid: '28585537' 20 | url: 'https://www.nature.com/articles/ncomms15695' 21 | input_files: 22 | - 'amiESelectionFitnessData_Acetamide.txt' 23 | - 'amiESelectionFitnessData_Isobutyramide.txt' 24 | - 'amiESelectionFitnessData_Propionamide.txt' 25 | source: 'SI' 26 | qc: 27 | filter: False 28 | notes: 29 | -------------------------------------------------------------------------------- /data/studies/matreyek_2018_pten/matreyek_2018_pten.yaml: -------------------------------------------------------------------------------- 1 | study: 'matreyek_2018_pten' 2 | gene: 'PTEN' 3 | uniprot_id: 'P60484' 4 | gene_type: 'Phosphatase' 5 | species: 'H. sapiens' 6 | seq: "MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVYRNNIDDVV\ 7 | RFLDSKHKNHYKIYNLCAERHYDTAKFNCRVAQYPFEDHNPPQLELIKPFCEDL\ 8 | DQWLSEDDNHVAAIHCKAGKGRTGVMICAYLLHRGKFLKAQEALDFYGEVRTRD\ 9 | KKGVTIPSQRRYVYYYSYLLKNHLDYRPVALLFHKMMFETIPMFSGGTCNPQFV\ 10 | VCQLKVKIYSSNSGPTRREDKFMYFEFPQPLPVCGDIKVEFFHKQNKMLKKDKM\ 11 | FHFWVNTFFIPGPEETSEKVENGSLCDQEIDSICSIERADNDKEYLVLTLTKND\ 12 | LDKANKDKANRYFSPNFKVKLYFTKTVEEPSNPEASSSTSVTPDVSDNEPDHYR\ 13 | YSDTTDSDPENEPFDEDQHTQITKV" 14 | experiment: 'VAMP-seq' 15 | transform: "VAMP-seq" 16 | authour: 'Matreyek et al.' 
17 | year: 2018 18 | title: 'Multiplex assessment of protein variant abundance by massively parallel sequencing' 19 | lab: ['Fowler', 'Shendure'] 20 | doi: '10.1038/s41588-018-0122-z' 21 | pmid: '29785012' 22 | url: 'https://www.nature.com/articles/s41588-018-0122-z' 23 | input_files: 24 | - 'PTEN.csv' 25 | source: 'Fowler Lab site: https://abundance.gs.washington.edu/shiny/stability/' 26 | qc: 27 | filter: False 28 | notes: 29 | -------------------------------------------------------------------------------- /data/studies/jones_2019_adrb2/jones_2019_adrb2.yaml: -------------------------------------------------------------------------------- 1 | study: 'jones_2019_adrb2' 2 | gene: 'ADRB2' 3 | uniprot_id: 'P07550' 4 | gene_type: 'GPCR' 5 | species: 'H. sapiens' 6 | seq: "MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLV\ 7 | ITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEF\ 8 | WTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSG\ 9 | LTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMV\ 10 | FVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHK\ 11 | ALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPL\ 12 | IYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLC\ 13 | EDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL" 14 | experiment: 'Expression of reporter gene' 15 | transform: "log2(x/mean(x_{subs with blosum62 >= 2}))" 16 | authour: 'Jones et al.' 
17 | year: 2019 18 | title: 'Structural and Functional Characterization of G Protein-Coupled Receptors with Deep Mutational Scanning' 19 | lab: ['Kosuri', 'Dror', 'Babu'] 20 | doi: 'https://doi.org/10.1101/623108' 21 | pmid: 22 | url: 'https://www.biorxiv.org/content/10.1101/623108v2.full' 23 | input_files: 24 | - 'lib-med.csv' 25 | source: 'SI' 26 | qc: 27 | filter: False 28 | notes: 29 | -------------------------------------------------------------------------------- /meta/residue_hydrophobicity.tsv: -------------------------------------------------------------------------------- 1 | # Various protein hydrophbicity scales as found in Bandyopadhyay & Mehler 2008 (https://onlinelibrary.wiley.com/doi/full/10.1002/prot.21958) 2 | AA TW Faupl24 Abodr25 Rose34 Ponnu8 Mijer35 KyteDo36 White4 Eisen2 3 | C 1.15 1.54 NA 0.91 14.93 7.93 2.5 -0.02 0.38 4 | I 0.97 1.8 9.3 0.88 14.77 8.83 4.5 -1.12 1.90 5 | L 0.87 1.7 10.0 0.85 14.10 8.47 3.8 -1.25 1.90 6 | F 0.85 1.79 9.6 0.88 13.43 9.03 2.8 -1.71 2.30 7 | V 0.83 1.22 8.5 0.86 15.07 7.73 4.2 -0.46 1.50 8 | W 0.67 2.25 9.2 0.85 12.95 7.66 -0.9 -2.09 2.60 9 | Y 0.60 0.96 8.0 0.76 13.29 5.89 -1.3 -0.71 1.60 10 | M 0.54 1.23 8.7 0.85 14.33 8.95 1.9 -0.67 2.40 11 | A 0.33 0.31 5.1 0.74 12.28 5.33 1.8 0.50 0.67 12 | P 0.32 0.72 4.9 0.64 11.19 3.87 -1.6 0.14 1.20 13 | H 0.25 0.13 1.6 0.78 12.84 5.1 -3.2 2.33 0.64 14 | T 0.21 0.26 3.5 0.70 11.65 4.49 -0.7 0.25 0.52 15 | S 0.05 -0.04 3.1 0.66 11.26 4.09 -0.8 0.46 0.01 16 | R -0.01 -1.01 2.0 0.64 11.49 4.18 -4.5 1.81 -2.10 17 | Q -0.05 -0.22 1.4 0.62 11.28 3.87 -3.5 0.77 -0.22 18 | N -0.07 -0.6 0.6 0.63 11.00 3.71 -3.5 0.85 -0.6 19 | D -0.22 -0.77 0.7 0.62 10.97 3.59 -3.5 3.64 -1.2 20 | E -0.24 -0.64 1.8 0.62 11.19 3.65 -3.5 3.63 -0.76 21 | K -0.40 -0.99 1.3 0.52 10.8 2.95 -3.9 2.8 -0.57 22 | -------------------------------------------------------------------------------- /bin/figures/figureS5.R: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env Rscript 2 | # Produce figure S5 (Proportion of variance) 3 | source('src/config.R') 4 | 5 | dms <- read_tsv('data/combined_mutational_scans.tsv') 6 | 7 | pca <- tibble_pca(dms, A:Y) 8 | 9 | pca_summary <- tibble(pc=0:20, sd=c(0, pca$sdev)) %>% 10 | mutate(prop_var = sd^2/sum(sd^2), 11 | cum_var = cumsum(prop_var)) 12 | 13 | figure <- ggplot(pca_summary, aes(x = pc)) + 14 | geom_col(aes(y = prop_var, fill = 'Explained\nVariance')) + 15 | geom_line(aes(y = cum_var, colour = 'Cumulative\nExplained\nVariance')) + 16 | scale_fill_manual(values = c(`Explained\nVariance`='cornflowerblue'), name = '') + 17 | scale_colour_manual(values = c(`Cumulative\nExplained\nVariance`='red'), name = '') + 18 | labs(x = 'Principal Component', y = 'Proportion of Variance') 19 | ggsave('figures/4_figures/figureS5.pdf', figure, width = 183, height = 100, units = 'mm') 20 | ggsave('figures/4_figures/figureS5.png', figure, width = 183, height = 100, units = 'mm') 21 | ggsave('figures/4_figures/figureS5.tiff', figure, width = 183, height = 100, units = 'mm') 22 | ggsave('figures/4_figures/figureS5.eps', figure, width = 183, height = 100, units = 'mm', device=cairo_ps, fallback_resolution = 600) 23 | -------------------------------------------------------------------------------- /data/studies/kelsic_2016_infa/standardise_kelsic_2016_infa.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Kelsic et al. 
2016 (infA) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/kelsic_2016_infa/kelsic_2016_infa.yaml') 8 | dm_data <- read_csv('data/studies/kelsic_2016_infa/raw/cels_206_mmc5.csv', skip = 1, 9 | col_names = c('codon', 'mut', 'position', 'is_wt', 'raw_score', 10 | 'sd', 'fitness_rich', 11 | 'fitness_stdev_rich', 'RCS', 12 | 'mfe_ddG_43nt_sliding', 'tmp')) %>% 13 | select(codon:sd) %>% 14 | mutate(transformed_score = transform_vamp_seq(raw_score)) %>% 15 | group_by(position, mut) %>% 16 | summarise_at(vars(transformed_score, raw_score), mean, na.rm=TRUE) %>% 17 | ungroup() %>% 18 | mutate(score = normalise_score(transformed_score), 19 | wt = str_split(meta$seq, '')[[1]][position], 20 | class = get_variant_class(wt, mut)) %>% 21 | drop_na(wt, position) # measured codons for stop codon, which we don't want 22 | 23 | # Save output 24 | standardise_study(dm_data, meta$study, meta$transform) 25 | -------------------------------------------------------------------------------- /data/studies/hartman_2018_cp/standardise_hartman_2018_cp.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Hartman et al. 
2018 (CP) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/hartman_2018_cp/hartman_2018_cp.yaml') 8 | dm_data <- read_xlsx('data/studies/hartman_2018_cp/raw/41467_2018_3783_MOESM4_ESM.xlsx', skip = 1, na = 'Not Observed') %>% 9 | rename(position = `Residue #`) %>% 10 | pivot_longer(-position, names_to = 'mut', values_to = 'raw_score') %>% 11 | drop_na(raw_score) %>% # Not all variants measured 12 | mutate(position = position + 1, 13 | wt = str_split(meta$seq, '')[[1]][position], 14 | class = get_variant_class(wt, mut), 15 | raw_score = ifelse(raw_score == -4, -2.5, raw_score), # Set nulls to a value closer to rest of scale (they already set this arbitrarily) 16 | transformed_score = raw_score/log10(2), # Transform base 17 | transformed_score = transformed_score - mean(transformed_score[class == 'Synonymous'], na.rm=TRUE), # 'divide' (in log domain) by WT scores 18 | score = normalise_score(transformed_score)) 19 | 20 | # Save output 21 | standardise_study(dm_data, meta$study, meta$transform) 22 | -------------------------------------------------------------------------------- /data/studies/starita_2015_brca1/standardise_starita_2015_brca1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Starita et al. 
2015 (BRCA1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/starita_2015_brca1/starita_2015_brca1.yaml') 8 | dm_data <- read_xls('data/studies/starita_2015_brca1/raw/genetics.115.175802-6.xls', na = 'NA') %>% 9 | rename_all(tolower) %>% 10 | rename(position = pos) %>% 11 | # Ref seq given by study has a mysterious, undocumented R at pos 175 where normal refs have K 12 | # using K here since the change is not explained in the paper and appears erroneous 13 | mutate(wt = str_split(meta$seq, '')[[1]][position], 14 | class = get_variant_class(wt, mut)) %>% 15 | filter(!variant_id == 'NA-NA') %>% 16 | # Use E3 score - this is more general and function based, and empirically it seems to find most of the same negative effects as the BARD1 binding assay 17 | mutate(raw_score = e3_score, 18 | transformed_score = transform_vamp_seq(raw_score), 19 | score = normalise_score(transformed_score)) %>% 20 | drop_na(score) # Not all measured 21 | 22 | # Save output 23 | standardise_study(dm_data, meta$study, meta$transform) 24 | -------------------------------------------------------------------------------- /data/studies/bolognesi_2019_tdp43/bolognesi_2019_tdp43.yaml: -------------------------------------------------------------------------------- 1 | study: 'bolognesi_2019_tdp43' 2 | gene: 'TDP43' 3 | domain: 'PRD' 4 | uniprot_id: 'Q13148' 5 | gene_type: 'DNA Binding' 6 | species: 'H. 
sapiens' 7 | seq: "MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNPVSQCMRGVRLVEGI\ 8 | LHAPDAGWGNLVYVVNYPKDNKRKMDETDASSAVKVKRAVQKTSDLIVLGLPWKTTEQDL\ 9 | KEYFSTFGEVLMVQVKKDLKTGHSKGFGFVRFTEYETQVKVMSQRHMIDGRWCDCKLPNS\ 10 | KQSQDEPLRSRKVFVGRCTEDMTEDELREFFSQYGDVMDVFIPKPFRAFAFVTFADDQIA\ 11 | QSLCGEDLIIKGISVHISNAEPKHNSNRQLERSGRFGGNPGGFGNQGGFGNSRGGGAGLG\ 12 | NNQGSNMGGGMNFGAFSINPAMMAAAQAALQSSWGMMGMLASQQNQSGPSGNNQNQGNMQ\ 13 | REPNQAFGSGNNSYSGSNSGAAIGWGSASNAGSGSGFNGGFGSSMDSKSSGWGM" 14 | experiment: 'Growth' 15 | transform: '-x/ln(2)' 16 | authour: 'Bolognesi et al.' 17 | year: 2019 18 | title: 'The mutational landscape of a prion-like domain' 19 | lab: ['Lehner'] 20 | doi: '10.1038/s41467-019-12101-z' 21 | pmid: '31519910' 22 | url: 'http://www.nature.com/articles/s41467-019-12101-z' 23 | notes: 'Score was in base e rather than 2 and inverted to make toxic positive' 24 | input_files: 25 | - '41467_2019_12101_MOESM7_ESM.xlsx' 26 | source: 'SI' 27 | qc: 28 | filter: True 29 | notes: 'Measure toxicity of a prion like protein - not really related to normal function' 30 | -------------------------------------------------------------------------------- /data/studies/olson_2014_proteing/standardise_olson_2014_proteing.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Olson et al. 
2014 (Protein G) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/olson_2014_proteing/olson_2014_proteing.yaml') 8 | wt <- read_xlsx('data/studies/olson_2014_proteing/raw/1-s2.0-S0960982214012688-mmc2.xlsx', range = "U3:V4") %>% 9 | rename(input_count = `Input Count`, 10 | selection_count = `Selection Count`) 11 | E_wt <- wt$selection_count/wt$input_count 12 | 13 | dm_data <- read_xlsx('data/studies/olson_2014_proteing/raw/1-s2.0-S0960982214012688-mmc2.xlsx', range = cell_limits(ul = c(3, 14), lr = c(NA, 18))) %>% 14 | rename(wt = `WT amino acid`, 15 | position = `Position`, 16 | mut = `Mutation`, 17 | input_count = `Input Count`, 18 | selection_count = `Selection Count`) %>% 19 | mutate(raw_score = ((selection_count + min(selection_count[selection_count > 0], na.rm = TRUE))/input_count)/E_wt, 20 | transformed_score = log2(raw_score), 21 | score = normalise_score(transformed_score), 22 | class = get_variant_class(wt, mut)) 23 | 24 | # Save output 25 | standardise_study(dm_data, meta$study, meta$transform) 26 | -------------------------------------------------------------------------------- /data/studies/heredia_2018_cxcr4/heredia_2018_cxcr4.yaml: -------------------------------------------------------------------------------- 1 | study: 'heredia_2018_cxcr4' 2 | gene: 'CXCR4' 3 | uniprot_id: 'P61073' 4 | gene_type: 'GPCR' 5 | species: 'H. sapiens' 6 | seq: "MEGISIYTSDNYTEEMGSGDYDSMKEPCFREENANFNKIFLPTIYSIIFLTGIVGNGLVI\ 7 | LVMGYQKKLRSMTDKYRLHLSVADLLFVITLPFWAVDAVANWYFGNFLCKAVHVIYTVNL\ 8 | YSSVLILAFISLDRYLAIVHATNSQRPRKLLAEKVVYVGVWIPALLLTIPDFIFANVSEA\ 9 | DDRYICDRFYPNDLWVVVFQFQHIMVGLILPGIVILSCYCIIISKLSHSKGHQKRKALKT\ 10 | TVILILAFFACWLPYYIGISIDSFILLEIIKQGCEFENTVHKWISITEALAFFHCCLNPI\ 11 | LYAFLGAKFKTSAQHALTSVSRGSSLKILSKGKRGGHSSVSTESESSSFHSS" 12 | experiment: 'Surface expression and HIV-1 blocking antibody 2D7 affinity via FACS' 13 | transform: 'None' 14 | authour: 'Heredia et al.' 
15 | year: 2018 16 | title: 'Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning' 17 | lab: ['Procko'] 18 | doi: '10.4049/jimmunol.1800343' 19 | pmid: '29678950' 20 | url: 'https://www.jimmunol.org/content/200/11/3825' 21 | notes: "Use mean of antibody binding experiment scores as method also incorporated \ 22 | surface expression." 23 | input_files: 24 | - 'GSE100368_enrichment_ratios_CXCR4.xlsx' 25 | source: 'NCBI GEO: GSE100368 - https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100368' 26 | qc: 27 | filter: False 28 | notes: 29 | -------------------------------------------------------------------------------- /data/studies/doud_2015_np/doud_2015_np.yaml: -------------------------------------------------------------------------------- 1 | study: 'doud_2015_np' 2 | gene: 'NP' 3 | uniprot_id: 'I6TAH8' 4 | gene_type: 'viral' 5 | species: 'Influenza' 6 | strain: 'Human adapted strain A/Aichi/2/1968, H3N2' 7 | seq: "MASQGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTELKLSDYEGRLIQNSLTIERMVLS\ 8 | AFDERRNKYLEEHPSAGKDPKKTGGPIYKRVDRKWMRELVLYDKEEIRRIWRQANNGDDATAGLTHMMI\ 9 | WHSNLNDTTYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTMVMELIRMIKRGINDRNFW\ 10 | RGENGRKTRSAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCL\ 11 | PACVYGPAVASGYDFEKEGYSLVGIDPFKLLQNSQVYSLIRPNENPAHKSQLVWMACNSAAFEDLRVLS\ 12 | FIRGTKVSPRGKLSTRGVQIASNENMDAMESSTLELRSRYWAIRTRSGGNTNQQRASAGQISVQPAFSV\ 13 | QRNLPFDKPTIMAAFTGNTEGRTSDMRAEIIRMMEGAKPEEMSFQGRGVFELSDERAANPIVPSFDMSN\ 14 | EGSYFFGDNAEEYDN" 15 | experiment: 'Growth' 16 | transform: 'log2(x_mut_i/x_wt_i)' 17 | authour: 'Doud et al.' 
18 | year: 2015 19 | title: 'Site-Specific Amino Acid Preferences Are Mostly Conserved in Two Closely Related Protein Homologs' 20 | lab: ['Bloom'] 21 | doi: '10.1093/molbev/msv167' 22 | pmid: '26226986' 23 | url: 'https://academic.oup.com/mbe/article/32/11/2944/982113' 24 | input_files: 25 | - "Supp_file_2_mean_aichi1968_prefs.txt" 26 | source: 'SI - Supplementary data ZIP file' 27 | qc: 28 | filter: False 29 | notes: 30 | -------------------------------------------------------------------------------- /data/studies/findlay_2014_dbr1/findlay_2014_dbr1.yaml: -------------------------------------------------------------------------------- 1 | study: 'findlay_2014_dbr1' 2 | gene: 'DBR1' 3 | domain: 'Exon 2' 4 | uniprot_id: 'Q9UK59' 5 | gene_type: 'Lauriat Debranching' 6 | species: 'H. sapiens' 7 | seq: "MRVAVAGCCHGELDKIYETLALAERRGPGPVDLLLCCGDFQAVRNEADLRCMAVPPKYRH\ 8 | MQTFYRYYSGEKKAPVLTLFIGGNHEASNHLQELPYGGWVAPNIYYLGLAGVVKYRGVRI\ 9 | GGISGIFKSHDYRKGHFECPPYNSSTIRSIYHVRNIEVYKLKQLKQPIDIFLSHDWPRSI\ 10 | YHYGNKKQLLKTKSFFRQEVENNTLGSPAASELLEHLKPTYWFSAHLHVKFAALMQHQAK\ 11 | DKGQTARATKFLALDKCLPHRDFLQILEIEHDPSAPDYLEYDIEWLTILRATDDLINVTG\ 12 | RLWNMPENNGLHARWDYSATEEGMKEVLEKLNHDLKVPCNFSVTAACYDPSKPQTQMQLI\ 13 | HRINPQTTEFCAQLGIIDINVRLQKSKEEHHVCGEYEEQDDVESNDSGEDQSEYNTDTSA\ 14 | LSSINPDEIMLDEEEDEDSIVSAHSGMNTPSVEPSDQASEFSASFSDVRILPGSMIVSSD\ 15 | DTVDSTIDREGKPGGTVESGNGEDLTKVPLKRLSDEHEPEQRKKIKRRNQAIYAAVDDDD\ 16 | DDAA" 17 | experiment: 'Growth' 18 | transform: 'None' 19 | authour: 'Findlay et al.' 
20 | year: 2014 21 | title: 'Saturation editing of genomic regions by multiplex homology-directed repair' 22 | lab: ['Shendure'] 23 | doi: 'doi:10.1038/nature13695' 24 | pmid: '25141179' 25 | url: 'https://www.nature.com/articles/nature13695' 26 | input_files: 27 | - '41586_2014_BFnature13695_MOESM383_ESM.xlsx' 28 | source: 'SI - Table S4' 29 | qc: 30 | filter: False 31 | notes: 32 | -------------------------------------------------------------------------------- /data/studies/doud_2015_np/standardise_doud_2015_np.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Doud et al. 2015 (NP) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/doud_2015_np/doud_2015_np.yaml') 8 | dm_data <- read_table2('data/studies/doud_2015_np/raw/Supp_file_2_mean_aichi1968_prefs.txt', skip = 1, 9 | col_names = c('position', 'wt', 'entropy', 'A', 'C', 'D', 'E', 10 | 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 11 | 'R', 'S', 'T', 'V', 'W', 'Y'), 12 | col_types = cols(position=col_integer(), wt=col_character(), .default = col_double())) %>% 13 | pivot_longer(A:Y, names_to = 'mut', values_to = 'raw_score') %>% 14 | mutate(wt = str_split(meta$seq, '')[[1]][position]) %>% # is the same in all but 334, where the data has '?', so replace with known WTs 15 | group_by(position) %>% 16 | mutate(transformed_score = log2(raw_score / raw_score[which(mut == first(wt))])) %>% # Normalise by the WT at that position 17 | ungroup() %>% 18 | mutate(score = normalise_score(transformed_score), 19 | class = get_variant_class(wt, mut)) 20 | 21 | # Save output 22 | standardise_study(dm_data, meta$study, meta$transform) 23 | -------------------------------------------------------------------------------- /data/studies/melamed_2013_pab1/melamed_2013_pab1.yaml: 
-------------------------------------------------------------------------------- 1 | study: 'melamed_2013_pab1' 2 | gene: 'PAB1' 3 | domain: 'RRM' 4 | uniprot_id: 'P04147' 5 | gene_type: 'RNA Binding' 6 | species: 'S. cerevisiae' 7 | seq: "MADITDKTAEQLENLNIQDDQKQAATGSESQSVENSSASLYVGDLEPSVSEAHL\ 8 | YDIFSPIGSVSSIRVCRDAITKTSLGYAYVNFNDHEAGRKAIEQLNYTPIKGRL\ 9 | CRIMWSQRDPSLRKKGSGNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDEN\ 10 | GKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAPHLSRKERDSQLEETKA\ 11 | HYTNLYVKNINSETTDEQFQELFAKFGPIVSASLEKDADGKLKGFGFVNYEKHE\ 12 | DAVKAVEALNDSELNGEKLYVGRAQKKNERMHVLKKQYEAYRLEKMAKYQGVNL\ 13 | FVKNLDDSVDDEKLEEEFAPYGTITSAKVMRTENGKSKGFGFVCFSTPEEATKA\ 14 | ITEKNQQIVAGKPLYVAIAQRKDVRRSQLAQQIQARNQMRYQQATAAAAAAAAG\ 15 | MPGQFMPPMFYGVMPPRGVPFNGPNPQQMNPMGGMPKNGMPPQFRNGPVYGVPP\ 16 | QGGFPRNANDNNQFYQQKQRQALGEQLYKKVSAKTSNEEAAGKITGMILDLPPQ\ 17 | EVFPLLESDELFEQHYKEASAAYESFKKEQEQQTEQA" 18 | experiment: 'Growth' 19 | transform: 'None' 20 | authour: 'Melamed et al.' 21 | year: 2013 22 | title: 'Deep mutational scanning of an RRM domain of the Saccharomyces cerevisiae poly(A)-binding protein' 23 | lab: ['Fields'] 24 | doi: '10.1261/rna.040709.113' 25 | pmid: '24064791' 26 | url: 'https://rnajournal.cshlp.org/content/19/11/1537' 27 | input_files: 28 | - 'Supplementary_Table_2.xlsx' 29 | source: 'SI' 30 | qc: 31 | filter: False 32 | notes: 33 | -------------------------------------------------------------------------------- /data/studies/brenan_2016_mapk1/brenan_2016_mapk1.yaml: -------------------------------------------------------------------------------- 1 | study: 'brenan_2016_mapk1' 2 | gene: 'MAPK1' 3 | uniprot_id: 'P28482' 4 | gene_type: 'Kinase' 5 | species: 'H. 
sapiens' 6 | seq: "MAAAAAAGAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNVNKVRVAIKKISPFE\ 7 | HQTYCQRTLREIKILLRFRHENIIGINDIIRAPTIEQMKDVYIVQDLMETDLYKLLKTQH\ 8 | LSNDHICYFLYQILRGLKYIHSANVLHRDLKPSNLLLNTTCDLKICDFGLARVADPDHDH\ 9 | TGFLTEYVATRWYRAPEIMLNSKGYTKSIDIWSVGCILAEMLSNRPIFPGKHYLDQLNHI\ 10 | LGILGSPSQEDLNCIINLKARNYLLSLPHKNKVPWNRLFPNADSKALDLLDKMLTFNPHK\ 11 | RIEVEQALAHPYLEQYYDPSDEPIAEAPFKFDMELDDLPKEKLKELIFEETARFQPGYRS" 12 | experiment: 'Growth' 13 | transform: '-x' 14 | authour: 'Brenan et al.' 15 | year: 2016 16 | title: 'Phenotypic Characterization of a Comprehensive Set of MAPK1/ERK2 Missense Mutants' 17 | lab: ['Johannessen'] 18 | doi: '10.1016/j.celrep.2016.09.061' 19 | pmid: '27760319' 20 | url: 'https://www.sciencedirect.com/science/article/pii/S2211124716313171' 21 | notes: 'Transform by -1 because the selection scheme they used favoured lof > wt > gof.' 22 | input_files: 23 | - '1-s2.0-S2211124716313171-mmc2.xlsx' 24 | source: 'SI - Table S1' 25 | qc: 26 | filter: True 27 | notes: "Experimental setup leads to many substitutions with positive ER values, which \ 28 | are thought to be GOF variants. However, in real organisms many GOF changes \ 29 | are still deleterious" 30 | -------------------------------------------------------------------------------- /data/studies/lee_2018_ha/standardise_lee_2018_ha.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Lee et al. 
2018 (HA) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/lee_2018_ha/lee_2018_ha.yaml') 8 | 9 | # Extract overall protein position - is split into signal peptide and HA1/HA2 chains in original file 10 | get_position <- function(x){ 11 | if (grepl('\\-', x)){ 12 | return(as.numeric(x) + 17) 13 | } else if (grepl('HA2', x)) { 14 | x <- gsub('\\(HA2\\)', '', x) 15 | return(as.numeric(x) + 345) 16 | } else { 17 | return(as.numeric(x) + 16) 18 | } 19 | } 20 | 21 | dm_data <- read_xlsx('data/studies/lee_2018_ha/raw/pnas.1806133115.sd03.xlsx', sheet = 'avg_prefs') %>% 22 | rename(position = site) %>% 23 | gather(key = 'mut', value = 'raw_score', -position, -entropy, -neffective) %>% 24 | mutate(position = sapply(position, get_position), 25 | wt = str_split(meta$seq, '')[[1]][position], 26 | class = get_variant_class(wt, mut)) %>% 27 | arrange(position, mut) %>% 28 | group_by(position) %>% 29 | mutate(transformed_score = log2(raw_score / raw_score[which(mut == first(wt))])) %>% 30 | ungroup() %>% 31 | mutate(score = normalise_score(transformed_score)) 32 | 33 | # Save output 34 | standardise_study(dm_data, meta$study, meta$transform) 35 | -------------------------------------------------------------------------------- /data/studies/sun_2018_cbs/sun_2018_cbs.yaml: -------------------------------------------------------------------------------- 1 | study: 'sun_2018_cbs' 2 | gene: 'CBS' 3 | uniprot_id: 'P35520' 4 | gene_type: 'Metabolic' 5 | species: 'H. 
sapiens' 6 | seq: "MPSETPQAEVGPTGCPHRSGPHSAKGSLEKGSPEDKEAKEPLWIRPDAPSRCTWQLGRPA\ 7 | SESPHHHTAPAKSPKILPDILKKIGDTPMVRINKIGKKFGLKCELLAKCEFFNAGGSVKD\ 8 | RISLRMIEDAERDGTLKPGDTIIEPTSGNTGIGLALAAAVRGYRCIIVMPEKMSSEKVDV\ 9 | LRALGAEIVRTPTNARFDSPESHVGVAWRLKNEIPNSHILDQYRNASNPLAHYDTTADEI\ 10 | LQQCDGKLDMLVASVGTGGTITGIARKLKEKCPGCRIIGVDPEGSILAEPEELNQTEQTT\ 11 | YEVEGIGYDFIPTVLDRTVVDKWFKSNDEEAFTFARMLIAQEGLLCGGSAGSTVAVAVKA\ 12 | AQELQEGQRCVVILPDSVRNYMTKFLSDRWMLQKGFLKEEDLTEKKPWWWHLRVQELGLS\ 13 | APLTVLPTITCGHTIEILREKGFDQAPVVDEAGVILGMVTLGNMLSSLLAGKVQPSDQVG\ 14 | KVIYKQFKQIRLTDTLGRLSHILEMDHFALVVHEQIQYHSTGKSSQRQMVFGVVTAIDLL\ 15 | NFVAAQERDQK" 16 | experiment: 'Coupled growth' 17 | transform: 'VAMP-seq' 18 | authour: 'Sun et al.' 19 | year: 2018 20 | title: 'A proactive genotype-to-patient-phenotype map for cystathionine beta-synthase' 21 | lab: ['Roth'] 22 | doi: '10.1101/473983' 23 | pmid: '' 24 | url: 'https://www.biorxiv.org/content/10.1101/473983v3' 25 | mavedb_urn: 'urn:mavedb:00000005-a' 26 | notes: 'Used low B6 dataset as they were similar but it produced stronger effects' 27 | input_files: 28 | - 'urn_mavedb_00000005-a-4_scores.csv' 29 | source: 'MaveDB' 30 | qc: 31 | filter: False 32 | notes: 33 | -------------------------------------------------------------------------------- /meta/fasta/m_musculus_ube4b.fa: -------------------------------------------------------------------------------- 1 | >sp|Q9ES00|UBE4B_MOUSE Ubiquitin conjugation factor E4 B OS=Mus musculus OX=10090 GN=Ube4b PE=1 SV=3 2 | MEELSADEIRRRRLARLAGGQTSQPTTPLTSPQRENPPGPPIAASAPGPSQSLGLNVHNM 3 | TPATSPIGAAGVAHRSQSSEGVSSLSSSPSNSLETQSQSLSRSQSMDIDGVSCEKSMSQV 4 | DVDSGIENMEVDENDRREKRSLSDKEPSSGPEVSEEQALQLVCKIFRVSWKDRDRDVIFL 5 | SSLSAQFKQNPKEVFSDFKDLIGQILMEVLMMSTQTRDENPFASLTATSQPIATAARSPD 6 | RNLMLNTGSSSGTSPMFCNMGSFSTSSLSSLGASGGASNWDSYSDHFTIETCKETDMLNY 7 | LIECFDRVGIEEKKAPKMCSQPAVSQLLSNIRSQCISHTALVLQGSLTQPRSLQQPSFLV 8 | PYMLCRNLPYGFIQELVRTTHQDEEVFKQIFIPILQGLALAAKECSLESDYFKYPLMALG 9 | 
ELCETKFGKTHPMCNLVASLPLWLPKSLSPGSGRELQRLSYLGAFFSFSVFAEDDAKVVE 10 | KYFSGPAITLENTRVVSQSLQHYLELGRQELFKILHSILLNGETREAALSYMAALVNANM 11 | KKAQMQADDRLVSTDGFMLNLLWVLQQLSTKIKLETVDPTYIFHPRCRITLPNDETRINA 12 | TMEDVNERLTELYGDQPPFSEPKFPTECFFLTLHAHHLSILPSCRRYIRRLRAIRELNRT 13 | VEDLKNNESQWKDSPLATRHREMLKRCKTQLKKLVRCKACADAGLLDESFLRRCLNFYGL 14 | LIQLMLRILDPAYPDVTLPLNSEVPKVFAALPEFYVEDVAEFLFFIVQYSPQVLYEPCTQ 15 | DIVMFLVVMLCNQNYIRNPYLVAKLVEVMFMTNPSVQPRTQKFFEMIENHPLSTKLLVPS 16 | LMKFYTDVEHTGATSEFYDKFTIRYHISTIFKSLWQNIAHHGTFMEEFNSGKQFVRYINM 17 | LINDTTFLLDESLESLKRIHEVQEEMKNKEQWDQLPRDQQQARQSQLAQDERVSRSYLAL 18 | ATETVDMFHLLTKQVQKPFLRPELGPRLAAMLNFNLQQLCGPKCRDLKVENPEKYGFEPK 19 | KLLDQLTDIYLQLDCARFAKAIADDQRSYSKELFEEVISKMRKAGIKSTIAIEKFKLLAE 20 | KVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDP 21 | FNRQMLTESMLEPVPELKEQIQAWMREKQSSDH 22 | 23 | -------------------------------------------------------------------------------- /data/studies/findlay_2014_dbr1/standardise_findlay_2014_dbr1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Findlay et al. 
2014 (DBR1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/findlay_2014_dbr1/findlay_2014_dbr1.yaml') 8 | dm_data <- read_xlsx('data/studies/findlay_2014_dbr1/raw/41586_2014_BFnature13695_MOESM383_ESM.xlsx', skip = 5, na = 'NA', 9 | col_names = c('seq', 'log2_enrichment_score_day11_rep1', 'log2_enrichment_score_day11_rep2', 10 | 'position', 'wt', 'mut', 'mut_type')) %>% 11 | mutate(position = as.integer(na_if(position, "3'SS")), 12 | mut = ifelse(mut == 'WT', wt, mut), 13 | raw_score = rowMeans(select(., log2_enrichment_score_day11_rep1, log2_enrichment_score_day11_rep2), na.rm = TRUE) %>% replace_na(NA)) %>% 14 | group_by(position, wt, mut) %>% 15 | summarise(raw_score = mean(raw_score, na.rm=TRUE)) %>% 16 | ungroup() %>% 17 | mutate(transformed_score = raw_score, 18 | score = normalise_score(transformed_score), 19 | class = get_variant_class(wt, mut)) %>% 20 | drop_na(position, raw_score) %>% 21 | filter(!mut == 'DEL') %>% 22 | select(position, wt, mut, score, transformed_score, raw_score, class) 23 | 24 | # Save output 25 | standardise_study(dm_data, meta$study, meta$transform) 26 | -------------------------------------------------------------------------------- /data/studies/araya_2012_yap1/araya_2012_yap1.yaml: -------------------------------------------------------------------------------- 1 | study: 'araya_2012_yap1' 2 | gene: 'YAP1' 3 | domain: 'WW' 4 | uniprot_id: 'P46937' 5 | gene_type: 'TF' 6 | species: 'H. 
sapiens' 7 | seq: "MDPGQQPPPQPAPQGQGQPPSQPPQGQGPPSGPGQPAPAATQAAPQAPPAGHQI\ 8 | VHVRGDSETDLEALFNAVMNPKTANVPQTVPMRLRKLPDSFFKPPEPKSHSRQA\ 9 | STDAGTAGALTPQHVRAHSSPASLQLGAVSPGTLTPTGVVSGPAATPTAQHLRQ\ 10 | SSFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLSQMNVTAPT\ 11 | SPPVQQNMMNSASGPLPDGWEQAMTQDGEIYYINHKNKTTSWLDPRLDPRFAMN\ 12 | QRISQSAPVKQPPPLAPQSPQGGVMGGSNSNQQQQMRLQQLQMEKERLRLKQQE\ 13 | LLRQAMRNINPSTANSPKCQELALRSQLPTLEQDGGTQNPVSSPGMSQELRTMT\ 14 | TNSSDPFLNSGTYHSRDESTDSGLSMSSYSVPRTPDDFLNSVDEMDTGDTINQS\ 15 | TLPSQQNRFPDYLEAIPGTNVDLGTLEGDGMNIEGEELMPSLQEALSSDILNDM\ 16 | ESVLAATKLDKESFLTWL" 17 | experiment: 'ligand binding' 18 | transform: 'None' 19 | authour: 'Araya et al.' 20 | year: 2012 21 | title: 'A fundamental protein property, thermodynamic stability, revealed solely from large-scale measurements of protein function' 22 | lab: ['Fields', 'Fowler'] 23 | doi: '10.1073/pnas.1209751109' 24 | pmid: '23035249' 25 | url: 'http://www.pnas.org/content/109/42/16858' 26 | notes: 'Positions from paper are offset by 9, so add 160 to reach same start of WW domain in sequence' 27 | input_files: 28 | - 'urn_mavedb_00000002-a-2_scores.csv' 29 | source: 'MaveDB' 30 | mavedb_urn: 'urn:mavedb:00000002-a-2' 31 | qc: 32 | filter: False 33 | notes: 34 | -------------------------------------------------------------------------------- /data/studies/hietpas_2011_hsp90/hietpas_2011_hsp90.yaml: -------------------------------------------------------------------------------- 1 | study: 'hietpas_2011_hsp90' 2 | gene: 'HSP90' 3 | uniprot_id: 'P02829' 4 | gene_type: 'Chaperone' 5 | species: 'S. 
cerevisiae' 6 | seq: "MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPK\ 7 | QLETEPDLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEAL\ 8 | SAGADVSMIGQFGVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDE\ 9 | VNERIGRGTILRLFLKDDQLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKE\ 10 | VPIPEEEKKDEEKKDEEKKDEDDKKPKLEEVDEEEEKKPKTKKVKEEVQEIEEL\ 11 | NKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPLYVKHFSVEGQLEFRAILFIP\ 12 | KRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSFVKGVVDSEDLPLNL\ 13 | SREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSAFSKNIKLGVHED\ 14 | TQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLKAVEKSP\ 15 | FLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER\ 16 | EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMK\ 17 | AQALRDSSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYE\ 18 | TALLTSGFSLDEPTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPAD\ 19 | TEMEEVD" 20 | experiment: 'Growth' 21 | transform: 'None' 22 | authour: 'Hietpas et al.' 23 | year: 2011 24 | title: 'Experimental illumination of a fitness landscape' 25 | lab: ['Bolon'] 26 | doi: '10.1073/pnas.1016024108' 27 | pmid: '21464309' 28 | url: 'http://www.pnas.org/content/108/19/7896' 29 | input_files: 30 | - 'sd02.csv' 31 | source: 'SI - Dataset S2' 32 | qc: 33 | filter: False 34 | notes: 35 | -------------------------------------------------------------------------------- /data/studies/wrenbeck_2017_amie/standardise_wrenbeck_2017_amie.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Wrenbeck et al. 
2017 (amiE) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/wrenbeck_2017_amie/wrenbeck_2017_amie.yaml') 8 | dm_data <- bind_rows(acetamide = read_tsv('data/studies/wrenbeck_2017_amie/raw/amiESelectionFitnessData_Acetamide.txt', na = c('NS', 'None')), 9 | isobutyramide = read_tsv('data/studies/wrenbeck_2017_amie/raw/amiESelectionFitnessData_Isobutyramide.txt', na = c('NS', 'None')), 10 | propionamide = read_tsv('data/studies/wrenbeck_2017_amie/raw/amiESelectionFitnessData_Propionamide.txt', na = c('NS', 'None')), 11 | .id = 'condition') %>% 12 | select(position = location, mut = mutation, raw_score = normalized_fitness, condition) %>% 13 | pivot_wider(names_from = condition, values_from = raw_score) %>% 14 | mutate(raw_score = rowMeans(select(., acetamide, isobutyramide, propionamide), na.rm = TRUE) %>% replace_na(NA), 15 | transformed_score = raw_score, 16 | score = normalise_score(transformed_score), 17 | wt = str_split(meta$seq, '')[[1]][position], 18 | class = get_variant_class(wt, mut)) %>% 19 | drop_na(score) # Some variants not measured in any condition 20 | 21 | # Save output 22 | standardise_study(dm_data, meta$study, meta$transform) 23 | -------------------------------------------------------------------------------- /data/studies/ahler_2019_src/ahler_2019_src.yaml: -------------------------------------------------------------------------------- 1 | study: 'ahler_2019_src' 2 | gene: 'Src' 3 | domain: 'Catalytic and SH4' 4 | uniprot_id: 'P12931' 5 | gene_type: 'Kinase' 6 | species: 'H. 
sapiens' 7 | seq: "MGSNKSKPKDASQRRRSLEPAENVHGAGGGAFPASQTPSKPASADGHRGPSAAFAPAAAE\ 8 | PKLFGGFNSSDTVTSPQRAGPLAGGVTTFVALYDYESRTETDLSFKKGERLQIVNNTEGD\ 9 | WWLAHSLSTGQTGYIPSNYVAPSDSIQAEEWYFGKITRRESERLLLNAENPRGTFLVRES\ 10 | ETTKGAYCLSVSDFDNAKGLNVKHYKIRKLDSGGFYITSRTQFNSLQQLVAYYSKHADGL\ 11 | CHRLTTVCPTSKPQTQGLAKDAWEIPRESLRLEVKLGQGCFGEVWMGTWNGTTRVAIKTL\ 12 | KPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVVSEEPIYIVTEYMSKGSLLDFLKGETGKY\ 13 | LRLPQLVDMAAQIASGMAYVERMNYVHRDLRAANILVGENLVCKVADFGLARLIEDNEYT\ 14 | ARQGAKFPIKWTAPEAALYGRFTIKSDVWSFGILLTELTTKGRVPYPGMVNREVLDQVER\ 15 | GYRMPCPPECPESLHDLMCQCWRKEPEERPTFEYLQAFLEDYFTSTEPQYQPGENL" 16 | experiment: 'Coupled yeast growth' 17 | transform: '-x' 18 | authour: 'Ahler et al.' 19 | year: 2019 20 | title: 'A Combined Approach Reveals a Regulatory Mechanism Coupling Src’s Kinase Activity, Localization, and Phosphotransferase-Independent Functions' 21 | lab: ['Fowler', 'Maly'] 22 | doi: '10.1016/j.molcel.2019.02.003' 23 | pmid: '30956043' 24 | url: 'https://www.sciencedirect.com/science/article/pii/S1097276519300930' 25 | input_files: 26 | - 'urn_mavedb_00000041-b-1_scores.csv' 27 | - 'urn_mavedb_00000041-a-1_scores.csv' 28 | source: 'MaveDB' 29 | mavedb_urn: 'urn_mavedb_00000041' 30 | qc: 31 | filter: False 32 | notes: 33 | -------------------------------------------------------------------------------- /bin/data_processing/make_gene_fasta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Generate a fasta file for a gene, given a set of study yamls 4 | """ 5 | import argparse 6 | from ruamel.yaml import YAML 7 | 8 | from subtypes_utils import gene_to_filename 9 | 10 | FASTA_LINE_LENGTH = 80 11 | 12 | def main(args): 13 | """Main script""" 14 | yaml = YAML(typ='safe') 15 | seq = None 16 | gene = None 17 | for study_yaml in args.yaml: 18 | with open(study_yaml, 'r') as yaml_file: 19 | conf = yaml.load(yaml_file) 20 | 21 | if seq is None: 22 | seq = conf['seq'] 23 | gene = 
conf['gene'] 24 | elif not gene == conf['gene']: 25 | raise ValueError(f"Studies are for different genes") 26 | elif not seq == conf['seq']: 27 | raise ValueError(f"Studies have different sequences for {gene}") 28 | 29 | print(f">{gene_to_filename(gene)}") 30 | for i in range(0, len(seq), FASTA_LINE_LENGTH): 31 | print(seq[i:(i + FASTA_LINE_LENGTH)]) 32 | 33 | def parse_args(): 34 | """Process input arguments""" 35 | parser = argparse.ArgumentParser(description=__doc__, 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 37 | 38 | parser.add_argument('yaml', metavar='Y', nargs='+', help="Input study config YAML file(s)") 39 | 40 | return parser.parse_args() 41 | 42 | if __name__ == "__main__": 43 | ARGS = parse_args() 44 | main(ARGS) 45 | -------------------------------------------------------------------------------- /bin/data_processing/filter_pdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Filter PDB file based on section list 4 | """ 5 | import sys 6 | import argparse 7 | from pathlib import Path 8 | 9 | from Bio.PDB import PDBParser 10 | from Bio.PDB.PDBIO import PDBIO 11 | 12 | from subtypes_utils import SectionSelecter, import_sections 13 | 14 | def main(args): 15 | """Main script""" 16 | pdb_name = Path(args.pdb).stem 17 | # deal with FoldX repaired PDBs 18 | if pdb_name.endswith('_Repair'): 19 | pdb_name = pdb_name.replace('_Repair', '') 20 | 21 | pdb_parser = PDBParser() 22 | structure = pdb_parser.get_structure(pdb_name, args.pdb) 23 | 24 | sections = import_sections(args.yaml, pdb_name) 25 | 26 | pdbio = PDBIO() 27 | pdbio.set_structure(structure) 28 | pdbio.save(sys.stdout, select=SectionSelecter(sections)) 29 | 30 | def parse_args(): 31 | """Process input arguments""" 32 | parser = argparse.ArgumentParser(description=__doc__, 33 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 34 | 35 | parser.add_argument('pdb', metavar='P', help="Input PDB file") 36 | 
37 | parser.add_argument('--yaml', '-y', 38 | help=("YAML file detailing regions to process for a set of genes or " 39 | "these sections in raw YAML strings")) 40 | 41 | return parser.parse_args() 42 | 43 | if __name__ == "__main__": 44 | ARGS = parse_args() 45 | main(ARGS) 46 | -------------------------------------------------------------------------------- /data/studies/heredia_2018_ccr5/heredia_2018_ccr5.yaml: -------------------------------------------------------------------------------- 1 | study: 'heredia_2018_ccr5' 2 | gene: 'CCR5' 3 | uniprot_id: 'P51681' 4 | gene_type: 'GPCR' 5 | species: 'H. sapiens' 6 | seq: "MDYQVSSPIYDINYYTSEPCQKINVKQIAARLLPPLYSLVFIFGFVGNMLVILILINCKR\ 7 | LKSMTDIYLLNLAISDLFFLLTVPFWAHYAAAQWDFGNTMCQLLTGLYFIGFFSGIFFII\ 8 | LLTIDRYLAVVHAVFALKARTVTFGVVTSVITWVVAVFASLPGIIFTRSQKEGLHYTCSS\ 9 | HFPYSQYQFWKNFQTLKIVILGLVLPLLVMVICYSGILKTLLRCRNEKKRHRAVRLIFTI\ 10 | MIVYFLFWAPYNIVLLLNTFQEFFGLNNCSSSNRLDQAMQVTETLGMTHCCINPIIYAFV\ 11 | GEKFRNYLLVFFQKHIAKRFCKCCSIFQQEAPERASSVYTRSTGEQEISVGL" 12 | experiment: 'Surface expression and HIV-1 blocking antibody 2D7 affinity via FACS' 13 | transform: 'None' 14 | authour: 'Heredia et al.' 15 | year: 2018 16 | title: 'Mapping Interaction Sites on Human Chemokine Receptors by Deep Mutational Scanning' 17 | lab: ['Procko'] 18 | doi: '10.4049/jimmunol.1800343' 19 | pmid: '29678950' 20 | url: 'https://www.jimmunol.org/content/200/11/3825' 21 | notes: "Manually edited the raw XLSX file, moving Y14 in column 1 down one row to its \ 22 | proper place as it had been erroneously offset to be in the last row of the 13th \ 23 | AA data. Use mean of antibody binding experiment scores as method also incorporated \ 24 | surface expression." 
25 | input_files: 26 | - 'GSE100368_enrichment_ratios_CCR5.xlsx' 27 | source: 'NCBI GEO: GSE100368 - https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100368' 28 | qc: 29 | filter: False 30 | notes: 31 | -------------------------------------------------------------------------------- /data/studies/lee_2018_ha/lee_2018_ha.yaml: -------------------------------------------------------------------------------- 1 | study: 'lee_2018_ha' 2 | gene: 'HA' 3 | uniprot_id: 'P03437' 4 | gene_type: 'Viral' 5 | species: 'Influenza' 6 | strain: 'Human adapted A/Perth/16/2009, H3N2' 7 | seq: "MKTIIALSYILCLVFAQKLPGNDNSTATLCLGHHAVPNGTIVKTITNDQIEVTNATELVQSSSTGEICDS\ 8 | PHQILDGKNCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNES\ 9 | FNWTGVTQNGTSSACIRRSKNSFFSRLNWLTHLNFKYPALNVTMPNNEQFDKLYIWGVLHPGTDKDQIFL\ 10 | YAQASGRITVSTKRSQQTVSPNIGSRPRVRNIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGK\ 11 | SSIMRSDAPIGKCNSECITPNGSIPNDKPFQNVNRITYGACPRYVKQNTLKLATGMRNVPEKQTRGIFGA\ 12 | IAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEG\ 13 | RIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCD\ 14 | NACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNI\ 15 | RCNICI" 16 | experiment: 'Growth' 17 | transform: 'log2(x_mut_i / x_wt_i)' 18 | authour: 'Lee et al.' 19 | year: 2018 20 | title: 'Deep mutational scanning of hemagglutinin helps predict evolutionary fates of human H3N2 influenza variants' 21 | lab: ['Bloom'] 22 | doi: '10.1073/pnas.1806133115' 23 | pmid: '30104379' 24 | url: 'www.pnas.org/cgi/doi/10.1073/pnas.1806133115' 25 | notes: "Only gives per position frequencies, so cannot easily get comparable \ 26 | scores across the protein without re-running analysis." 
27 | input_files: 28 | - 'pnas.1806133115.sd03.xlsx' 29 | source: 'SI - Table S3' 30 | qc: 31 | filter: False 32 | notes: 33 | -------------------------------------------------------------------------------- /data/studies/mishra_2016_hsp90/mishra_2016_hsp90.yaml: -------------------------------------------------------------------------------- 1 | study: 'mishra_2016_hsp90' 2 | gene: 'HSP90' 3 | uniprot_id: 'P02829' 4 | gene_type: 'Chaperone' 5 | species: 'S. cerevisiae' 6 | seq: "MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPKQLETEP\ 7 | DLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEALSAGADVSMIGQF\ 8 | GVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDEVNERIGRGTILRLFLKDD\ 9 | QLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKEVPIPEEEKKDEEKKDEEKKDEDDK\ 10 | KPKLEEVDEEEEKKPKTKKVKEEVQEIEELNKTKPLWTRNPSDITQEEYNAFYKSISNDW\ 11 | EDPLYVKHFSVEGQLEFRAILFIPKRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPE\ 12 | WLSFVKGVVDSEDLPLNLSREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSA\ 13 | FSKNIKLGVHEDTQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLK\ 14 | AVEKSPFLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER\ 15 | EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMKAQALRD\ 16 | SSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYETALLTSGFSLDE\ 17 | PTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPADTEMEEVD" 18 | experiment: 'Growth' 19 | transform: 'None' 20 | authour: 'Mishra et al.' 
21 | year: 2016 22 | title: 'Systematic Mutant Analyses Elucidate General and Client-Specific Aspects of Hsp90 Function' 23 | lab: ['Bolon'] 24 | doi: '10.1016/j.celrep.2016.03.046' 25 | pmid: '27068472' 26 | url: 'https://www.sciencedirect.com/science/article/pii/S2211124716303175' 27 | input_files: 28 | - '1-s2.0-S2211124716303175-mmc2.xlsx' 29 | source: 'SI - Table S1' 30 | qc: 31 | filter: False 32 | notes: 33 | -------------------------------------------------------------------------------- /data/studies/jones_2019_adrb2/standardise_jones_2019_adrb2.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Jones et al. 2019 (ADRB2) 3 | 4 | source('src/config.R') 5 | source('src/study_standardising.R') 6 | 7 | # Determine WT like 8 | data("BLOSUM62") 9 | similar_aas <- as_tibble(BLOSUM62, rownames = 'aa1') %>% 10 | pivot_longer(-aa1, names_to = 'aa2', values_to = 'blosum') %>% 11 | filter(blosum > 2, !aa1 == aa2, !aa1 %in% c('B', 'J', 'Z', 'X', '*'), !aa2 %in% c('B', 'J', 'Z', 'X', '*')) 12 | similar_aas <- str_c(similar_aas$aa1, similar_aas$aa2) 13 | 14 | # Import and process data 15 | meta <- read_yaml('data/studies/jones_2019_adrb2/jones_2019_adrb2.yaml') 16 | dm_data <- read_csv('data/studies/jones_2019_adrb2/raw/lib-med.csv') %>% 17 | filter(Condition == 0.150) %>% # Select only EC50 measure (generally correlate between these) 18 | select(position = Pos, mut = AA, raw_score = Median, Repeat) %>% 19 | group_by(position, mut) %>% 20 | summarise(raw_score = median(raw_score)) %>% # Average biological repeats 21 | ungroup() %>% 22 | mutate(wt = str_split(meta$seq, '')[[1]][position], 23 | pair = str_c(wt, mut), 24 | # Divide by average score of v. 
similar AAs (blosum62 > 2) as substitute for wt 25 | transformed_score = log2(raw_score / mean(raw_score[pair %in% similar_aas], na.rm=TRUE)), 26 | score = normalise_score(transformed_score), 27 | class = get_variant_class(wt, mut)) 28 | 29 | # Save output 30 | standardise_study(dm_data, meta$study, meta$transform) 31 | 32 | -------------------------------------------------------------------------------- /data/studies/firnberg_2014_tem1/standardise_firnberg_2014_tem1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Firnberg et al. 2014 (TEM1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/firnberg_2014_tem1/firnberg_2014_tem1.yaml') 8 | dm_data <- read_xlsx('data/studies/firnberg_2014_tem1/raw/firnberg_2014_tem1.xlsx', skip = 1, 9 | col_names = c('position', 'ref_codon', 'alt_codon', 'wt', 'mut', 'base_changes', 'seq_counts_0.25', 10 | 'seq_counts_0.5', 'seq_counts_1', 'seq_counts_2', 'seq_counts_4', 'seq_counts_8', 11 | 'seq_counts_16', 'seq_counts_32', 'seq_counts_64', 'seq_counts_128', 'seq_counts_256', 12 | 'seq_counts_512', 'seq_counts_1024', 'total_seq_count', 'raw_score', 'fitness_err')) %>% 13 | drop_na(position) %>% 14 | filter(!wt == '*') %>% 15 | mutate(position = rep(1:nchar(meta$seq), each=64)) %>% # Numbering seems broken - starts at 3 and then misses 237 & 251 16 | group_by(position, wt, mut) %>% 17 | summarise(raw_score = mean(raw_score, na.rm = TRUE), 18 | transformed_score = mean(log2(raw_score), na.rm = TRUE)) %>% # Average over codons. NOTE(review): raw_score was just overwritten by its per-group mean, so this is presumably log2(mean) rather than the mean of per-codon log2 values - confirm intended 19 | ungroup() %>% 20 | mutate(score = normalise_score(transformed_score), 21 | class = get_variant_class(wt, mut)) %>% 22 | drop_na(raw_score) # Some variants not measured in any codon 23 | 24 | # Save output 25 | standardise_study(dm_data, meta$study, meta$transform) 26 | 
-------------------------------------------------------------------------------- /data/studies/jiang_2013_hsp90/jiang_2013_hsp90.yaml: -------------------------------------------------------------------------------- 1 | study: 'jiang_2013_hsp90' 2 | gene: 'HSP90' 3 | domain: 'Putative substrate binding loop' 4 | uniprot_id: 'P02829' 5 | gene_type: 'Chaperone' 6 | species: 'S. cerevisiae' 7 | seq: "MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIRYKSLSDPK\ 8 | QLETEPDLFIRITPKPEQKVLEIRDSGIGMTKAELINNLGTIAKSGTKAFMEAL\ 9 | SAGADVSMIGQFGVGFYSLFLVADRVQVISKSNDDEQYIWESNAGGSFTVTLDE\ 10 | VNERIGRGTILRLFLKDDQLEYLEEKRIKEVIKRHSEFVAYPIQLVVTKEVEKE\ 11 | VPIPEEEKKDEEKKDEEKKDEDDKKPKLEEVDEEEEKKPKTKKVKEEVQEIEEL\ 12 | NKTKPLWTRNPSDITQEEYNAFYKSISNDWEDPLYVKHFSVEGQLEFRAILFIP\ 13 | KRAPFDLFESKKKKNNIKLYVRRVFITDEAEDLIPEWLSFVKGVVDSEDLPLNL\ 14 | SREMLQQNKIMKVIRKNIVKKLIEAFNEIAEDSEQFEKFYSAFSKNIKLGVHED\ 15 | TQNRAALAKLLRYNSTKSVDELTSLTDYVTRMPEHQKNIYYITGESLKAVEKSP\ 16 | FLDALKAKNFEVLFLTDPIDEYAFTQLKEFEGKTLVDITKDFELEETDEEKAER\ 17 | EKEIKEYEPLTKALKEILGDQVEKVVVSYKLLDAPAAIRTGQFGWSANMERIMK\ 18 | AQALRDSSMSSYMSSKKTFEISPKSPIIKELKKRVDEGGAQDKTVKDLTKLLYE\ 19 | TALLTSGFSLDEPTSFASRINRLISLGLNIDEDEETETAPEASTAAPVEEVPAD\ 20 | TEMEEVD" 21 | experiment: 'Complement' 22 | transform: 'log2(x)' 23 | authour: 'Jiang et al.' 
24 | year: 2013 25 | title: 'Latent Effects of Hsp90 Mutants Revealed at Reduced Expression Levels' 26 | lab: ['Bolon'] 27 | doi: '10.1371/journal.pgen.1003600' 28 | pmid: '23825969' 29 | url: 'https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1003600' 30 | input_files: 31 | - 'journal.pgen.1003600.s014.xlsx' 32 | source: 'SI - Table S3' 33 | qc: 34 | filter: False 35 | notes: 36 | -------------------------------------------------------------------------------- /data/studies/giacomelli_2018_tp53/giacomelli_2018_tp53.yaml: -------------------------------------------------------------------------------- 1 | study: 'giacomelli_2018_tp53' 2 | gene: 'TP53' 3 | uniprot_id: 'P04637' 4 | gene_type: 'TF' 5 | species: 'H. sapiens' 6 | seq: "MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGP\ 7 | DEAPRMPEAAPRVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAK\ 8 | SVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHE\ 9 | RCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNS\ 10 | SCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELP\ 11 | PGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPG\ 12 | GSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD" 13 | experiment: 'Complement' 14 | transform: '-x' 15 | authour: 'Giacomelli et al.' 
16 | year: 2018 17 | title: 'Mutational processes shape the landscape of TP53 mutations in human cancer' 18 | lab: ['Hahn', 'Root', 'Johannessen'] 19 | doi: '10.1038/s41588-018-0204-y' 20 | pmid: '30224644' 21 | url: 'https://www.nature.com/articles/s41588-018-0204-y' 22 | notes: "Score is in Z-score format across 3 different conditions:\ 23 | wt p53 background and nutlin3 to select for dominant negatives;\ 24 | null p53 background and nutlin3 to select for LOF; 25 | and null p53 background and etoposide to select for WT like.\ 26 | Here I only use p53 NULL, Etoposide because it aligns most\ 27 | directly with our objective of a functional protein.\ 28 | The WT for this paper differs by P72R from the Uniprot ID." 29 | input_files: 30 | - '41588_2018_204_MOESM5_ESM.xlsx' 31 | source: 'SI' 32 | qc: 33 | filter: False 34 | notes: 35 | -------------------------------------------------------------------------------- /meta/fasta/s_pyrogenes_cas9.fa: -------------------------------------------------------------------------------- 1 | >sp|Q99ZW2|CAS9_STRP1 CRISPR-associated endonuclease Cas9/Csn1 OS=Streptococcus pyogenes serotype M1 OX=301447 GN=cas9 PE=1 SV=1 2 | MDKKYSIGLDIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAE 3 | ATRLKRTARRRYTRRKNRICYLQEIFSNEMAKVDDSFFHRLEESFLVEEDKKHERHPIFG 4 | NIVDEVAYHEKYPTIYHLRKKLVDSTDKADLRLIYLALAHMIKFRGHFLIEGDLNPDNSD 5 | VDKLFIQLVQTYNQLFEENPINASGVDAKAILSARLSKSRRLENLIAQLPGEKKNGLFGN 6 | LIALSLGLTPNFKSNFDLAEDAKLQLSKDTYDDDLDNLLAQIGDQYADLFLAAKNLSDAI 7 | LLSDILRVNTEITKAPLSASMIKRYDEHHQDLTLLKALVRQQLPEKYKEIFFDQSKNGYA 8 | GYIDGGASQEEFYKFIKPILEKMDGTEELLVKLNREDLLRKQRTFDNGSIPHQIHLGELH 9 | AILRRQEDFYPFLKDNREKIEKILTFRIPYYVGPLARGNSRFAWMTRKSEETITPWNFEE 10 | VVDKGASAQSFIERMTNFDKNLPNEKVLPKHSLLYEYFTVYNELTKVKYVTEGMRKPAFL 11 | SGEQKKAIVDLLFKTNRKVTVKQLKEDYFKKIECFDSVEISGVEDRFNASLGTYHDLLKI 12 | IKDKDFLDNEENEDILEDIVLTLTLFEDREMIEERLKTYAHLFDDKVMKQLKRRRYTGWG 13 | RLSRKLINGIRDKQSGKTILDFLKSDGFANRNFMQLIHDDSLTFKEDIQKAQVSGQGDSL 14 | 
HEHIANLAGSPAIKKGILQTVKVVDELVKVMGRHKPENIVIEMARENQTTQKGQKNSRER 15 | MKRIEEGIKELGSQILKEHPVENTQLQNEKLYLYYLQNGRDMYVDQELDINRLSDYDVDH 16 | IVPQSFLKDDSIDNKVLTRSDKNRGKSDNVPSEEVVKKMKNYWRQLLNAKLITQRKFDNL 17 | TKAERGGLSELDKAGFIKRQLVETRQITKHVAQILDSRMNTKYDENDKLIREVKVITLKS 18 | KLVSDFRKDFQFYKVREINNYHHAHDAYLNAVVGTALIKKYPKLESEFVYGDYKVYDVRK 19 | MIAKSEQEIGKATAKYFFYSNIMNFFKTEITLANGEIRKRPLIETNGETGEIVWDKGRDF 20 | ATVRKVLSMPQVNIVKKTEVQTGGFSKESILPKRNSDKLIARKKDWDPKKYGGFDSPTVA 21 | YSVLVVAKVEKGKSKKLKSVKELLGITIMERSSFEKNPIDFLEAKGYKEVKKDLIIKLPK 22 | YSLFELENGRKRMLASAGELQKGNELALPSKYVNFLYLASHYEKLKGSPEDNEQKQLFVE 23 | QHKHYLDEIIEQISEFSKRVILADANLDKVLSAYNKHRDKPIREQAENIIHLFTLTNLGA 24 | PAAFKYFDTTIDRKRYTSTKEVLDATLIHQSITGLYETRIDLSQLGGD 25 | 26 | -------------------------------------------------------------------------------- /bin/figures/figureS3.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Produce figure S3 (Study Confounding) 3 | source('src/config.R') 4 | source('src/subtype_characterisation.R') 5 | 6 | dms <- read_tsv('data/combined_mutational_scans.tsv') %>% 7 | mutate(uniprot_id = unname(UNIPROT_IDS[gene])) 8 | 9 | umap2_breaks <- c(-2.5, 0, 2.5) 10 | 11 | # Alternative format study normalising newline and correcting Ras/Src capitals. 
Not used in paper, where RAS and SRC were incorrectly lowercase 12 | alt_format_study <- function(x){ 13 | yaml <- read_yaml(str_c('data/studies/', x, '/', x, '.yaml')) 14 | 15 | study <- str_c(yaml$authour, ' ', yaml$year, '\n(', ifelse(yaml$gene == "Ras", "RAS", ifelse(yaml$gene == "Src", "SRC", yaml$gene)), ')') 16 | 17 | return(study) 18 | } 19 | 20 | study_pretty <- sapply(unique(dms$study), alt_format_study) 21 | 22 | figure <- mutate(dms, study_pretty = study_pretty[study]) %>% 23 | ggplot(aes(x = umap1, y = umap2, colour = study_pretty)) + 24 | facet_wrap(~study_pretty, ncol = 4) + 25 | geom_point(data = dms, colour = 'grey90', shape = 20, size = 0.8) + 26 | geom_point(shape = 20, size = 0.8) + 27 | scale_y_continuous(breaks = umap2_breaks) + 28 | labs(x = 'UMAP1', y = 'UMAP2') + 29 | guides(colour = FALSE) 30 | ggsave('figures/4_figures/figureS3.pdf', figure, width = 183, height = 270, units = 'mm') 31 | ggsave('figures/4_figures/figureS3.png', figure, width = 183, height = 270, units = 'mm') 32 | ggsave('figures/4_figures/figureS3.tiff', figure, width = 183, height = 270, units = 'mm') 33 | ggsave('figures/4_figures/figureS3.eps', figure, width = 183, height = 270, units = 'mm', device=cairo_ps, fallback_resolution = 600) 34 | -------------------------------------------------------------------------------- /data/studies/ashenberg_2017_np/ashenberg_2017_np.yaml: -------------------------------------------------------------------------------- 1 | study: 'ashenberg_2017_np' 2 | gene: 'NP' 3 | uniprot_id: 'I6TAH8' 4 | gene_type: 'viral' 5 | species: 'Influenza' 6 | strain: 'Human adapted strain A/Aichi/2/1968, H3N2' 7 | seq: "MASQGTKRSYEQMETDGERQNATEIRASVGKMIDGIGRFYIQMCTELKLSDYEGRLIQNSLTIERMVLS\ 8 | AFDERRNKYLEEHPSAGKDPKKTGGPIYKRVDRKWMRELVLYDKEEIRRIWRQANNGDDATAGLTHMMI\ 9 | WHSNLNDTTYQRTRALVRTGMDPRMCSLMQGSTLPRRSGAAGAAVKGVGTMVMELIRMIKRGINDRNFW\ 10 | RGENGRKTRSAYERMCNILKGKFQTAAQRAMMDQVRESRNPGNAEIEDLIFLARSALILRGSVAHKSCL\ 11 | 
PACVYGPAVASGYDFEKEGYSLVGIDPFKLLQNSQVYSLIRPNENPAHKSQLVWMACNSAAFEDLRVLS\ 12 | FIRGTKVSPRGKLSTRGVQIASNENMDAMESSTLELRSRYWAIRTRSGGNTNQQRASAGQISVQPAFSV\ 13 | QRNLPFDKPTIMAAFTGNTEGRTSDMRAEIIRMMEGAKPEEMSFQGRGVFELSDERAANPIVPSFDMSN\ 14 | EGSYFFGDNAEEYDN" 15 | experiment: 'Complement' 16 | transform: 'None' 17 | authour: 'Ashenberg et al.' 18 | year: 2017 19 | title: 'Deep mutational scanning identifies sites in influenza nucleoprotein that affect viral inhibition by MxA' 20 | lab: ['Bloom'] 21 | doi: '10.1371/journal.ppat.1006288' 22 | pmid: '28346537' 23 | url: 'https://journals.plos.org/plospathogens/article?id=10.1371/journal.ppat.1006288' 24 | input_files: 25 | - 'journal.ppat.1006288.s013.csv' 26 | source: 'SI - S3 File (https://doi.org/10.1371/journal.ppat.1006288.s013)' 27 | qc: 28 | filter: True 29 | notes: "Only report relative selection when grown in cell with MxA and\ 30 | without, effectively normalising for most selective effects.\ 31 | Consequently the score doesn't align with full fitness (and \ 32 | wasn't meant to) as shown by no correlation with SIFT scores." 33 | -------------------------------------------------------------------------------- /data/studies/wagenaar_2014_braf/standardise_wagenaar_2014_braf.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Wagenaar et al. 
2014 (BRAF) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/wagenaar_2014_braf/wagenaar_2014_braf.yaml') 8 | dm_data <- read_xls('data/studies/wagenaar_2014_braf/raw/pcmr12171-sup-0011-TableS1.xls', skip = 3) %>% 9 | rename(position = Position, 10 | mut = acid, 11 | median_enrichment = Median, 12 | rep1_codon1 = `Replicate 1`, 13 | rep1_codon2 = ...5, 14 | rep1_codon3 = ...6, 15 | rep1_codon4 = ...7, 16 | rep1_codon5 = ...8, 17 | rep1_codon6 = ...9, 18 | rep2_codon1 = `Replicate 2`, 19 | rep2_codon2 = ...11, 20 | rep2_codon3 = ...12, 21 | rep2_codon4 = ...13, 22 | rep2_codon5 = ...14, 23 | rep2_codon6 = ...15, 24 | ic50_vs_brafV600E = BRAFV600E, 25 | individually_tested = `mutant?`, 26 | possible_by_single_sub = `substitution?`) %>% 27 | filter(!is.na(rep1_codon1) & !rep1_codon1 == 'Replicate 1') %>% 28 | mutate_at(vars(-mut, -individually_tested, -possible_by_single_sub, -ic50_vs_brafV600E), as.numeric)%>% 29 | mutate(wt = str_split(meta$seq, '')[[1]][position], 30 | raw_score = median_enrichment, 31 | transformed_score = -log2(median_enrichment), 32 | score = normalise_score(transformed_score), 33 | class = get_variant_class(wt, mut)) %>% 34 | select(position, wt, mut, score, transformed_score, raw_score, class) 35 | 36 | # Save output 37 | standardise_study(dm_data, meta$study, meta$transform) -------------------------------------------------------------------------------- /data/studies/sarkisyan_2016_gfp/standardise_sarkisyan_2016_gfp.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Sarkisyan et al. 
2016 (GFP) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/sarkisyan_2016_gfp/sarkisyan_2016_gfp.yaml') 8 | raw_data <- read_tsv('data/studies/sarkisyan_2016_gfp/raw/amino_acid_genotypes_to_brightness.tsv', skip = 1, 9 | col_names = c('mut', 'barcodes', 'median_brightness', 'std')) 10 | 11 | wt_brightness <- filter(raw_data, is.na(mut)) %>% pull(median_brightness) 12 | 13 | dm_data <- mutate(raw_data, n_mut = str_count(mut, ':') + 1) %>% 14 | filter(n_mut <= 3) %>% 15 | separate(mut, into = str_c('mut', 1:3), sep = ':', fill = 'right') %>% 16 | pivot_longer(cols = starts_with('mut'), names_to = 'n', names_prefix = 'mut', values_to = 'mut') %>% 17 | drop_na(mut) %>% 18 | select(-n, -barcodes, -std) %>% 19 | tidyr::extract(mut, into = c('wt', 'position', 'mut'), 'S([A-Z])([0-9]+)([A-Z*])', convert=TRUE, remove=FALSE) %>% 20 | mutate(position = position + 2) %>% # Numbered from 3rd residue for some reason 21 | arrange(position, mut) %>% 22 | group_by(position, wt, mut) %>% 23 | summarise(raw_score = if_else(1 %in% n_mut, mean(median_brightness[n_mut == 1], na.rm = TRUE), # Use value of single mut if available 24 | mean(median_brightness[n_mut <= 4], na.rm = TRUE))) %>% 25 | ungroup() %>% 26 | mutate(transformed_score = log2(raw_score / wt_brightness), 27 | score = normalise_score(transformed_score), 28 | class = get_variant_class(wt, mut)) 29 | 30 | # Save output 31 | standardise_study(dm_data, meta$study, meta$transform) 32 | -------------------------------------------------------------------------------- /data/studies/wagenaar_2014_braf/wagenaar_2014_braf.yaml: -------------------------------------------------------------------------------- 1 | study: 'wagenaar_2014_braf' 2 | gene: 'BRAF' 3 | uniprot_id: 'P15056' 4 | gene_type: 'Kinase' 5 | species: 'H. 
sapiens' 6 | seq: "MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEEVWNIKQMIKLTQEH\ 7 | IEALLDKFGGEHNPPSIYLEAYEEYTSKLDALQQREQQLLESLGNGTDFSVSSSASMDTV\ 8 | TSSSSSSLSVLPSSLSVFQNPTDVARSNPKSPQKPIVRVFLPNKQRTVVPARCGVTVRDS\ 9 | LKKALMMRGLIPECCAVYRIQDGEKKPIGWDTDISWLTGEELHVEVLENVPLTTHNFVRK\ 10 | TFFTLAFCDFCRKLLFQGFRCQTCGYKFHQRCSTEVPLMCVNYDQLDLLFVSKFFEHHPI\ 11 | PQEEASLAETALTSGSSPSAPASDSIGPQILTSPSPSKSIPIPQPFRPADEDHRNQFGQR\ 12 | DRSSSAPNVHINTIEPVNIDDLIRDQGFRGDGGSTTGLSATPPASLPGSLTNVKALQKSP\ 13 | GPQRERKSSSSSEDRNRMKTLGRRDSSDDWEIPDGQITVGQRIGSGSFGTVYKGKWHGDV\ 14 | AVKMLNVTAPTPQQLQAFKNEVGVLRKTRHVNILLFMGYSTKPQLAIVTQWCEGSSLYHH\ 15 | LHIIETKFEMIKLIDIARQTAQGMDYLHAKSIIHRDLKSNNIFLHEDLTVKIGDFGLATE\ 16 | KSRWSGSHQFEQLSGSILWMAPEVIRMQDKNPYSFQSDVYAFGIVLYELMTGQLPYSNIN\ 17 | NRDQIIFMVGRGYLSPDLSKVRSNCPKAMKRLMAECLKKKRDERPLFPQILASIELLARS\ 18 | LPKIHRSASEPSLNRAGFQTEDFSLYACASPKTPIQAGGYGAFPVH" 19 | experiment: 'Complement' 20 | transform: '-log2(x)' 21 | authour: 'Wagenaar et al.' 22 | year: 2014 23 | title: 'Resistance to vemurafenib resulting from a novel mutation in the BRAFV600E kinase domain' 24 | lab: ['Green', 'Bolon'] 25 | doi: '10.1111/pcmr.12171' 26 | pmid: '24112705' 27 | url: 'https://onlinelibrary.wiley.com/doi/full/10.1111/pcmr.12171' 28 | notes: 'Only retained variants they deemed significantly different from wt' 29 | input_files: 30 | - 'pcmr12171-sup-0011-TableS1.xls' 31 | source: 'SI - Table S1' 32 | qc: 33 | filter: True 34 | notes: 'Selecting for drug resistance in oncogenic state - not exactly normal function' 35 | -------------------------------------------------------------------------------- /data/studies/dorrity_2018_ste12/dorrity_2018_ste12.yaml: -------------------------------------------------------------------------------- 1 | study: 'dorrity_2018_ste12' 2 | gene: 'STE12' 3 | domain: 'DNA-binding' 4 | uniprot_id: 'P13574' 5 | gene_type: 'Transcription Factor' 6 | species: 'S. 
cerevisiae' 7 | seq: "MKVQITNSRTEEILKVQANNENDEVSKATPGEVEESLRLIGDLKFFLATAPVNWQENQII\ 8 | RRYYLNSGQGFVSCVFWNNLYYITGTDIVKCCLYRMQKFGREVVQKKKFEEGIFSDLRNL\ 9 | KCGIDATLEQPKSEFLSFLFRNMCLKTQKKQKVFFWFSVAHDKLFADALERDLKRESLNQ\ 10 | PSTTKPVNEPALSFSYDSSSDKPLYDQLLQHLDSRRPSSTTKSDNSPPKLESENFKDNEL\ 11 | VTVTNQPLLGVGLMDDDAPESPSQINDFIPQKLIIEPNTLELNGLTEETPHDLPKNTAKG\ 12 | RDEEDFPLDYFPVSVEYPTEENAFDPFPPQAFTPAAPSMPISYDNVNERDSMPVNSLLNR\ 13 | YPYQLSVAPTFPVPPSSSRQHFMTNRDFYSSNNNKEKLVSPSDPTSYMKYDEPVMDFDES\ 14 | RPNENCTNAKSHNSGQQTKQHQLYSNNFQQSYPNGMVPGYYPKMPYNPMGGDPLLDQAFY\ 15 | GADDFFFPPEGCDNNMLYPQTATSWNVLPPQAMQPAPTYVGRPYTPNYRSTPGSAMFPYM\ 16 | QSSNSMQWNTAVSPYSSRAPSTTAKNYPPSTFYSQNINQYPRRRTVGMKSSQGNVPTGNK\ 17 | QSVGKSAKISKPLHIKTSAYQKQYKINLETKARPSAGDEDSAHPDKNKEISMPTPDSNTL\ 18 | VVQSEEGGAHSLEVDTNRRSDKNLPDAT" 19 | experiment: 'Mating and invasion efficiency, measured by proxy growth through plasmid count' 20 | transform: 'None' 21 | authour: 'Dorrity et al.' 22 | year: 2018 23 | title: 'Preferences in a trait decision determined by transcription factor variants' 24 | lab: ['Queitsch', 'Fields'] 25 | doi: '10.1073/pnas.1805882115' 26 | pmid: '30068600' 27 | url: 'https://www.pnas.org/content/115/34/E7997' 28 | notes: "Use worst of two scores as their effects tend to be position exclusive - one \ 29 | domain governs invasion and one mating function" 30 | input_files: 31 | - 'pnas.1805882115.sd01.xlsx' 32 | - 'pnas.1805882115.sd02.xlsx' 33 | source: 'SI' 34 | qc: 35 | filter: False 36 | notes: 37 | -------------------------------------------------------------------------------- /data/studies/araya_2012_yap1/standardise_araya_2012_yap1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Araya et al. 
2012 (YAP1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/araya_2012_yap1/araya_2012_yap1.yaml') 8 | dm_data <- read_csv('data/studies/araya_2012_yap1/raw/urn_mavedb_00000002-a-2_scores.csv', skip = 4) %>% 9 | select(hgvs_pro, raw_score = score) %>% 10 | mutate(hgvs_pro = if_else(str_ends(hgvs_pro, ']'), str_sub(hgvs_pro, start = 4, end = -2), str_sub(hgvs_pro, start = 3)), 11 | n_mut = str_count(hgvs_pro, ';') + 1) %>% 12 | separate(hgvs_pro, str_c('mut', 1:max(.$n_mut)), sep = ';', fill = 'right') %>% 13 | pivot_longer(cols = starts_with('mut'), values_to = 'mut') %>% 14 | drop_na(mut) %>% 15 | select(-name) %>% 16 | tidyr::extract(mut, into = c('wt', 'position', 'mut'), "([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", convert = TRUE) %>% 17 | mutate(wt = AA_THREE_2_ONE[wt], mut = AA_THREE_2_ONE[mut], position = position + 169) %>% 18 | mutate(transformed_score = raw_score) %>% 19 | group_by(position, wt, mut) %>% 20 | summarise(transformed_score = ifelse(1 %in% n_mut, mean(transformed_score[n_mut == 1], na.rm=TRUE), mean(transformed_score[n_mut <= 2], na.rm=TRUE)), 21 | raw_score = ifelse(1 %in% n_mut, mean(raw_score[n_mut == 1], na.rm=TRUE), mean(raw_score[n_mut <= 2], na.rm=TRUE))) %>% 22 | ungroup() %>% 23 | mutate(class = get_variant_class(wt, mut), 24 | score = normalise_score(transformed_score)) %>% 25 | drop_na(score) %>% # Some mutant arent found in seqs with <= 2 variants 26 | arrange(position, mut) 27 | 28 | # Save output 29 | standardise_study(dm_data, meta$study, meta$transform) 30 | -------------------------------------------------------------------------------- /bin/figures/figureS8_27.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Produce figure S8-27 (Subtype Characterisation) 3 | source('src/config.R') 4 | source('src/subtype_characterisation.R') 5 | 6 | dms <- 
full_join(read_tsv('data/subtypes/final_subtypes.tsv'), 7 | read_tsv('data/combined_mutational_scans.tsv'), 8 | by = c('study', 'gene', 'position', 'wt')) %>% 9 | arrange(study, position) 10 | 11 | full_characterisation <- full_cluster_characterisation(dms) 12 | 13 | figures <- group_by(dms, wt) %>% 14 | group_map(~plot_full_characterisation(unique(.$cluster), full_characterisation, exclude_outliers = TRUE, global_scale = FALSE)) %>% 15 | map(extract2, 'overall') %>% 16 | set_names(str_c('figureS', 8:27)) 17 | 18 | save_plotlist(figures, 'figures/4_figures/', default_format = 'pdf') 19 | save_plotlist(figures, 'figures/4_figures/', default_format = 'png') 20 | save_plotlist(figures, 'figures/4_figures/', default_format = 'eps') 21 | save_plotlist(figures, 'figures/4_figures/', default_format = 'tiff') 22 | 23 | # Save single PDF version 24 | pdf('figures/4_figures/figureS8_27.pdf', onefile = TRUE, width = 11.7, height = 8.3) 25 | for (name in names(figures)){ 26 | p <- ggplot() + 27 | geom_blank() + 28 | lims(x=c(0,1), y=c(0,1)) + 29 | labs(caption = str_c('Figure ', str_remove(name, '^figure'))) + # Strip the 'figure' prefix so two-digit figures keep their 'S' (S10-S27) 30 | annotation_custom(figures[[name]], xmin = 0, xmax = 1, ymin = 0, ymax = 1) + 31 | theme(plot.caption = element_text(hjust = 0.5), 32 | panel.grid.major.y = element_blank(), 33 | axis.ticks = element_blank(), 34 | axis.title = element_blank(), 35 | axis.text = element_blank(), 36 | plot.margin = unit(c(8, 1, 1, 1), 'mm')) 37 | print(p) 38 | } 39 | dev.off() 40 | -------------------------------------------------------------------------------- /bin/analysis/2_subtypes/characterise_subtypes.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Characterise generated clusters 3 | source('src/config.R') 4 | source('src/subtype_characterisation.R') 5 | library(argparser) 6 | 7 | ### Parse args and setup ### 8 | parser <- arg_parser(description = 'Characterise AA subtypes', name = 'AA Subtype Characerisation') 9 | 
parser <- add_argument(parser, arg = 'subtypes', help = 'Root filename assigning positions to subtypes. Should have a tsv (subtype per position) and rds (list of cluster objects) file.') 10 | parser <- add_argument(parser, arg = '--dms', help = 'Path to DMS data', default = 'data/combined_mutational_scans.tsv') 11 | parser <- add_argument(parser, arg = '--figures', help = 'Directory to save figures', default = '.') 12 | args <- parse_args(parser) 13 | 14 | subtypes <- read_tsv(str_c(args$subtypes, '.tsv')) 15 | clusters <- readRDS(str_c(args$subtypes, '.rds')) 16 | 17 | dms <- read_tsv(args$dms) %>% 18 | left_join(subtypes, ., by = c('study', 'gene', 'position', 'wt')) 19 | 20 | ### Calculate Profiles for all clusters ### 21 | full_characterisation <- full_cluster_characterisation(dms) 22 | 23 | # Make profiles with permissive/outlier clusters excluded 24 | outlier_clusters <- filter(full_characterisation$profiles, str_detect(cluster, CLUSTER_PERMISSIVE_RE) | str_detect(cluster, CLUSTER_OUTLIER_RE)) %>% 25 | pull(cluster) %>% 26 | unique() 27 | selective_characterisation <- full_cluster_characterisation(filter(dms, !cluster %in% outlier_clusters)) 28 | n_clusters_selective <- nrow(full_characterisation$summary) 29 | 30 | ### Plot all cluster characterisation ### 31 | plots <- plot_cluster_characterisation(full_characterisation, selective_characterisation, clusters) 32 | 33 | ### Save Plots ### 34 | save_plotlist(plots, args$figures, verbose = 2) 35 | -------------------------------------------------------------------------------- /data/studies/kitzman_2015_gal4/kitzman_2015_gal4.yaml: -------------------------------------------------------------------------------- 1 | study: 'kitzman_2015_gal4' 2 | gene: 'GAL4' 3 | uniprot_id: 'P04386' 4 | gene_type: 'TF' 5 | species: 'S. 
cerevisiae' 6 | seq: "MKLLSSIEQACDICRLKKLKCSKEKPKCAKCLKNNWECRYSPKTKRSPLTRAHLTEVESR\ 7 | LERLEQLFLLIFPREDLDMILKMDSLQDIKALLTGLFVQDNVNKDAVTDRLASVETDMPL\ 8 | TLRQHRISATSSSEESSNKGQRQLTVSIDSAAHHDNSTIPLDFMPRDALHGFDWSEEDDM\ 9 | SDGLPFLKTDPNNNGFFGDGSLLCILRSIGFKPENYTNSNVNRLPTMITDRYTLASRSTT\ 10 | SRLLQSYLNNFHPYCPIVHSPTLMMLYNNQIEIASKDQWQILFNCILAIGAWCIEGESTD\ 11 | IDVFYYQNAKSHLTSKVFESGSIILVTALHLLSRYTQWRQKTNTSYNFHSFSIRMAISLG\ 12 | LNRDLPSSFSDSSILEQRRRIWWSVYSWEIQLSLLYGRSIQLSQNTISFPSSVDDVQRTT\ 13 | TGPTIYHGIIETARLLQVFTKIYELDKTVTAEKSPICAKKCLMICNEIEEVSRQAPKFLQ\ 14 | MDISTTALTNLLKEHPWLSFTRFELKWKQLSLIIYVLRDFFTNFTQKKSQLEQDQNDHQS\ 15 | YEVKRCSIMLSDAAQRTVMSVSSYMDNHNVTPYFAWNCSYYLFNAVLVPIKTLLSNSKSN\ 16 | AENNETAQLLQQINTVLMLLKKLATFKIQTCEKYIQVLEEVCAPFLLSQCAIPLPHISYN\ 17 | NSNGSAIKNIVGSATIAQYPTLPEENVNNISVKYVSPGSVGPSPVPLKSGASFSDLVKLL\ 18 | SNRPPSRNSPVTIPRSTPSHRSVTPFLGQQQQLQSLVPLTPSALFGGANFNQSGNIADSS\ 19 | LSFTFTNSSNGPNLITTQTNSQALSQPIASSNVHDNFMNNEITASKIDDGNNSKPLSPGW\ 20 | TDQTAYNAFGITTGMFNTTTMDDVYNYLFDDEDTPPNPKKE" 21 | experiment: 'Coupled growth' 22 | transform: 'log2(x)' 23 | authour: 'Kitzman et al.' 
24 | year: 2015 25 | title: 'Massively parallel single amino-acid mutagenesis' 26 | lab: ['Shendure', 'Fields'] 27 | doi: '10.1038/nmeth.3223' 28 | pmid: '25559584' 29 | url: 'https://www.nature.com/articles/nmeth.3223' 30 | notes: "Average of A (24h, 40h), B (40h) & C (40h, 64h) reps at different timepoints (since all timepoints correlate)" 31 | input_files: 32 | - '41592_2015_BFnmeth3223_MOESM306_ESM.xlsx' 33 | source: 'SI - Supplementary Data' 34 | qc: 35 | filter: False 36 | notes: 37 | -------------------------------------------------------------------------------- /bin/analysis/2_subtypes/compare_hclust_dynamic_deep_split.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Compare deepSplit values for dynamic tree cutting 3 | source('src/config.R') 4 | 5 | dms <- left_join(rename(read_tsv('data/subtypes/hclust_pca_no_sig_dynamic_cos_deep_0_no_permissive.tsv'), cluster_ds0 = cluster), 6 | rename(read_tsv('data/subtypes/hclust_pca_no_sig_dynamic_cos_deep_1_no_permissive.tsv'), cluster_ds1 = cluster), 7 | by = c("study", "gene", "position", "wt")) %>% 8 | left_join(read_tsv('data/combined_mutational_scans.tsv'), by = c("study", "gene", "position", "wt")) %>% 9 | select(cluster_ds0, cluster_ds1, everything()) 10 | 11 | # Get the mean correlation for profiles in a tibble 12 | get_mean_cor <- function(dms, method='spearman'){ 13 | f <- function(x, ...){ 14 | tibble_to_matrix(x, A:Y) %>% 15 | t() %>% 16 | cor(method = method) %>% 17 | tril() %>% 18 | mean() 19 | } 20 | 21 | group_map(dms, f) %>% 22 | unlist() %>% 23 | tibble(cluster=group_keys(dms)[[1]], mean_cor=.) 
24 | } 25 | 26 | mean_cors <- bind_rows(.id = 'deepSplit', 27 | "0"=group_by(dms, cluster_ds0) %>% get_mean_cor(), 28 | "1"=group_by(dms, cluster_ds1) %>% get_mean_cor()) %>% 29 | mutate(wt = str_sub(cluster, end = 1)) %>% 30 | filter(!str_detect(cluster, CLUSTER_OUTLIER_RE), !str_detect(cluster, CLUSTER_PERMISSIVE_RE)) 31 | 32 | p_mean_cors <- ggplot(mean_cors, aes(x = deepSplit, y = mean_cor, colour = wt)) + 33 | geom_boxplot() + 34 | geom_point() + 35 | facet_wrap(~wt) + 36 | scale_color_manual(values = AA_COLOURS) + 37 | guides(colour=FALSE) + 38 | labs(y = expression('Cluster Mean Spearmans'~rho)) 39 | ggsave('figures/2_subtypes/hclust_dynamic_deep_split_mean_corelation.pdf', units = 'cm', width = 15, height = 15) -------------------------------------------------------------------------------- /data/studies/heredia_2018_ccr5/standardise_heredia_2018_ccr5.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Heredia et al. 
2018 (CCR5) 3 | 4 | source('src/config.R') 5 | source('src/study_standardising.R') 6 | 7 | # Import and process data 8 | meta <- read_yaml('data/studies/heredia_2018_ccr5/heredia_2018_ccr5.yaml') 9 | dm_data <- read_xlsx('data/studies/heredia_2018_ccr5/raw/GSE100368_enrichment_ratios_CCR5.xlsx', skip = 7, 10 | col_names = c('wt', 'mut', 'reads_l1', 'surface_exp_fitc_l1_r1', 'surface_exp_fitc_l1_r2', 'binding_2d7_l1_r1', 'binding_2d7_l1_r2', 11 | 'empty', 'reads_l2', 'surface_exp_alexa_l2_r1', 'surface_exp_alexa_l2_r2', 'binding_gp120_cd4_l2_r1', 'binding_gp120_cd4_l2_r2')) %>% 12 | select(-empty) %>% 13 | mutate(wt = rep(wt[!is.na(wt)], each = 21)) %>% 14 | tidyr::extract(wt, into = c('wt', 'position'), '([A-Z])([0-9]+)', convert = TRUE) %>% 15 | 16 | # Average replicates for binding (which also incorporate surface expression) 17 | mutate(binding_2d7_l1 = rowMeans(select(., binding_2d7_l1_r1, binding_2d7_l1_r2), na.rm = TRUE) %>% replace_na(NA), 18 | binding_gp120_cd4_l2 = rowMeans(select(., binding_gp120_cd4_l2_r1, binding_gp120_cd4_l2_r2), na.rm = TRUE) %>% replace_na(NA)) %>% 19 | 20 | # Average binding scores for two conditions (0.61 correlation) 21 | mutate(raw_score = rowMeans(select(., binding_2d7_l1, binding_gp120_cd4_l2), na.rm = TRUE) %>% replace_na(NA), 22 | transformed_score = raw_score, 23 | score = normalise_score(transformed_score), 24 | class = get_variant_class(wt, mut)) %>% 25 | select(position, wt, mut, score, transformed_score, raw_score, class) %>% 26 | drop_na(score) # not all measured 27 | 28 | 29 | # Save output 30 | standardise_study(dm_data, meta$study, meta$transform) 31 | -------------------------------------------------------------------------------- /data/studies/heredia_2018_cxcr4/standardise_heredia_2018_cxcr4.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Heredia et al. 
2018 (CXCR4) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/heredia_2018_cxcr4/heredia_2018_cxcr4.yaml') 8 | dm_data <- read_xlsx('data/studies/heredia_2018_cxcr4/raw/GSE100368_enrichment_ratios_CXCR4.xlsx', skip = 6, 9 | col_names = c('wt', 'mut', 'reads', 10 | 'surface_exp_fitc_r1', 'surface_exp_fitc_r2', 11 | 'binding_12g5_r1', 'binding_12g5_r2', 12 | 'surface_exp_alexa_r1', 'surface_exp_alexa_r2', 13 | 'binding_cxcl12_r1', 'binding_cxcl12_r2')) %>% 14 | mutate(wt = rep(wt[!is.na(wt)], each = 21)) %>% 15 | tidyr::extract(wt, into = c('wt', 'position'), '([A-Z])([0-9]+)', convert = TRUE) %>% 16 | 17 | # Average replicates for binding (which also incorporate surface expression) 18 | mutate(binding_12g5 = rowMeans(select(., binding_12g5_r1, binding_12g5_r2), na.rm = TRUE) %>% replace_na(NA), 19 | binding_cxcl12 = rowMeans(select(., binding_cxcl12_r1, binding_cxcl12_r2), na.rm = TRUE) %>% replace_na(NA)) %>% 20 | 21 | # Average binding scores for two conditions (0.48 correlation) 22 | mutate(raw_score = rowMeans(select(., binding_12g5, binding_cxcl12), na.rm = TRUE) %>% replace_na(NA), 23 | transformed_score = raw_score, 24 | score = normalise_score(transformed_score), 25 | class = get_variant_class(wt, mut)) %>% 26 | select(position, wt, mut, score, transformed_score, raw_score, class) %>% 27 | drop_na(score) # not all measured 28 | 29 | # Save output 30 | standardise_study(dm_data, meta$study, meta$transform) 31 | -------------------------------------------------------------------------------- /bin/analysis/2_subtypes/sequence_context.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Analyse subtype sequence context 3 | library(Biostrings) 4 | source('src/config.R') 5 | library(ggseqlogo) 6 | 7 | dms <- full_join(read_tsv('data/subtypes/final_subtypes.tsv'), 8 | read_tsv('data/combined_mutational_scans.tsv'), 
9 | by = c('study', 'gene', 'position', 'wt')) %>% 10 | arrange(study, position) 11 | 12 | fasta <- map(dir('data/fasta', full.names = TRUE), readAAStringSet) %>% reduce(c) 13 | 14 | extract_seq_context <- function(seq, window=10){ 15 | s <- as.matrix(seq)[,1] 16 | s <- c(rep('-', window), s, rep('-', window)) 17 | w <- map_chr((window + 1):(length(s) - window), ~str_c(s[(. - window):(. + window)], collapse = '')) 18 | tibble(position = 1:length(seq), wt = as.matrix(seq)[,1], seq_context = w) 19 | } 20 | 21 | windows <- lapply(fasta, extract_seq_context) %>% 22 | bind_rows(.id = 'geneid') 23 | 24 | build_profiles <- function(tbl, ...){ 25 | seq <- AAStringSet(tbl$seq_context) 26 | consensusMatrix(seq) 27 | } 28 | 29 | cluster_contexts <- select(dms, cluster, study, gene, position, wt) %>% 30 | mutate(geneid = gene_to_filename(gene)) %>% 31 | left_join(windows, by = c('geneid', 'position', 'wt')) %>% 32 | select(-geneid) %>% 33 | group_by(cluster) 34 | cluster_contexts <- group_map(cluster_contexts, build_profiles) %>% 35 | set_names(group_keys(cluster_contexts)$cluster) 36 | 37 | context_plots <- map(sort(unique(dms$wt)), 38 | ~labeled_plot( 39 | ggseqlogo(cluster_contexts[str_starts(names(cluster_contexts), .)], method = 'probability'), 40 | unit = 'cm', height = 20, width = 30) 41 | ) %>% 42 | set_names(sort(unique(dms$wt))) 43 | 44 | save_plotlist(context_plots, 'figures/2_subtypes/final_subtypes/sequence_contexts') 45 | -------------------------------------------------------------------------------- /data/studies/starita_2013_ube4b/standardise_starita_2013_ube4b.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Starita et al. 
2013 (UBE4B) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/starita_2013_ube4b/starita_2013_ube4b.yaml') 8 | dm_data <- read_xlsx('data/studies/starita_2013_ube4b/raw/sd01.xlsx', na = c('NA', '')) %>% 9 | filter(!seqID == 'NA-NA') %>% # Filter WT 10 | rename(raw_score = log2_ratio) %>% 11 | separate(seqID, into = c('position', 'mut'), sep='-') %>% 12 | select(-nscor_log2_ratio) %>% 13 | mutate(n_mut = sapply(position, function(x){str_count(x, ',') + 1})) %>% 14 | separate(mut, str_c('mut', 1:max(.$n_mut)), sep = ',', fill = 'right') %>% 15 | separate(position, str_c('position', 1:max(.$n_mut)), sep = ',', fill = 'right') %>% 16 | pivot_longer(starts_with('position'), names_to = 'pos_num', names_prefix = 'position', values_to = 'position') %>% 17 | drop_na(position) %>% 18 | pivot_longer(starts_with('mut'), names_to = 'mut_num', names_prefix = 'mut', values_to = 'mut') %>% 19 | drop_na(mut) %>% 20 | filter(pos_num == mut_num) %>% 21 | select(-pos_num, -mut_num) %>% 22 | group_by(position, mut) %>% 23 | summarise(raw_score = ifelse(1 %in% n_mut, mean(raw_score[n_mut == 1], na.rm=TRUE), mean(raw_score[n_mut <= 3], na.rm=TRUE))) %>% 24 | ungroup() %>% 25 | mutate(transformed_score = raw_score, 26 | score = normalise_score(transformed_score), 27 | position = as.integer(position) + 1072, # tested region starts at +1072 according to Starita (slightly before uniprot UBOX) This does lead to ref seq aligning 28 | wt = str_split(meta$seq, '')[[1]][position], 29 | class = get_variant_class(wt, mut)) %>% 30 | drop_na(score) # Not all variants present in seqs with <= 3 variants 31 | 32 | # Save output 33 | standardise_study(dm_data, meta$study, meta$transform) 34 | 35 | -------------------------------------------------------------------------------- /bin/analysis/0_data/summarise_standardised_data.R: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env Rscript 2 | # Summarise standardised dataset 3 | source('src/config.R') 4 | dms <- read_tsv('data/long_combined_mutational_scans.tsv') 5 | 6 | p_score_dist <- ggplot(dms, aes(x=imputed_score, fill=ifelse(!is.na(score), 'Experiment', 'Imputed'))) + 7 | facet_wrap(~study, ncol = 4, labeller = as_labeller(sapply(unique(dms$study), format_study, max_width = 28)), scales = 'free') + 8 | geom_histogram(bins=30) + 9 | labs(x = 'Normalised ER', y='Count') + 10 | scale_fill_manual(values = c(Experiment='cornflowerblue', Imputed='firebrick2')) + 11 | guides(fill=guide_legend(title = '')) 12 | ggsave('figures/0_data/standardised_distributions.pdf', p_score_dist, units = 'cm', height = 35, width = 30) 13 | 14 | summary_tbl <- group_by(dms, study, position, wt) %>% 15 | summarise(fx = sum(!is.na(total_energy)) == 19, 16 | sift = all(!is.na(sift)), 17 | phi = all(!is.na(phi)), 18 | psi = all(!is.na(psi)), 19 | sa = all(!is.na(all_atom_abs))) %>% 20 | ungroup() %>% 21 | summarise(Total = n(), 22 | `FoldX Results` = sum(fx), 23 | `SIFT Results` = sum(sift), 24 | Phi = sum(phi), 25 | Psi = sum(psi), 26 | `Surface Accessibility` = sum(sa)) %>% 27 | pivot_longer(everything(), names_to = 'metric', values_to = 'Count') %>% 28 | mutate(metric = factor(metric, levels = metric[order(Count, c(1, rep(0, length(metric) - 1)))])) # Second order arg ensures total is always first 29 | 30 | p_position_summary <- ggplot(summary_tbl, aes(x = metric, y = Count, fill = metric)) + 31 | geom_col(width = 0.5) + 32 | geom_text(aes(label = Count), nudge_y = 500) + 33 | coord_flip() + 34 | guides(fill=FALSE) + 35 | labs(x='', title = 'Summary of data collected after filtering') + 36 | theme(panel.grid.major.y = element_blank(), 37 | axis.ticks.y = element_blank()) 38 | ggsave('figures/0_data/position_data_summary.pdf', p_position_summary, units = 'cm', height = 8, width = 15) 39 | -------------------------------------------------------------------------------- 
/data/studies/starita_2013_ube4b/starita_2013_ube4b.yaml: -------------------------------------------------------------------------------- 1 | study: 'starita_2013_ube4b' 2 | gene: 'UBE4B' 3 | domain: 'UBOX' 4 | uniprot_id: 'Q9ES00' 5 | gene_type: 'E3 Ubiquitin Ligase' 6 | species: 'M. musculus' 7 | seq: "MEELSADEIRRRRLARLAGGQTSQPTTPLTSPQRENPPGPPIAASAPGPSQSLGLNVHNM\ 8 | TPATSPIGAAGVAHRSQSSEGVSSLSSSPSNSLETQSQSLSRSQSMDIDGVSCEKSMSQV\ 9 | DVDSGIENMEVDENDRREKRSLSDKEPSSGPEVSEEQALQLVCKIFRVSWKDRDRDVIFL\ 10 | SSLSAQFKQNPKEVFSDFKDLIGQILMEVLMMSTQTRDENPFASLTATSQPIATAARSPD\ 11 | RNLMLNTGSSSGTSPMFCNMGSFSTSSLSSLGASGGASNWDSYSDHFTIETCKETDMLNY\ 12 | LIECFDRVGIEEKKAPKMCSQPAVSQLLSNIRSQCISHTALVLQGSLTQPRSLQQPSFLV\ 13 | PYMLCRNLPYGFIQELVRTTHQDEEVFKQIFIPILQGLALAAKECSLESDYFKYPLMALG\ 14 | ELCETKFGKTHPMCNLVASLPLWLPKSLSPGSGRELQRLSYLGAFFSFSVFAEDDAKVVE\ 15 | KYFSGPAITLENTRVVSQSLQHYLELGRQELFKILHSILLNGETREAALSYMAALVNANM\ 16 | KKAQMQADDRLVSTDGFMLNLLWVLQQLSTKIKLETVDPTYIFHPRCRITLPNDETRINA\ 17 | TMEDVNERLTELYGDQPPFSEPKFPTECFFLTLHAHHLSILPSCRRYIRRLRAIRELNRT\ 18 | VEDLKNNESQWKDSPLATRHREMLKRCKTQLKKLVRCKACADAGLLDESFLRRCLNFYGL\ 19 | LIQLMLRILDPAYPDVTLPLNSEVPKVFAALPEFYVEDVAEFLFFIVQYSPQVLYEPCTQ\ 20 | DIVMFLVVMLCNQNYIRNPYLVAKLVEVMFMTNPSVQPRTQKFFEMIENHPLSTKLLVPS\ 21 | LMKFYTDVEHTGATSEFYDKFTIRYHISTIFKSLWQNIAHHGTFMEEFNSGKQFVRYINM\ 22 | LINDTTFLLDESLESLKRIHEVQEEMKNKEQWDQLPRDQQQARQSQLAQDERVSRSYLAL\ 23 | ATETVDMFHLLTKQVQKPFLRPELGPRLAAMLNFNLQQLCGPKCRDLKVENPEKYGFEPK\ 24 | KLLDQLTDIYLQLDCARFAKAIADDQRSYSKELFEEVISKMRKAGIKSTIAIEKFKLLAE\ 25 | KVEEIVAKNARAEIDYSDAPDEFRDPLMDTLMTDPVRLPSGTVMDRSIILRHLLNSPTDP\ 26 | FNRQMLTESMLEPVPELKEQIQAWMREKQSSDH" 27 | experiment: 'Activity' 28 | transform: 'None' 29 | authour: 'Starita et al.' 
30 | year: 2013 31 | title: 'Activity-enhancing mutations in an E3 ubiquitin ligaseidentified by high-throughput mutagenesis' 32 | lab: ['Klevit', 'Fields', 'Shendure', 'Fowler'] 33 | doi: '10.1073/pnas.1303309110' 34 | pmid: '23509263' 35 | url: 'http://www.pnas.org/content/110/14/E1263.long' 36 | input_files: 37 | - 'sd01.xlsx' 38 | source: 'SI - Dataset S1' 39 | qc: 40 | filter: False 41 | notes: 42 | -------------------------------------------------------------------------------- /meta/fasta/h_sapiens_brca1.fa: -------------------------------------------------------------------------------- 1 | >sp|P38398|BRCA1_HUMAN Breast cancer type 1 susceptibility protein OS=Homo sapiens OX=9606 GN=BRCA1 PE=1 SV=2 2 | MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQ 3 | CPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKD 4 | EVSIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYI 5 | ELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEISLDSAKKAACEFSETDVTNTEHHQ 6 | PSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVE 7 | KAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPC 8 | SENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVD 9 | EYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTEN 10 | LIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTE 11 | QNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNI 12 | HNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPV 13 | RHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKE 14 | FVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQ 15 | ESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHS 16 | RETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVT 17 | FECEQKEENQGKNESNIKPVQTVNITAGFPVVGQKDKPVDNAKCSIKGGSRFCLSSQFRG 18 | NETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNENIP 19 | STVSTISRNNIRENVFKEASSSNINEVGSSTNEVGSSINEIGSSDENIQAELGRNRGPKL 20 | NAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEYEEVVQTVNTDFSPYLISDNLEQPMGSS 21 | 
HASQVCSETPDDLLDDGEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTHTHLAQ 22 | GYRRGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPSQSTRHSTVATECLSKNTEENL 23 | LSLKNSLNDCSNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANTNTQDPFLIGS 24 | SKQMRHQSESQGVGLSDKELVSDDEERGTGLEENNQEEQSMDSNLGEAASGCESETSVSE 25 | DCSGLSSQSDILTTQQRDTMQHNLIKLQQEMAELEAVLEQHGSQPSNSYPSIISDSSALE 26 | DLRNPEQSTSEKAVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSKNKEPGVERSSPSK 27 | CPSLDDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEESGPHDLTETSYLPRQDLEG 28 | TPYLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTT 29 | DTAGYNAMEESVSREKPELTASTERVNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLI 30 | TEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDV 31 | VNGRNHQGPKRARESQDRKIFRGLEICCYGPFTNMPTDQLEWMVQLCGASVVKELSSFTL 32 | GTGVHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDTYLIPQIPH 33 | SHY 34 | 35 | -------------------------------------------------------------------------------- /src/pymol_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Library for visualising data in PyMOL, paticularly projecting arbitary values onto proteins. 3 | """ 4 | from itertools import cycle 5 | 6 | from colour_spectrum import ColourSpectrum 7 | import subtypes_utils as su 8 | 9 | def quick_highlight(cmd, dms, gene, factor='cluster_num', res=None, root='data/pdb'): 10 | """ 11 | Load a new protein and highlight a given factor. 12 | """ 13 | cmd.delete('all') 14 | cmd.load(f'{root}/{su.gene_to_filename(gene)}.pdb') 15 | 16 | if res: 17 | pdb_dms = dms[(dms.gene == gene) & (dms.wt == res)] 18 | else: 19 | pdb_dms = dms[dms.gene == gene] 20 | pdb_dms = pdb_dms.dropna(subset=['pdb_position']) 21 | 22 | colourer = su.SubtypesColourMap.lookup_map(factor, dms[factor]) 23 | project_landscape(cmd, pdb_dms.pdb_chain, pdb_dms.pdb_position, pdb_dms[factor], colourer) 24 | return pdb_dms 25 | 26 | def project_landscape(cmd, chain, position, value, colourer=None, na_colour=None): 27 | """ 28 | Colour specific residues according to a colourmap. 
colourer must return a Hexcode 29 | when called with a value as well as have an 'na_colour' attribute if no na_colour 30 | is specifically supplied. Chain can either be a single identifier (str) or an 31 | iterable of identifiers 32 | """ 33 | if colourer is None: 34 | colourer = ColourSpectrum(min(value), max(value), colourmap='viridis') 35 | 36 | if isinstance(chain, str): 37 | chain = cycle([chain]) 38 | 39 | colour_residues(cmd, *zip(chain, position, [colourer(val) for val in value]), 40 | base_colour=na_colour or colourer.na_colour) 41 | 42 | def colour_residues(cmd, *args, base_colour=None): 43 | """ 44 | Colour multiple residues programatically. Each argument should be a 45 | (chain, position index, hex code) tuple 46 | """ 47 | if base_colour is not None: 48 | cmd.color(base_colour, 'polymer') 49 | 50 | for chn, pos, col in args: 51 | pos = int(pos) 52 | pos = f'\{pos}' if pos < 0 else pos # Negative indices must be escaped 53 | cmd.color(col, f'polymer and chain {chn} and resi {pos}') 54 | -------------------------------------------------------------------------------- /meta/README.md: -------------------------------------------------------------------------------- 1 | # Meta Data folder 2 | 3 | This folder contains various bits of metadata required to execute the project 4 | and a few summary tables output by the pipeline. 5 | 6 | Metadata: 7 | * **structures.yaml** - details the SwissMODEL structures chosen for the studied proteins. 8 | Each gene is linked to a SwissMODEL template ID, the type of structure it is (e.g. 9 | x-ray, homology model etc.) and a list of sections to use. Each section is defined by 10 | a chain and region, with an optional offset for sequence numbering compared to Uniprot. 11 | This manual system could theoretically be replaced by automated model selection via the 12 | SwissMODEL API, but the complexity of the choice and the small number of proteins made 13 | manual selection easier. 
This file is used downstream to select regions of the PDB to 14 | process (e.g. with FoldX and Naccess) and to convert results to Uniprot sequence numbering. 15 | If a new study is added for a new protein a model must be chosen and added here. 16 | * **study\_template.yaml** - Template YAML file for adding new studies to the project 17 | * **subtypes/** - folder containing YAML config files for the various clustering approaches 18 | to extracting amino acid subtypes. 19 | * **residue\_hydrophobicity.tsv** - Average amino acid hydrophobicity table, sourced from 20 | [Bandyopadhyay & Mehler (2008)](https://onlinelibrary.wiley.com/doi/full/10.1002/prot.21958) 21 | * **fasta/** - can contain .fa files of the form {species/strain}\_{gene}.fa that act as 22 | master copies when validating study config files. Not required for normal execution. 23 | In general the fasta files are sourced from Uniprot, although some come from the 24 | studies directly or have manual edits based on the mutations used in a study. 25 | 26 | Generated Files: 27 | * **overall\_summary** - Summary of the project overall, giving the total number of 28 | studies, genes, etc. processed. 29 | * **gene\_summary.tsv** - Table summarising the properties of the genes included in the 30 | project, including stats such as the number of mutants and mutant coverage. 31 | * **study\_summary.tsv** - Table summarising the properties of the studies included in 32 | the project. 
33 | -------------------------------------------------------------------------------- /docs/combined_mutational_scans_readme.txt: -------------------------------------------------------------------------------- 1 | # Combined Mutational Scans Dataset EV2 2 | 3 | Full deep landscape dataset, containing the normalised ER scores and additional annotations for all positions 4 | 5 | ## Columns 6 | 7 | study: Study the position comes from 8 | gene: Gene 9 | position: Position in gene (relative to canonical Uniprot sequence) 10 | wt: Wild-type amino acid 11 | A-Y: Normalised ER score when the position is mutated to each amino acid 12 | log10_sift_A-Y: Log10 SIFT4G score for each substitution at this position 13 | mean_score: Mean normalised ER score across all substitutions 14 | mean_sift: Mean log10 SIFT4G score across all substitutions 15 | total_energy: Mean FoldX ddG prediction (kJ/mol) across all substitutions 16 | backbone_hbond-energy_ionisation: Mean prediction for each energy term in FoldX's force field model 17 | phi: Phi backbone angle 18 | psi: Psi backbone angle 19 | all_atom_abs: All atom absolute surface accessibility as calculated by Naccess 20 | all_atom_rel: All atom relative surface accessibility as calculated by Naccess 21 | side_chain_abs: Side chain absolute surface accessibility as calculated by Naccess 22 | side_chain_rel: Side chain relative surface accessibility as calculated by Naccess 23 | backbone_abs: Backbone atom absolute surface accessibility as calculated by Naccess 24 | backbone_rel: Backbone atom relative surface accessibility as calculated by Naccess 25 | non_polar_abs: Non-polar absolute surface accessibility as calculated by Naccess 26 | non_polar_rel: Non-polar relative surface accessibility as calculated by Naccess 27 | polar_abs: Polar absolute surface accessibility as calculated by Naccess 28 | polar_rel: Polar relative surface accessibility as calculated by Naccess 29 | within_10_0_A-Y: Count of each amino acid within a 10 angstrom 
sphere centered at the position 30 | angstroms_to_A-Y: Distance to each amino acid, in angstroms 31 | ss_g-t: Porter5 predictions for each DSSP secondary structure class 32 | hydrophobicity: Hydrophobicity score from Bandyopadhyay and Mehler (2008) 33 | PC1-20: Principal component projection of the normalised ER score profile 34 | tSNE1-2: tSNE coordinates of the normalised ER score profile 35 | umap1-2: UMAP projection of the normalised ER score profile 36 | -------------------------------------------------------------------------------- /bin/utils/farm_sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # TODO: Convert to python script with more convenient behaviour 3 | # Script to sync between a local and remote directory(s) 4 | # 5 | # Setup for a given project folder using the config below 6 | # 7 | # Will look for global and local .rsync_exclude/rsync_include files 8 | # .rsync_exclude in $HOME is assumed to be global prefs 9 | # Manual include overrides all excludes 10 | # 11 | # Sync only desired folders based on args 12 | 13 | ## Config ## 14 | project_name="Amino Acid Subtypes Project" 15 | local_dir=$HOME/phd/subtypes 16 | remote_dirs=( "ebi:/hps/research1/beltrao/ally/subtypes" ) 17 | folders=( "data" "meta" "figures" "docs" "logs" ) 18 | 19 | ## Colours for printf ## 20 | green=$(tput setaf 2) 21 | magenta=$(tput setaf 5) 22 | bold=$(tput bold) 23 | normal=$(tput sgr0) 24 | 25 | ## Check for presence of include/exclude files ## 26 | rsync_options=( -a -u -h ) 27 | 28 | if [ -e "$HOME/.rsync_exclude" ]; then 29 | rsync_options+=( --exclude-from "$HOME/.rsync_exclude" ) 30 | fi 31 | 32 | if [ -e "$local_dir/.rsync_include" ]; then 33 | rsync_options+=( --include-from "$local_dir/.rsync_include" --exclude"="'*') 34 | elif [ -e "$local_dir/.rsync_exclude" ]; then 35 | rsync_options+=( --exclude-from "$local_dir/.rsync_exclude" ) 36 | fi 37 | 38 | ## Sync function ## 39 | syncr () { 40 | rsync 
-v --dry-run "${rsync_options[@]}" "$1" "$2" 41 | 42 | read -p "Transfer? " -n 1 -r 43 | echo 44 | if [[ $REPLY =~ ^[Yy]$ ]] 45 | then 46 | rsync "${rsync_options[@]}" "$1" "$2" 47 | fi 48 | } 49 | 50 | ## Override folders if argument passed ## 51 | if [ $# -ne 0 ]; then 52 | folders=( "$@" ) 53 | fi 54 | 55 | ## Perform sync ## 56 | printf "%s\n" "${magenta}${bold}Rsyncing $project_name${normal}" 57 | for r in "${remote_dirs[@]}"; do 58 | read -p "Sync to remote: $r? " -n 1 -r 59 | echo 60 | if [[ $REPLY =~ ^[Yy]$ ]]; then 61 | for f in "${folders[@]}"; do 62 | printf "\n%s\n%s\n" "${green}${bold}Folder: $f${normal}" "${green}Local -> Remote${normal}" 63 | syncr "$local_dir/$f/" "$r/$f" 64 | printf "\n%s\n" "${green}Local <- Remote${normal}" 65 | syncr "$r/$f/" "$local_dir/$f" 66 | done 67 | fi 68 | done 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/foldx_eqn.md: -------------------------------------------------------------------------------- 1 | # Force Field 2 | ## Overall equation: 3 | ΔG = Wvdw \* ΔGvdw + WsolvH \* ΔGsolvH + WsolvP \* ΔGsolvP + ΔGwb + ΔGhbond + ΔGel + ΔGKon + Wmc \* T \* ΔSmc + Wsc \* T \* ΔSsc 4 | 5 | Basically: 6 | ΔG = ΔH - TΔS 7 | 8 | ## Terms: 9 | * ΔGvdw = Sum of van der Waals of all atoms wrt same interactions with solvent 10 | * ΔGsolvH = Difference in solvation energy for apolar groups 11 | * ΔGsolvP = Difference in solvation energy for polar groups 12 | * ΔGwb = Extra stabilising energy for H2O making 2 h-bonds to protein 13 | * ΔGhbond = Energy change from forming h-bonds within the protein rather than with solvent 14 | * ΔGel = Electrostatic contribution of charged groups, including helix dipole 15 | * ΔGKon = Electrostatic effect on association constant k[on] 16 | * T = Temperature (K) 17 | * ΔSmc = Entropy of fixing main chain in conformation 18 | * ΔSsc = Entropy of fixing side chains in conformation 19 | * Wxxx = Weighting terms, all 1 apart from Wvdw = 0.33 20 | 21 | # 
BuildModel output terms 22 | From: http://foldxsuite.crg.eu/command/BuildModel 23 | 24 | * Total Energy - predicted overall stability 25 | * Backbone Hbond - contribution of backbone Hbonds 26 | * Sidechain Hbond - contribution of sidechain-sidechain and sidechain-backbone Hbonds 27 | * Van der Waals - contribution of the VanderWaals 28 | * Electrostatics - electrostatic interactions 29 | * Solvation Polar - penalization for burying polar groups 30 | * Solvation Hydrophobic - contribution of hydrophobic groups 31 | * Van der Waals clashes - energy penalization due to interresidue VanderWaals’ clashes 32 | * Entropy Side Chain - entropy cost of fixing the side chain 33 | * Entropy Main Chain - entropy cost of fixing the main chain 34 | * Sloop Entropy - ONLY FOR ADVANCED USERS 35 | * Mloop Entropy - ONLY FOR ADVANCED USERS 36 | * Cis Bond - cost of having a cis peptide bond 37 | * Torsional Clash - intraresidue VanderWaals’ torsional clashes 38 | * Backbone Clash - backbone-backbone VanderWaals. 
*not considered in the total* 39 | * Helix Dipole - electrostatic contribution of helix dipole 40 | * Water Bridge - water bridges 41 | * Disulfide - disulfide bonds 42 | * Electrostatic Kon - electrostatic interaction between molecules in the precomplex 43 | * Partial Covalent Bonds - interactions with bound metals 44 | * Energy Ionisation - ionisation energy 45 | * Entropy Complex - entropy cost of forming a complex 46 | -------------------------------------------------------------------------------- /bin/analysis/2_subtypes/all_positions.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Perform clustering on all positions at once 3 | source('src/config.R') 4 | source('src/subtype_characterisation.R') 5 | 6 | dms <- read_tsv('data/combined_mutational_scans.tsv') 7 | 8 | ### Make Dynamic Hclust Clusters ### 9 | clusters <- make_dynamic_hclust_clusters(dms, PC2:PC20, distance_method = 'cosine', treecut_args = list(deepSplit=0)) 10 | dms <- mutate(clusters$tbl, 11 | cluster = str_c('X', cluster) %>% relabel_outlier_clusters(), 12 | cluster = factor(cluster, levels = sort_clusters(unique(cluster)))) %>% 13 | select(cluster, everything()) 14 | 15 | ### Analyse clusters ### 16 | n_clusters <- n_distinct(dms$cluster) 17 | plots <- list() 18 | 19 | plots$umap <- (ggplot(dms, aes(x=umap1, y=umap2, colour=wt)) + 20 | facet_wrap(~cluster) + 21 | scale_colour_manual(values = AA_COLOURS) + 22 | geom_point() + 23 | labs(x = 'UMAP1', y = 'UMAP2')) %>% 24 | labeled_plot(units='cm', height=25, width=25) 25 | 26 | plots$tsne <- (ggplot(dms, aes(x=tSNE1, y=tSNE2, colour=wt)) + 27 | facet_wrap(~cluster) + 28 | geom_point() + 29 | scale_colour_manual(values = AA_COLOURS)) %>% 30 | labeled_plot(units='cm', height=25, width=25) 31 | 32 | plots$silhouette <- labeled_plot(plot_silhouette(dms, A:Y, 'cosine'), 33 | units='cm', height = n_clusters*0.33 + 2, width = 15, limitsize=FALSE) 34 | 35 | 36 | cluster_occupancy <- 
group_by(dms, cluster, wt) %>% 37 | tally() %>% 38 | mutate(rel_n = n / sum(n)) 39 | plots$cluster_occupancy <- filter(cluster_occupancy, !str_ends(cluster, '0')) %>% # NOTE(review): str_ends(cluster, '0') also matches names such as 'X10' - confirm only outlier clusters are dropped 40 | ggplot(aes(x = wt, y = cluster, fill = rel_n)) + 41 | geom_raster() + 42 | coord_equal() + 43 | guides(fill = guide_colourbar(title = 'Proportion')) + 44 | scale_fill_distiller(type = 'seq', palette = 'Reds', direction = 1) + 45 | theme(axis.ticks = element_blank(), 46 | axis.title = element_blank(), 47 | panel.grid.major.y = element_blank()) 48 | 49 | ### Save results ### 50 | write_tsv(select(dms, cluster, study, gene, position, wt), 'data/subtypes/all_positions.tsv') 51 | saveRDS(clusters, 'data/subtypes/all_positions.rds') 52 | 53 | root <- 'figures/2_subtypes/all_positions' 54 | dir.create(root, showWarnings = FALSE, recursive = TRUE) # create missing parent directories and stay quiet when the directory already exists 55 | save_plotlist(plots, root) 56 | -------------------------------------------------------------------------------- /bin/figures/figureS29.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Figure S29 (SIFT subtypes) 3 | source('src/config.R') 4 | 5 | dms <- left_join(read_tsv('data/subtypes/final_subtypes.tsv'), 6 | read_tsv('data/subtypes/sift_scores.tsv') %>% rename(sift_cluster = cluster), 7 | by = c('study', 'gene', 'position', 'wt')) %>% 8 | left_join(read_tsv('data/combined_mutational_scans.tsv'), by = c('study', 'gene', 'position', 'wt')) 9 | 10 | dms_dist <- filter(dms, !str_detect(cluster, CLUSTER_OUTLIER_RE)) %>% 11 | group_by(wt, cluster) %>% 12 | summarise_at(vars(A:Y), mean) %>% 13 | group_map(~tibble_to_matrix(., A:Y) %>% 14 | cosine_distance_matrix() %>% 15 | rowMeans() %>% 16 | set_names(.x$cluster)) %>% 17 | unlist() 18 | 19 | sift_dist <- filter(dms, !str_detect(sift_cluster, CLUSTER_OUTLIER_RE)) %>% 20 | group_by(wt, sift_cluster) %>% 21 | summarise_at(vars(starts_with('log10_sift')), mean) %>% 22 | group_map(~tibble_to_matrix(., starts_with('log10_sift')) %>% 23 | cosine_distance_matrix() %>% 24 | rowMeans() %>% 
25 | set_names(.x$sift_cluster)) %>% 26 | unlist() 27 | 28 | cosine <- tibble(type=c(rep('dms', length(dms_dist)), rep('sift', length(sift_dist))), 29 | cluster=c(names(dms_dist), names(sift_dist)), 30 | cosine=c(dms_dist, sift_dist) %>% unname()) 31 | 32 | figure <- ggplot(cosine, aes(x = type, y = cosine, fill = type)) + 33 | geom_boxplot(show.legend = FALSE) + 34 | stat_compare_means(paired = FALSE, comparisons = list(c('dms', 'sift'))) + 35 | scale_x_discrete(name = 'Profiles used for clustering', labels = c(dms='ER', sift='log10SIFT4G')) + 36 | scale_y_continuous(name = 'Mean Cosine Distance', limits = c(0, 0.6)) + 37 | scale_fill_manual(values = c(dms='firebrick2', sift='cornflowerblue')) + 38 | theme(axis.text.x = element_markdown(), 39 | axis.ticks.x = element_blank()) 40 | 41 | ggsave('figures/4_figures/figureS29.pdf', figure, width = 120, height = 120, units = 'mm') 42 | ggsave('figures/4_figures/figureS29.png', figure, width = 120, height = 120, units = 'mm') 43 | ggsave('figures/4_figures/figureS29.tiff', figure, width = 120, height = 120, units = 'mm') 44 | ggsave('figures/4_figures/figureS29.eps', figure, width = 120, height = 120, units = 'mm', device=cairo_ps, fallback_resolution = 600) 45 | 46 | -------------------------------------------------------------------------------- /data/studies/dorrity_2018_ste12/standardise_dorrity_2018_ste12.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Dorrity et al. 
2018 (STE12) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/dorrity_2018_ste12/dorrity_2018_ste12.yaml') 8 | mating_data <- read_xlsx('data/studies/dorrity_2018_ste12/raw/pnas.1805882115.sd01.xlsx') %>% 9 | mutate(mut = sapply(seqID, process_split_seqid, USE.NAMES = FALSE), 10 | nmut = str_count(mut, ',') + 1, 11 | mating_avg = rowMeans(select(., mating_30C_rep1, mating_30C_rep2, mating_30C_rep3), na.rm = TRUE) %>% replace_na(NA)) %>% 12 | select(mut, nmut, mating_avg, starts_with('mating_30C_')) 13 | 14 | invasion_data <- read_xlsx('data/studies/dorrity_2018_ste12/raw/pnas.1805882115.sd02.xlsx') %>% 15 | mutate(mut = sapply(seqID, process_split_seqid, USE.NAMES = FALSE), 16 | nmut = str_count(mut, ',') + 1, 17 | invasion_avg = rowMeans(select(., invasion_30C_rep1, invasion_30C_rep2, invasion_30C_rep3), na.rm = TRUE) %>% replace_na(NA)) %>% 18 | select(mut, nmut, invasion_avg, starts_with('invasion_30C_')) 19 | 20 | dm_data <- full_join(mating_data, invasion_data, by = c('mut', 'nmut')) %>% 21 | 22 | # Take worst of two scores (both are essential functions) 23 | mutate(raw_score = pmin(mating_avg, invasion_avg, na.rm = TRUE)) %>% 24 | select(mut, nmut, raw_score) %>% 25 | separate(mut, into = str_c('mut', 1:max(.$nmut)), sep=',', fill = 'right') %>% 26 | pivot_longer(starts_with('mut'), names_to = 'tmp', values_to = 'mut') %>% 27 | select(-tmp) %>% 28 | drop_na(mut) %>% 29 | tidyr::extract(mut, into = c('position', 'mut'), '([0-9]*)([A-Z*])', convert = TRUE) %>% 30 | mutate(position = position + 140, 31 | wt = str_split(meta$seq, '')[[1]][position]) %>% 32 | 33 | # Take value from a single variant if possible, average over multiple mutations otherwise (<=4 gives 97% coverage without using very heavily mutated seqs) 34 | group_by(position, wt, mut) %>% 35 | summarise(raw_score = ifelse(any(nmut == 1), mean(raw_score[nmut == 1], na.rm = TRUE), mean(raw_score[nmut <= 4], na.rm = 
TRUE))) %>% 36 | ungroup() %>% 37 | mutate(transformed_score = raw_score, 38 | score = normalise_score(transformed_score), 39 | class = get_variant_class(wt, mut)) %>% 40 | drop_na(position, raw_score) # Some muts arent found in seqs with <= 4 variants 41 | 42 | # Save output 43 | standardise_study(dm_data, meta$study, meta$transform) 44 | -------------------------------------------------------------------------------- /data/studies/giacomelli_2018_tp53/standardise_giacomelli_2018_tp53.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Giacomelli et al. (TP53) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/giacomelli_2018_tp53/giacomelli_2018_tp53.yaml') 8 | dm_data <- read_xlsx('data/studies/giacomelli_2018_tp53/raw/41588_2018_204_MOESM5_ESM.xlsx', skip=1) %>% 9 | rename_all(str_to_lower) %>% 10 | rename(wt = wt_aa, mut = vt_aa) %>% 11 | mutate(mut = if_else(mut == 'Z', '*', mut), 12 | wt = if_else(wt == 'Z', '*', wt)) %>% 13 | filter(a549_p53wt_early_time_point_experiment_1 > 0, 14 | a549_p53wt_early_time_point_experiment_2 > 0) %>% 15 | 16 | # Transform to frequencies with pseudocount 17 | mutate_at(vars(starts_with('a549')), .funs = ~(. + min(.[. 
> 0], na.rm = TRUE))/sum(., na.rm = TRUE)) %>% 18 | 19 | # Extract columns for the combination of states 20 | pivot_longer(starts_with('a549'), names_to = 'id', values_to = 'count') %>% 21 | tidyr::extract(id, into = c('p53', 'state', 'experiment'), "a549_p53(wt|null)_(early_time_point|nutlin-3|etoposide)_experiment_([12])") %>% 22 | pivot_wider(names_from = state, values_from = count) %>% 23 | rename(nutlin3 = `nutlin-3`, initial_freq = early_time_point) %>% 24 | pivot_longer(c('nutlin3', 'etoposide'), names_to = 'drug', values_to = 'freq') %>% 25 | drop_na(freq) %>% 26 | 27 | # Determine ER and fitness for each combination 28 | mutate(er = freq / initial_freq) %>% 29 | select(position, wt, mut, wt_codon, vt_codon, variant_group, p53, experiment, drug, initial_freq, freq, er) %>% 30 | group_by(p53, experiment, drug) %>% 31 | mutate(fitness = log2(er / mean(er[variant_group == 'BackboneWt'], na.rm = TRUE))) %>% 32 | 33 | # Average codons and then experiments 34 | group_by(position, wt, mut, p53, experiment, drug) %>% 35 | summarise(raw_score = weighted.mean(fitness, initial_freq, na.rm = TRUE)) %>% 36 | group_by(position, wt, mut, p53, drug) %>% 37 | summarise(raw_score = mean(raw_score, na.rm = TRUE)) %>% 38 | ungroup() %>% 39 | 40 | # Select appropriate experiment (p53 NULL, Etoposide selects for funcional p53), others test other functions that could possibly be integrated carefully 41 | filter(p53 == 'null', drug == 'etoposide') %>% 42 | 43 | mutate(transformed_score = raw_score, 44 | score = normalise_score(transformed_score), 45 | class = get_variant_class(wt, mut)) %>% 46 | select(position, wt, mut, score, transformed_score, raw_score, class) 47 | 48 | 49 | # Save output 50 | standardise_study(dm_data, meta$study, meta$transform) 51 | -------------------------------------------------------------------------------- /bin/analysis/0_data/sift_correlation.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
Rscript 2 | # Calculate the correlation between study scores and SIFT results 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | sift_dir <- 'data/sift/' 7 | study_dirs <- dir('data/studies', full.names = TRUE) 8 | 9 | dms <- lapply(study_dirs, import_study, fields = c('gene')) %>% 10 | bind_rows() 11 | 12 | sift <- sapply(unique(dms$gene), import_sift, simplify = FALSE) %>% 13 | bind_rows(.id = 'gene') 14 | 15 | dms <- left_join(dms, sift, by = c('gene', 'position', 'wt', 'mut')) 16 | 17 | sift_correlations <- bind_rows(group_by(dms, study) %>% 18 | do(tidy(cor.test(.$score, .$log10_sift, method = 'kendall'))), 19 | group_by(dms, study) %>% 20 | do(tidy(cor.test(.$score, .$log10_sift, method = 'pearson')))) %>% 21 | mutate(study_pretty = sapply(study, format_study, USE.NAMES = FALSE), 22 | p_cat = pretty_p_values(p.value, breaks = c(1e-48, 1e-12, 1e-06, 1e-3, 0.01, 0.05))) 23 | 24 | filtered <- sapply(unique(sift_correlations$study), function(x){ 25 | y <- read_yaml(str_c('data/studies/', x, '/', x, '.yaml')) 26 | return(ifelse(y$qc$filter, 'red', 'black')) 27 | }) 28 | names(filtered) <- sapply(names(filtered), format_study, USE.NAMES = FALSE) 29 | 30 | p_sift_cor <- ungroup(sift_correlations) %>% 31 | mutate(study_pretty = add_markdown(study_pretty, colour = filtered)) %>% 32 | ggplot(aes(x = study_pretty, y = estimate, fill = p_cat)) + 33 | facet_wrap(~method, ncol = 1) + 34 | geom_col(position = position_dodge()) + 35 | geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=0.5, position = position_dodge(0.9)) + 36 | geom_hline(yintercept = 0) + 37 | ggtitle('Correlation between Normalised Score and log10(SIFT)') + 38 | xlab('') + 39 | ylab('Correlation') + 40 | scale_fill_viridis_d(guide=guide_legend(title='p-value'), drop=FALSE) + 41 | theme(axis.text.x = element_markdown(angle = 90, hjust = 1, vjust = 0.5)) 42 | ggsave('figures/0_data/sift_score_correlation.pdf', p_sift_cor, width = 20, height = 20, units = 'cm') 43 | 44 | 
p_sift_density <- ggplot(dms, aes(x = score, y = jitter(log10_sift, 0.1), colour = class)) + 45 | facet_wrap(~study, labeller = as_labeller(sapply(unique(dms$study), format_study)), ncol = 6, scales = 'free_x') + 46 | geom_density2d(data = filter(dms, class == 'Missense')) + 47 | geom_point(data = filter(dms, !class == 'Missense')) + 48 | scale_colour_manual(values = MUT_CLASS_COLOURS) + 49 | labs(x = 'Score', y = 'log10(SIFT)') 50 | ggsave('figures/0_data/sift_score_density.pdf', p_sift_density, width = 35, height = 35, units = 'cm') 51 | -------------------------------------------------------------------------------- /bin/figures/figureS4.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Produce figure S4 (Consistency of positions in UMAP space) 3 | source('src/config.R') 4 | source('src/subtype_characterisation.R') 5 | 6 | dms <- read_tsv('data/combined_mutational_scans.tsv') 7 | dms <- left_join(dms, count(dms, gene, position), by = c('gene', 'position')) 8 | 9 | repeated <- filter(dms, n > 1) %>% 10 | select(study, gene, position, umap1, umap2) %>% 11 | group_by(gene, position) %>% 12 | summarise(umap1_1 = umap1[1], 13 | umap1_2 = umap1[2], 14 | umap2_1 = umap2[1], 15 | umap2_2 = umap2[2]) 16 | 17 | p_umap <- ggplot() + 18 | geom_point(data = dms, mapping = aes(x = umap1, y = umap2), colour = 'grey90', shape = 20, size = 0.8) + 19 | geom_segment(data = repeated, aes(x = umap1_1, y = umap2_1, xend = umap1_2, yend = umap2_2)) + 20 | geom_point(data = filter(dms, n > 1), mapping = aes(x = umap1, y = umap2, colour = gene)) + 21 | scale_colour_brewer(type = 'qual', palette = 'Set1') + 22 | guides(colour = guide_legend(title = '')) + 23 | labs(x = 'UMAP1', y = 'UMAP2') 24 | 25 | # Distribution of distances n > 1/n == 1 26 | distances <- mutate(dms, gene_pos = str_c(gene, '_', position)) %>% 27 | tibble_to_matrix(umap1, umap2, row_names = 'gene_pos') %>% 28 | dist() %>% 29 | as.matrix() 30 | 
distances[upper.tri(distances, diag = TRUE)] <- NA 31 | distances <- as_tibble(distances, rownames = 'gene_pos1') %>% 32 | pivot_longer(-gene_pos1, names_to = 'gene_pos2', values_to = 'dist') %>% 33 | drop_na() %>% 34 | mutate(rep = ifelse(gene_pos1 == gene_pos2, 'Repeated Position', 'Background')) 35 | 36 | # t.test 37 | # wilcox.test(x = filter(distances, rep == 'Repeated Position')$dist, y = filter(distances, rep == 'Background')$dist, alternative = 'less') 38 | 39 | p_dists <- ggplot(distances, aes(x = dist, y = ..scaled.., colour = rep)) + 40 | stat_density(geom = 'line', position = 'identity') + 41 | labs(x = 'UMAP Space Euclidean Distance', y = 'Scaled Density') + 42 | scale_colour_brewer(type = 'qual', palette = 'Dark2') + 43 | guides(colour = guide_legend(title = '')) 44 | 45 | figure <- multi_panel_figure(width = 183, height = c(89, 89), unit = 'mm', columns = 1) %>% 46 | fill_panel(p_umap, row = 1, column = 1) %>% 47 | fill_panel(p_dists, row = 2, column = 1) 48 | ggsave('figures/4_figures/figureS4.pdf', figure, width = 183, height = 185, units = 'mm') 49 | ggsave('figures/4_figures/figureS4.png', figure, width = 183, height = 185, units = 'mm') 50 | ggsave('figures/4_figures/figureS4.tiff', figure, width = 183, height = 185, units = 'mm') 51 | ggsave('figures/4_figures/figureS4.eps', figure, width = 183, height = 185, units = 'mm', device=cairo_ps, fallback_resolution = 600) 52 | 53 | -------------------------------------------------------------------------------- /bin/analysis/0_data/validate_araya_2012_yap1.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Validate multi mutant combination method for Araya et al. 
2012 (YAP1) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/araya_2012_yap1/araya_2012_yap1.yaml') 8 | dm_data <- read_csv('data/studies/araya_2012_yap1/raw/urn_mavedb_00000002-a-2_scores.csv', skip = 4) %>% 9 | select(hgvs_pro, raw_score = score) %>% 10 | mutate(hgvs_pro = if_else(str_ends(hgvs_pro, ']'), str_sub(hgvs_pro, start = 4, end = -2), str_sub(hgvs_pro, start = 3)), 11 | n_mut = str_count(hgvs_pro, ';') + 1) %>% 12 | separate(hgvs_pro, str_c('mut', 1:max(.$n_mut)), sep = ';', fill = 'right') %>% 13 | pivot_longer(cols = starts_with('mut'), values_to = 'mut') %>% 14 | drop_na(mut) %>% 15 | select(-name) %>% 16 | tidyr::extract(mut, into = c('wt', 'position', 'mut'), "([A-Za-z]{3})([0-9]+)([A-Za-z]{3})", convert = TRUE) %>% 17 | mutate(wt = AA_THREE_2_ONE[wt], mut = AA_THREE_2_ONE[mut], position = position + 169) %>% 18 | mutate(transformed_score = raw_score) %>% 19 | group_by(position, wt, mut) 20 | 21 | singles <- summarise(dm_data, single_score := ifelse(any(n_mut == 1), mean(transformed_score[n_mut == 1], na.rm = TRUE), NA)) %>% 22 | ungroup() 23 | single_frac <- sum(!is.na(singles$single_score))/length(singles$single_score) 24 | 25 | mut_count_data <- lapply(2:9, function(x){ 26 | summarise(dm_data, !!str_c('score_', x) := ifelse(any(n_mut <= x), mean(transformed_score[n_mut <= x], na.rm = TRUE), NA)) %>% 27 | ungroup() %>% 28 | select(!!str_c('score_', x)) 29 | }) %>% 30 | bind_cols(singles, .) 
%>% 31 | pivot_longer(starts_with('score_'), names_to = 'n_mut', values_to = 'multi_score', names_prefix = 'score_', names_ptypes = list(n_mut=integer())) %>% 32 | group_by(n_mut) %>% 33 | mutate(frac = sum(!is.na(multi_score))/length(multi_score)) %>% 34 | ungroup() %>% 35 | mutate(n_mut = as.integer(n_mut)) 36 | 37 | p_sing_multi <- ggplot(mut_count_data, aes(x = single_score, y = multi_score)) + 38 | facet_wrap(~n_mut, ncol = 5) + 39 | geom_point(colour = 'cornflowerblue') + 40 | geom_abline(slope = 1, linetype='dashed') + 41 | geom_text(x = -5, y = 1, aes(label = str_c('frac = ', signif(frac, digits = 4))), hjust = 0) + 42 | labs(x = 'log2(Score) (Single Variant)', 43 | y = 'log2(Score) (Mean Over Multiple Variants)', 44 | title = 'Accuracy of multi-variant averaging for scoring in Araya et al. 2012 (YAP1)', 45 | subtitle = str_c('Fraction of variants with individual measures: ', single_frac)) 46 | ggsave('figures/0_data/per_study/araya_2012_yap1/multi_mut_validation.pdf', p_sing_multi, units = 'cm', height = 15, width = 25) 47 | -------------------------------------------------------------------------------- /src/dimensionality_reduction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Functions for dimensionality reduction analysis 3 | 4 | ### tSNE/UMAP Plots ### 5 | tsne_umap_plots <- function(tbl, x, y, name){ 6 | x <- enquo(x) 7 | y <- enquo(y) 8 | p <- list(study = plot_dim_red_study(tbl, !!x, !!y), 9 | aa = plot_dim_red_aa(tbl, !!x, !!y), 10 | hydrophobicity = plot_dim_red_hydrophobicity(tbl, !!x, !!y), 11 | surface_accessibility = plot_dim_red_surface_accessibility(tbl, !!x, !!y), 12 | mean_er = plot_dim_red_mean_er(tbl, !!x, !!y)) 13 | names(p) <- str_c(name, '_', names(p)) 14 | return(p) 15 | } 16 | 17 | plot_dim_red_study <- function(tbl, x, y){ 18 | x <- enquo(x) 19 | y <- enquo(y) 20 | (ggplot(tbl, aes(x=!!x, y=!!y, colour=study)) + 21 | geom_point() + 22 | facet_wrap(~study, 
labeller = as_labeller(sapply(unique(tbl$study), format_study, max_width=18)), nrow = 4) + # labels are built from the tbl argument, not a global dms_wide, so they always match the facetted studies 23 | guides(colour='none')) %>% 24 | labeled_plot(units='cm', height = 20, width = 32) 25 | } 26 | 27 | plot_dim_red_aa <- function(tbl, x, y){ 28 | x <- enquo(x) 29 | y <- enquo(y) 30 | (mutate(tbl, aa_class = AA_REDUCED_HASH[wt]) %>% 31 | ggplot(aes(x=!!x, y=!!y, colour=wt)) + 32 | geom_point() + 33 | facet_wrap(~aa_class) + 34 | scale_colour_manual(values = AA_COLOURS) + 35 | guides(colour = guide_legend(title='AA')) + 36 | theme(panel.grid.major.x = element_line(colour = 'gray', linetype = 'dotted'))) %>% 37 | labeled_plot(units='cm', height = 25, width = 25) 38 | } 39 | 40 | plot_dim_red_hydrophobicity <- function(tbl, x, y){ 41 | x <- enquo(x) 42 | y <- enquo(y) 43 | (ggplot(tbl, aes(x=!!x, y=!!y, colour=hydrophobicity)) + 44 | geom_point() + 45 | scale_colour_gradient2() + 46 | guides(colour = guide_colourbar(title = 'Hydrophobicity'))) %>% 47 | labeled_plot(units='cm', height = 10, width = 15) 48 | } 49 | 50 | plot_dim_red_surface_accessibility <- function(tbl, x, y){ 51 | x <- enquo(x) 52 | y <- enquo(y) 53 | (drop_na(tbl, all_atom_abs) %>% 54 | ggplot(aes(x=!!x, y=!!y, colour=all_atom_abs)) + 55 | geom_point() + 56 | scale_colour_viridis_c() + 57 | guides(colour = guide_colourbar(title = 'Surface Accessibility\n(All Atom Abs)'))) %>% 58 | labeled_plot(units='cm', height = 10, width = 15) 59 | } 60 | 61 | plot_dim_red_mean_er <- function(tbl, x, y){ 62 | x <- enquo(x) 63 | y <- enquo(y) 64 | (drop_na(tbl, all_atom_abs) %>% # NOTE(review): rows are filtered on all_atom_abs although the colour is mean_score - confirm this is intended 65 | ggplot(aes(x=!!x, y=!!y, colour=mean_score)) + 66 | geom_point() + 67 | scale_colour_gradient2() + 68 | guides(colour = guide_colourbar(title = 'Mean Norm. 
ER'))) %>% 69 | labeled_plot(units='cm', height = 10, width = 15) 70 | } 71 | 72 | -------------------------------------------------------------------------------- /bin/analysis/0_data/validate_sarkisyan_2016_gfp.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Validate the strategy used for Sarkisyan et al. 2016 (GFP) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | dir.create('figures/0_data/per_study/sarkisyan_2016_gfp') 7 | 8 | # Import and process data 9 | raw_data <- read_tsv('data/studies/sarkisyan_2016_gfp/raw/amino_acid_genotypes_to_brightness.tsv', skip = 1, 10 | col_names = c('mut', 'barcodes', 'median_brightness', 'std')) %>% 11 | mutate(n_mut = str_count(mut, ':') + 1) 12 | wt_brightness <- filter(raw_data, is.na(mut)) %>% pull(median_brightness) 13 | 14 | dm_data <- separate(raw_data, mut, into = str_c('mut', 1:15), sep = ':', fill = 'right') %>% 15 | pivot_longer(cols = starts_with('mut'), names_to = 'n', names_prefix = 'mut', values_to = 'mut') %>% 16 | drop_na(mut) %>% 17 | select(-n, -barcodes, -std) %>% 18 | tidyr::extract(mut, into = c('wt', 'position', 'mut'), 'S([A-Z])([0-9]+)([A-Z])', convert=TRUE) %>% 19 | arrange(position, mut) %>% 20 | mutate(single_score = ifelse(n_mut == 1, median_brightness, NA)) %>% 21 | group_by(position, wt, mut) 22 | 23 | singles <- summarise(dm_data, single_score := ifelse(any(n_mut == 1), mean(median_brightness[n_mut == 1], na.rm = TRUE), NA)) %>% 24 | ungroup() 25 | single_frac <- sum(!is.na(singles$single_score))/length(singles$single_score) 26 | 27 | mut_count_data <- lapply(2:15, function(x){ 28 | summarise(dm_data, !!str_c('score_', x) := ifelse(any(n_mut <= x), mean(median_brightness[n_mut <= x], na.rm = TRUE), NA)) %>% 29 | ungroup() %>% 30 | select(!!str_c('score_', x)) 31 | }) %>% 32 | bind_cols(singles, .) 
%>% 33 | pivot_longer(starts_with('score_'), names_to = 'n_mut', values_to = 'multi_score', names_prefix = 'score_', names_ptypes = list(n_mut=integer())) %>% 34 | group_by(n_mut) %>% 35 | mutate(frac = sum(!is.na(multi_score))/length(multi_score)) %>% 36 | ungroup() %>% 37 | mutate(n_mut = as.integer(n_mut)) 38 | 39 | p_sing_multi <- ggplot(mut_count_data, aes(x = single_score, y = multi_score)) + 40 | facet_wrap(~n_mut, ncol = 5) + 41 | geom_point(colour = 'cornflowerblue') + 42 | geom_abline(slope = 1, linetype='dashed') + 43 | geom_text(x = 1.5, y = 3.5, aes(label = str_c('frac = ', signif(frac, digits = 4))), hjust = 0) + 44 | labs(x = 'Median Brightness (Single Variant)', 45 | y = 'Median Brightness (Mean Over Multiple Variants)', 46 | title = 'Accuracy of multi-variant averaging for scoring in Sarkisyan et al. 2016 (GFP)', 47 | subtitle = str_c('Fraction of variants with individual measures: ', single_frac)) 48 | ggsave('figures/0_data/per_study/sarkisyan_2016_gfp/multi_mut_validation.pdf', p_sing_multi, units = 'cm', height = 15, width = 25) 49 | -------------------------------------------------------------------------------- /bin/analysis/0_data/validate_starita_2013_ube4b.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Validate multiple mutation method for Starita et al. 
2013 (UBE4B) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | dir.create('figures/0_data/per_study/starita_2013_ube4b/') 7 | 8 | # Import and process data 9 | meta <- read_yaml('data/studies/starita_2013_ube4b/starita_2013_ube4b.yaml') 10 | dm_data <- read_xlsx('data/studies/starita_2013_ube4b/raw/sd01.xlsx', na = c('NA', '')) %>% 11 | filter(!seqID == 'NA-NA') %>% # Filter WT 12 | rename(raw_score = log2_ratio) %>% 13 | separate(seqID, into = c('position', 'mut'), sep='-') %>% 14 | select(-nscor_log2_ratio) %>% 15 | mutate(n_mut = sapply(position, function(x){str_count(x, ',') + 1})) %>% 16 | separate(mut, str_c('mut', 1:max(.$n_mut)), sep = ',', fill = 'right') %>% 17 | separate(position, str_c('position', 1:max(.$n_mut)), sep = ',', fill = 'right') %>% 18 | pivot_longer(starts_with('position'), names_to = 'pos_num', names_prefix = 'position', values_to = 'position') %>% 19 | drop_na(position) %>% 20 | pivot_longer(starts_with('mut'), names_to = 'mut_num', names_prefix = 'mut', values_to = 'mut') %>% 21 | drop_na(mut) %>% 22 | filter(pos_num == mut_num) %>% 23 | select(-pos_num, -mut_num) %>% 24 | group_by(position, mut) 25 | 26 | singles <- summarise(dm_data, single_score := ifelse(any(n_mut == 1), mean(raw_score[n_mut == 1], na.rm = TRUE), NA)) %>% 27 | ungroup() 28 | single_frac <- sum(!is.na(singles$single_score))/length(singles$single_score) 29 | 30 | mut_count_data <- lapply(2:9, function(x){ 31 | summarise(dm_data, !!str_c('score_', x) := ifelse(any(n_mut <= x), mean(raw_score[n_mut <= x], na.rm = TRUE), NA)) %>% 32 | ungroup() %>% 33 | select(!!str_c('score_', x)) 34 | }) %>% 35 | bind_cols(singles, .) 
%>% 36 | pivot_longer(starts_with('score_'), names_to = 'n_mut', values_to = 'multi_score', names_prefix = 'score_', names_ptypes = list(n_mut=integer())) %>% 37 | group_by(n_mut) %>% 38 | mutate(frac = sum(!is.na(multi_score))/length(multi_score)) %>% 39 | ungroup() %>% 40 | mutate(n_mut = as.integer(n_mut)) 41 | 42 | p_sing_multi <- ggplot(mut_count_data, aes(x = single_score, y = multi_score)) + 43 | facet_wrap(~n_mut, ncol = 5) + 44 | geom_point(colour = 'cornflowerblue') + 45 | geom_abline(slope = 1, linetype='dashed') + 46 | geom_text(x = -4, y = 3, aes(label = str_c('frac = ', signif(frac, digits = 4))), hjust = 0) + 47 | labs(x = 'log2(Score) (Single Variant)', 48 | y = 'log2(Score) (Mean Over Multiple Variants)', 49 | title = 'Accuracy of multi-variant averaging for scoring in Starita et al. 2013 (UBE4B)', 50 | subtitle = str_c('Fraction of variants with individual measures: ', single_frac)) 51 | ggsave('figures/0_data/per_study/starita_2013_ube4b/multi_mut_validation.pdf', p_sing_multi, units = 'cm', height = 15, width = 25) 52 | -------------------------------------------------------------------------------- /bin/pipeline/sequence_statistics.smk: -------------------------------------------------------------------------------- 1 | """ 2 | Rules for generating statistics from protein sequences, including the Porter5 and SIFT pipeline 3 | 4 | Expects global variables: 5 | - GENES: dict mapping genes to lists of the studies assessing them 6 | """ 7 | 8 | rule make_gene_fasta: 9 | """ 10 | Generate a FASTA file for a gene from the sequence in the studies YAML configs 11 | """ 12 | # marked ancient as seq shouldn't change when .yaml's do 13 | # force re-run if neccessary by deleting relevant .fa 14 | input: 15 | lambda wildcards: [ancient(f'data/studies/{i}/{i}.yaml') for i 16 | in GENES[wildcards.gene]] 17 | 18 | output: 19 | "data/fasta/{gene}.fa" 20 | 21 | log: 22 | "logs/make_gene_fasta/{gene}.log" 23 | 24 | shell: 25 | "python 
bin/data_processing/make_gene_fasta.py {input} > {output} 2> {log}" 26 | 27 | rule sift4g: 28 | """ 29 | Run SIFT4G on a FASTA file, assessing all possible variants. 30 | Note: for this project I have been using a modified version of SIFT4G that 31 | outputs to 4.d.p rather than 2. 32 | """ 33 | input: 34 | fa = "data/fasta/{gene}.fa", 35 | db = config['sift']['uniref90_fa_path'] 36 | 37 | output: 38 | "data/sift/{gene}.SIFTprediction" 39 | 40 | log: 41 | 'logs/sift4g/{gene}.log' 42 | 43 | resources: 44 | mem_mb = 8000 45 | 46 | shell: 47 | "sift4g -q {input.fa} -d {input.db} --out data/sift 2> {log}" 48 | 49 | rule all_sift_predictions: 50 | """ 51 | Produce SIFT predictions for all genes 52 | """ 53 | input: 54 | expand('data/sift/{gene}.SIFTprediction', gene=GENES.keys()) 55 | 56 | rule porter5: 57 | """ 58 | Run Porter5 on a FASTA file, generating secondary structure predictions for each residue. 59 | """ 60 | input: 61 | "data/fasta/{gene}.fa" 62 | 63 | output: 64 | ss3="data/porter5/{gene}.ss3", 65 | ss8="data/porter5/{gene}.ss8" 66 | 67 | log: 68 | "logs/porter5/{gene}.log" 69 | 70 | threads: 8 # also passed to Porter5 via --cpu in the shell command below 71 | 72 | resources: 73 | mem_mb = 30000 74 | 75 | shell: 76 | f""" 77 | python {config['porter5']['path']} -i {{input}} --cpu {{threads}} --tmp &> {{log}} 78 | cat data/fasta/{{wildcards.gene}}.fa.log >> {{log}} 2>&1 79 | mv data/fasta/{{wildcards.gene}}.fa.ss3 {{output.ss3}} >> {{log}} 2>&1 80 | mv data/fasta/{{wildcards.gene}}.fa.ss8 {{output.ss8}} >> {{log}} 2>&1 81 | rm data/fasta/{{wildcards.gene}}.fa.* >> {{log}} 2>&1 82 | rm data/fasta/{{wildcards.gene}}.hhr >> {{log}} 2>&1 83 | """ 84 | 85 | rule all_porter5_predictions: 86 | """ 87 | Produce Porter5 predictions for all genes 88 | """ 89 | input: 90 | expand('data/porter5/{gene}.ss8', gene=GENES.keys()), 91 | expand('data/porter5/{gene}.ss3', gene=GENES.keys()) 92 | -------------------------------------------------------------------------------- /bin/analysis/0_data/check_normalisation.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Check the validity of the normalisation approach 3 | source('src/config.R') 4 | 5 | dms <- read_tsv('data/long_combined_mutational_scans.tsv') 6 | plots <- list() 7 | 8 | ### Check normalisation 10% with an without nonsense ### 9 | quantiles <- filter(dms, !is.na(score)) %>% 10 | group_by(study) %>% 11 | summarise(with = quantile(transformed_score, 0.1, na.rm = TRUE), 12 | without = quantile(transformed_score[!mut == "*"], 0.1, na.rm = TRUE), 13 | nonsense = any(mut == "*")) 14 | 15 | plots$norm_quantiles <- ggplot(filter(quantiles, nonsense), aes(x = with, y = without)) + 16 | geom_point() + 17 | geom_abline(slope = 1, intercept = 0, linetype = "dashed") + 18 | labs(x = "With Nonsense", y = "Without Nonsense") 19 | 20 | ### Check distribution of bottom 10% scores predictor metrics ### 21 | dms_scores <- select(dms, study, score, transformed_score, sift, log10_sift, total_energy) %>% 22 | left_join(quantiles, by = "study") %>% 23 | mutate(with10 = transformed_score < with, 24 | without10 = transformed_score < without, 25 | neutral = abs(score) < 0.3) 26 | 27 | score_groups <- filter(dms_scores, nonsense, with10 | without10 | neutral) %>% 28 | pivot_longer(c(log10_sift, total_energy), names_to = "metric", values_to = "value") %>% 29 | pivot_longer(c(with10, without10, neutral), names_to = "class", values_to = "member") %>% 30 | filter(member, !is.na(value)) 31 | 32 | plots$score_group_density <- ggplot(score_groups, aes(x = value, y = ..scaled.., colour = class)) + 33 | stat_density(geom = "line", position = "identity") + 34 | facet_wrap(~metric, nrow = 1, scales = "free_x") 35 | 36 | plots$score_group_box <- ggplot(score_groups, aes(x = class, y = value, fill = class)) + 37 | geom_boxplot() + 38 | facet_wrap(~metric, nrow = 1, scales = "free_y") + 39 | stat_compare_means(comparisons = list(c("with10", "without10"), c("with10", "neutral"), c("without10", 
"neutral"))) 40 | 41 | ### SIFT4G Scores across studies 42 | 43 | plots$per_study_sift_dist <- ggplot(filter(dms_scores, with10), aes(x = study, y = log10_sift)) + 44 | geom_boxplot(fill = "cornflowerblue") + 45 | geom_hline(yintercept = log10(0.05)) + 46 | coord_flip() + 47 | labs(y = expression("log"[10]~"SIFT4G"), x = "") + 48 | theme(panel.grid.major.y = element_blank(), 49 | panel.grid.major.x = element_line(linetype = "dotted", colour = "grey")) 50 | 51 | plots$per_study_foldx_dist <- ggplot(filter(dms_scores, with10), aes(x = study, y = total_energy)) + 52 | geom_boxplot(fill = "cornflowerblue") + 53 | geom_hline(yintercept = c(-1, 1)) + 54 | coord_flip() + 55 | labs(y = expression(Delta*Delta*"G"), x = "") + 56 | theme(panel.grid.major.y = element_blank(), 57 | panel.grid.major.x = element_line(linetype = "dotted", colour = "grey")) 58 | 59 | ### Save plots ### 60 | save_plotlist(plots, "figures/0_data/normalisation", overwrite = "all") 61 | -------------------------------------------------------------------------------- /bin/data_processing/foldx_variants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Template Script 4 | """ 5 | import sys 6 | import argparse 7 | from pathlib import Path 8 | 9 | from Bio.PDB import PDBParser 10 | from Bio.SeqUtils import seq1 11 | 12 | from subtypes_utils import import_sections 13 | 14 | AA_ALPHABET = 'ACDEFGHIKLMNPQRSTVWY' 15 | 16 | def main(args): 17 | """Main script""" 18 | pdb_parser = PDBParser() 19 | 20 | pdb_name = Path(args.pdb).stem 21 | # deal with FoldX repaired PDBs 22 | if pdb_name.endswith('_Repair'): 23 | pdb_name = pdb_name.replace('_Repair', '') 24 | 25 | structure = pdb_parser.get_structure(pdb_name, args.pdb) 26 | 27 | sections = import_sections(args.yaml, pdb_name) 28 | 29 | variants = [] 30 | if sections is not None: 31 | for section in sections: 32 | filter_region = 'region' in section 33 | for residue in 
structure[0][section['chain']]: 34 | if not residue.id[0] == ' ': 35 | continue # Filter HETATMs 36 | 37 | position = int(residue.id[1]) 38 | amino_acid = seq1(residue.get_resname()) 39 | 40 | if not amino_acid in AA_ALPHABET: 41 | # Filter non-standard AAs, required when processing 42 | # foldx repaired PDBs as they turn HETATMs to regular ATOMs 43 | # for regular proteins 44 | continue 45 | 46 | if (filter_region and 47 | (position > section['region'][1] or 48 | position < section['region'][0])): 49 | continue 50 | 51 | variants.extend([f"{amino_acid}{section['chain']}{position}{x}" for 52 | x in AA_ALPHABET if not x == amino_acid]) 53 | else: 54 | for chain in structure[0]: 55 | for residue in chain: 56 | if not residue.id[0] == ' ': 57 | continue # Filter HETATMs 58 | 59 | position = int(residue.id[1]) 60 | amino_acid = seq1(residue.get_resname()) 61 | 62 | if not amino_acid in AA_ALPHABET: 63 | continue 64 | 65 | variants.extend([f"{amino_acid}{chain.id}{position}{x}" for 66 | x in AA_ALPHABET if not x == amino_acid]) 67 | 68 | print(*variants, sep=';\n', end=';\n', file=sys.stdout) 69 | 70 | def parse_args(): 71 | """Process input arguments""" 72 | parser = argparse.ArgumentParser(description=__doc__, 73 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 74 | 75 | parser.add_argument('pdb', metavar='P', help="Input PDB") 76 | 77 | parser.add_argument('--yaml', '-y', help="YAML file/raw YAML defining PDB sections to consider") 78 | 79 | return parser.parse_args() 80 | 81 | if __name__ == "__main__": 82 | ARGS = parse_args() 83 | main(ARGS) 84 | -------------------------------------------------------------------------------- /data/studies/melnikov_2014_aph3ii/standardise_melnikov_2014_aph3ii.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # Standardise data from Melnikov et al. 
2014 (APH(3')-II) 3 | source('src/config.R') 4 | source('src/study_standardising.R') 5 | 6 | # Import and process data 7 | meta <- read_yaml('data/studies/melnikov_2014_aph3ii/melnikov_2014_aph3ii.yaml') 8 | 9 | count_files <- grep('\\.aacounts\\.txt', dir('data/studies/melnikov_2014_aph3ii/raw/'), value = TRUE) 10 | count_files <- count_files[!grepl('(S[12]\\_Ami|S3\\_Kan)', count_files)] # Discard bad runs - see 00README.txt from Melnikov et al. data 11 | 12 | counts <- sapply(count_files, read_melnikov_table, simplify = FALSE) %>% 13 | set_names(gsub('(KKA2\\_|\\.aacounts\\.txt)', '', names(.))) # Set names to drug 14 | bkg_counts <- counts[c('Bkg1', 'Bkg2')] 15 | counts <- counts[which(!names(counts) %in% c('Bkg1', 'Bkg2'))] 16 | 17 | # Process data, see bin/0_data/validate_melnikov.R for details + comments 18 | dm_data <- mapply(melnikov_fitness, counts, names(counts), MoreArgs = list(bkg=bkg_counts), SIMPLIFY = FALSE) %>% # Calculate fitness for each count 19 | bind_rows(.id = 'experiment') %>% 20 | 21 | # Extract experiment data, round and library contain the same information (plus round notes which needed a re-test at different MIC, 22 | # which we already accounted for) -> discard round 23 | separate(experiment, c('round', 'drug', 'library'), sep='_') %>% 24 | select(position, wt, mut, score, drug, library) %>% 25 | mutate(rel_conc = 1/as.numeric(str_sub(drug, -1)), drug = str_sub(drug, 1, -3)) %>% 26 | 27 | # Process library pairs - discard datasets where libraries don't agree and filter outlier points, then average L1 & L2 28 | pivot_wider(id_cols = c('position', 'wt', 'mut', 'drug', 'rel_conc'), names_from = library, values_from = score) %>% 29 | mutate(diff = abs(L1 - L2)) %>% 30 | filter(!(drug == 'Ami' & rel_conc == 0.25), !(drug %in% c('G418', 'Ami', 'Kan') & rel_conc == 0.125)) %>% 31 | filter(diff < sd(diff, na.rm = TRUE) * 3) %>% 32 | mutate(score = (L1 + L2)/2) %>% 33 | drop_na(score) %>% 34 | 35 | # Select the best conc for each drug, based on
ER distribution 36 | filter((drug == 'Ami' & rel_conc == 0.5) | 37 | (drug == 'G418' & rel_conc == 0.25) | 38 | (drug == 'Kan' & rel_conc == 0.25) | 39 | (drug == 'Neo' & rel_conc == 0.25) | 40 | (drug == 'Paro' & rel_conc == 0.125) | 41 | (drug == 'Ribo' & rel_conc == 0.125)) %>% 42 | select(drug, position, wt, mut, score) %>% 43 | 44 | # Filter Ami as it doesn't correlate with other drugs, then average 45 | filter(!drug == 'Ami') %>% 46 | group_by(position, wt, mut) %>% 47 | summarise(score = mean(score, na.rm=TRUE)) %>% 48 | ungroup() %>% 49 | mutate(raw_score = score, 50 | transformed_score = raw_score, 51 | score = normalise_score(transformed_score), 52 | class = get_variant_class(wt, mut)) 53 | 54 | # Save output 55 | standardise_study(dm_data, meta$study, meta$transform) 56 | -------------------------------------------------------------------------------- /data/studies/findlay_2018_brca1/findlay_2018_brca1.yaml: -------------------------------------------------------------------------------- 1 | study: 'findlay_2018_brca1' 2 | gene: 'BRCA1' 3 | uniprot_id: 'P38398' 4 | gene_type: 'E3 Ligase' 5 | species: 'H. 
sapiens' 6 | seq: "MDLSALRVEEVQNVINAMQKILECPICLELIKEPVSTKCDHIFCKFCMLKLLNQKKGPSQ\ 7 | CPLCKNDITKRSLQESTRFSQLVEELLKIICAFQLDTGLEYANSYNFAKKENNSPEHLKD\ 8 | EVSIIQSMGYRNRAKRLLQSEPENPSLQETSLSVQLSNLGTVRTLRTKQRIQPQKTSVYI\ 9 | ELGSDSSEDTVNKATYCSVGDQELLQITPQGTRDEISLDSAKKAACEFSETDVTNTEHHQ\ 10 | PSNNDLNTTEKRAAERHPEKYQGSSVSNLHVEPCGTNTHASSLQHENSSLLLTKDRMNVE\ 11 | KAEFCNKSKQPGLARSQHNRWAGSKETCNDRRTPSTEKKVDLNADPLCERKEWNKQKLPC\ 12 | SENPRDTEDVPWITLNSSIQKVNEWFSRSDELLGSDDSHDGESESNAKVADVLDVLNEVD\ 13 | EYSGSSEKIDLLASDPHEALICKSERVHSKSVESNIEDKIFGKTYRKKASLPNLSHVTEN\ 14 | LIIGAFVTEPQIIQERPLTNKLKRKRRPTSGLHPEDFIKKADLAVQKTPEMINQGTNQTE\ 15 | QNGQVMNITNSGHENKTKGDSIQNEKNPNPIESLEKESAFKTKAEPISSSISNMELELNI\ 16 | HNSKAPKKNRLRRKSSTRHIHALELVVSRNLSPPNCTELQIDSCSSSEEIKKKKYNQMPV\ 17 | RHSRNLQLMEGKEPATGAKKSNKPNEQTSKRHDSDTFPELKLTNAPGSFTKCSNTSELKE\ 18 | FVNPSLPREEKEEKLETVKVSNNAEDPKDLMLSGERVLQTERSVESSSISLVPGTDYGTQ\ 19 | ESISLLEVSTLGKAKTEPNKCVSQCAAFENPKGLIHGCSKDNRNDTEGFKYPLGHEVNHS\ 20 | RETSIEMEESELDAQYLQNTFKVSKRQSFAPFSNPGNAEEECATFSAHSGSLKKQSPKVT\ 21 | FECEQKEENQGKNESNIKPVQTVNITAGFPVVGQKDKPVDNAKCSIKGGSRFCLSSQFRG\ 22 | NETGLITPNKHGLLQNPYRIPPLFPIKSFVKTKCKKNLLEENFEEHSMSPEREMGNENIP\ 23 | STVSTISRNNIRENVFKEASSSNINEVGSSTNEVGSSINEIGSSDENIQAELGRNRGPKL\ 24 | NAMLRLGVLQPEVYKQSLPGSNCKHPEIKKQEYEEVVQTVNTDFSPYLISDNLEQPMGSS\ 25 | HASQVCSETPDDLLDDGEIKEDTSFAENDIKESSAVFSKSVQKGELSRSPSPFTHTHLAQ\ 26 | GYRRGAKKLESSEENLSSEDEELPCFQHLLFGKVNNIPSQSTRHSTVATECLSKNTEENL\ 27 | LSLKNSLNDCSNQVILAKASQEHHLSEETKCSASLFSSQCSELEDLTANTNTQDPFLIGS\ 28 | SKQMRHQSESQGVGLSDKELVSDDEERGTGLEENNQEEQSMDSNLGEAASGCESETSVSE\ 29 | DCSGLSSQSDILTTQQRDTMQHNLIKLQQEMAELEAVLEQHGSQPSNSYPSIISDSSALE\ 30 | DLRNPEQSTSEKAVLTSQKSSEYPISQNPEGLSADKFEVSADSSTSKNKEPGVERSSPSK\ 31 | CPSLDDRWYMHSCSGSLQNRNYPSQEELIKVVDVEEQQLEESGPHDLTETSYLPRQDLEG\ 32 | TPYLESGISLFSDDPESDPSEDRAPESARVGNIPSSTSALKVPQLKVAESAQSPAAAHTT\ 33 | DTAGYNAMEESVSREKPELTASTERVNKRMSMVVSGLTPEEFMLVYKFARKHHITLTNLI\ 34 | TEETTHVVMKTDAEFVCERTLKYFLGIAGGKWVVSYFWVTQSIKERKMLNEHDFEVRGDV\ 35 | 
VNGRNHQGPKRARESQDRKIFRGLEICCYGPFTNMPTDQLEWMVQLCGASVVKELSSFTL\ 36 | GTGVHPIVVVQPDAWTEDNGFHAIGQMCEAPVVTREWVLDSVALYQCQELDTYLIPQIPH\ 37 | SHY" 38 | experiment: 'Complement' 39 | transform: 'None' 40 | authour: 'Findlay et al.' 41 | year: 2018 42 | title: 'Accurate classification of BRCA1 variants with saturation genome editing' 43 | lab: ['Shendure', 'Starita'] 44 | doi: '10.1038/s41586-018-0461-z' 45 | pmid: '30209399' 46 | url: 'https://www.nature.com/articles/s41586-018-0461-z' 47 | notes: "Mutated nucleotides exhaustively rather than AAs, so only has \ 48 | AAs that can be reached in a single substitution" 49 | input_files: 50 | - '41586_2018_461_MOESM3_ESM.xlsx' 51 | source: 'SI - Table S1' 52 | qc: 53 | filter: True 54 | notes: "Low coverage (only made single nucleotide substitutions)" 55 | --------------------------------------------------------------------------------