├── FOSJUN └── dataset │ ├── PDB │ └── 1fos_1.pdb │ └── elife-32472-supp1-v2.txt ├── FOSJUN_pipeline.R ├── GB1 └── dataset │ ├── GB1_sequence.fasta │ ├── Olson2014_TableS2_doubles.txt │ ├── Olson2014_TableS2_singles.txt │ ├── Olson2014_TableS2_wildtype.txt │ ├── PDB │ ├── 1pga.pdb │ └── g_xray.pdb │ └── PSIPRED │ ├── gb1.psipass2 │ ├── gb1.psipred.pdf │ ├── gb1.psipred.ps │ └── gb1.ss2 ├── GB1_downsampling └── dataset │ ├── GB1_CDS_nt │ └── amino_acid_codon_conversion.txt ├── GB1_downsampling_pipeline.R ├── GB1_pipeline.R ├── LICENSE ├── README.md ├── RRM └── dataset │ ├── PDB │ └── 1cvj.pdb │ ├── PSIPRED │ ├── PAB1.psipass2 │ ├── PAB1.psipred.pdf │ ├── PAB1.psipred.ps │ └── PAB1.ss2 │ ├── RRM_domain_sequence.fasta │ └── Supplementary_Table_5_doubles.txt ├── RRM_pipeline.R ├── WW ├── WW_pipeline.R └── dataset │ ├── PDB │ ├── 1k9q.pdb │ ├── 1k9q_model1_6to29.pdb │ └── 1k9q_model1_mod_manual.pdb │ ├── PSIPRED │ ├── WW1.psipass2 │ ├── WW1.psipred.pdf │ ├── WW1.psipred.ps │ └── WW1.ss2 │ ├── WW_sequence.fasta │ └── bash_scripts │ ├── 001_bash_download_SRA_data.sh │ ├── 002_bash_fastq_dump.sh │ ├── 003_bash_fastqc.sh │ ├── 004_bash_usearch_pairedreadmerging_Q20_ee0p1.sh │ └── 005_bash_usearch_fastx_unique_Q20.sh ├── WW_pipeline.R └── scripts ├── SS_from_PSIPRED.R ├── XPLOR ├── XPLOR_modeling_functions_v2.R ├── XPLOR_simulations.R ├── anneal_template.py └── refine_template.py ├── XPLOR_wrapper.R ├── analyse_XPLOR_results.R ├── calculate_pairwise_interaction_scores.R ├── call_epistasis_binary.R ├── convert_AAabr_one_three.R ├── create_directory_structure.R ├── deepcontact_transform_basic2d.R ├── epistasis_analytics.R ├── evaluate_contacts_vs_PDB.R ├── hbonds_from_betasheetpairing.R ├── identify_expand_seeds.R ├── misc ├── call_epistasis_symdata_v1.R ├── contact_matrix_from_pairdistances.R ├── kernel_structure_propensity.R ├── pairdistances_from_PDB_crystal.R ├── tau_specificity_score.R └── tile_heatmap_wrapper.R ├── pairdistances_from_PDB.R ├── plot_fitness_surface.R ├── 
predict_beta_sheets.R ├── predict_secondary_structure_elements.R ├── quantile_fitness_surface_adaptive.R ├── surface_at_gridpoint.R └── switch_double_DT.R /GB1/dataset/GB1_sequence.fasta: -------------------------------------------------------------------------------- 1 | >GB1 2 | MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE -------------------------------------------------------------------------------- /GB1/dataset/Olson2014_TableS2_singles.txt: -------------------------------------------------------------------------------- 1 | WT amino acid Position Mutation Input Count Selection Count Q 2 A 14663 38476 Q 2 C 13001 23023 Q 2 D 11488 18085 Q 2 E 9501 15629 Q 2 F 4770 13332 Q 2 G 12460 27778 Q 2 H 33615 71252 Q 2 I 13180 28931 Q 2 K 10166 23382 Q 2 L 40106 95276 Q 2 M 8287 20694 Q 2 N 20166 39445 Q 2 P 68124 167686 Q 2 R 37351 99959 Q 2 S 25071 55253 Q 2 T 24951 57168 Q 2 V 13922 28304 Q 2 W 4687 11796 Q 2 Y 11961 28793 Y 3 A 24572 24579 Y 3 C 4869 5187 Y 3 D 11538 190 Y 3 E 7916 121 Y 3 F 12855 23428 Y 3 G 9583 564 Y 3 H 34286 68210 Y 3 I 11006 5473 Y 3 K 12120 1089 Y 3 L 41550 56805 Y 3 M 9407 9818 Y 3 N 21962 4221 Y 3 P 115397 13682 Y 3 Q 19928 6234 Y 3 R 30444 4311 Y 3 S 38003 18224 Y 3 T 45311 11803 Y 3 V 13326 7033 Y 3 W 5089 9798 K 4 A 22568 27370 K 4 C 4357 5731 K 4 D 10377 10437 K 4 E 8752 9196 K 4 F 13134 17305 K 4 G 14831 14501 K 4 H 11157 14510 K 4 I 23783 39679 K 4 L 31238 71838 K 4 M 16821 27867 K 4 N 23440 28059 K 4 P 72825 4085 K 4 Q 8296 12864 K 4 R 22469 39647 K 4 S 31978 41200 K 4 T 47044 68114 K 4 V 18075 30181 K 4 W 4507 7256 K 4 Y 13209 17632 L 5 A 102479 100175 L 5 C 27859 37553 L 5 D 32890 222 L 5 E 26930 338 L 5 F 16197 14567 L 5 G 47338 18272 L 5 H 92720 29969 L 5 I 15033 5354 L 5 K 23710 568 L 5 M 34946 22256 L 5 N 29983 20241 L 5 P 133938 1649 L 5 Q 61046 21300 L 5 R 96469 1706 L 5 S 96274 71799 L 5 T 94325 95229 L 5 V 43625 54635 L 5 W 13800 1148 L 5 Y 47702 6813 I 6 A 3681 4673 I 6 C 15717 23476 I 6 D 5259 6253 I 6 E 4087 
8062 I 6 F 8610 18820 I 6 G 9897 19596 I 6 H 26635 50187 I 6 K 3704 6958 I 6 L 51935 127409 I 6 M 21205 30447 I 6 N 5283 7607 I 6 P 49132 349 I 6 Q 36062 68637 I 6 R 61986 117930 I 6 S 29356 39046 I 6 T 30894 57423 I 6 V 3536 6449 I 6 W 23812 50273 I 6 Y 22409 54649 L 7 A 34404 55458 L 7 C 12449 22293 L 7 D 11778 477 L 7 E 18267 4238 L 7 F 6309 4059 L 7 G 35071 16207 L 7 H 15664 1471 L 7 I 12610 37038 L 7 K 13270 5978 L 7 M 17642 12594 L 7 N 12614 4505 L 7 P 50684 45596 L 7 Q 20595 6443 L 7 R 49573 10097 L 7 S 40306 44544 L 7 T 44897 66023 L 7 V 24444 53108 L 7 W 12875 1310 L 7 Y 10612 2241 N 8 A 41498 73325 N 8 C 10419 19998 N 8 D 11498 18688 N 8 E 20466 33074 N 8 F 7730 21302 N 8 G 27388 43321 N 8 H 11762 26546 N 8 I 7929 12477 N 8 K 14407 32239 N 8 L 39548 65846 N 8 M 11905 15333 N 8 P 38933 1496 N 8 Q 19093 36218 N 8 R 44725 109754 N 8 S 36002 58381 N 8 T 36479 41620 N 8 V 24209 37603 N 8 W 13454 29498 N 8 Y 8536 23340 G 9 A 65857 90820 G 9 C 14763 16172 G 9 D 23794 5463 G 9 E 16766 3278 G 9 F 13660 6108 G 9 H 14290 5115 G 9 I 29687 7088 G 9 K 21971 4655 G 9 L 35873 12146 G 9 M 28714 7328 G 9 N 34727 7288 G 9 P 46102 18963 G 9 Q 10714 2366 G 9 R 38107 16124 G 9 S 59986 42587 G 9 T 83378 44105 G 9 V 39908 10779 G 9 W 10074 4696 G 9 Y 15887 6886 K 10 A 21539 35206 K 10 C 8456 14389 K 10 D 23660 39063 K 10 E 15907 24117 K 10 F 9023 15345 K 10 G 38207 44504 K 10 H 19516 37164 K 10 I 11121 20274 K 10 L 32545 59716 K 10 M 17008 20959 K 10 N 26632 52790 K 10 P 18073 12463 K 10 Q 10356 17632 K 10 R 39021 106293 K 10 S 28845 45028 K 10 T 20743 38733 K 10 V 30739 58818 K 10 W 9949 19051 K 10 Y 16988 30337 T 11 A 21240 12806 T 11 C 16220 11209 T 11 D 18836 10728 T 11 E 21981 11689 T 11 F 17814 11918 T 11 G 51023 24644 T 11 H 29467 19328 T 11 I 13690 8447 T 11 K 7671 5108 T 11 L 52385 32984 T 11 M 13196 5420 T 11 N 11666 8132 T 11 P 62555 19555 T 11 Q 59534 38097 T 11 R 84936 60451 T 11 S 53756 68761 T 11 V 17090 11551 T 11 W 23128 16088 T 11 Y 20849 14953 L 12 A 80686 
25231 L 12 C 23994 12420 L 12 D 37546 8641 L 12 E 44908 12601 L 12 F 21145 13767 L 12 G 126077 39005 L 12 H 48068 15096 L 12 I 38122 29613 L 12 K 36433 12378 L 12 M 41835 20147 L 12 N 51115 12577 L 12 P 92976 16826 L 12 Q 40010 14668 L 12 R 133367 45200 L 12 S 88507 27502 L 12 T 77923 30648 L 12 V 61316 35207 L 12 W 33165 9147 L 12 Y 41640 9491 K 13 A 51090 87056 K 13 C 11935 24232 K 13 D 16942 16219 K 13 E 23256 33571 K 13 F 15918 22650 K 13 G 16015 20792 K 13 H 17858 24023 K 13 I 18456 28325 K 13 L 53619 58806 K 13 M 17213 26339 K 13 N 24914 48289 K 13 P 92147 28369 K 13 Q 29766 48077 K 13 R 41546 101318 K 13 S 55067 151624 K 13 T 73085 212720 K 13 V 30084 56440 K 13 W 13803 44516 K 13 Y 16363 17424 G 14 A 59036 22637 G 14 C 15405 5946 G 14 D 19272 4508 G 14 E 24981 10605 G 14 F 18216 3836 G 14 H 28633 10268 G 14 I 15815 5793 G 14 K 15236 3390 G 14 L 73241 17722 G 14 M 19303 2655 G 14 N 18163 6746 G 14 P 107029 15918 G 14 Q 27506 9577 G 14 R 60734 17476 G 14 S 53429 19145 G 14 T 55458 23123 G 14 V 35098 14502 G 14 W 13041 6159 G 14 Y 17488 5065 E 15 A 35374 54806 E 15 C 11999 25648 E 15 D 25843 27352 E 15 F 13347 28524 E 15 G 43958 38387 E 15 H 26373 76960 E 15 I 21230 51279 E 15 K 19141 35533 E 15 L 66676 111516 E 15 M 23308 33648 E 15 N 28746 40639 E 15 P 45923 16911 E 15 Q 21882 51881 E 15 R 51673 127737 E 15 S 45073 119282 E 15 T 37041 94156 E 15 V 41378 106208 E 15 W 13024 32676 E 15 Y 17783 51121 T 16 A 38128 29083 T 16 C 18196 16997 T 16 D 12383 4727 T 16 E 27018 21493 T 16 F 20611 13948 T 16 G 113760 91632 T 16 H 20554 19280 T 16 I 22641 32740 T 16 K 14421 20458 T 16 L 86808 114278 T 16 M 42047 39266 T 16 N 10693 8357 T 16 P 90794 66331 T 16 Q 40611 48881 T 16 R 106367 184675 T 16 S 71248 69455 T 16 V 57504 73376 T 16 W 39584 20763 T 16 Y 15737 10608 T 17 A 49502 105171 T 17 C 12726 24562 T 17 D 23175 17511 T 17 E 26208 32446 T 17 F 19124 36535 T 17 G 77342 250890 T 17 H 29294 67973 T 17 I 21379 26724 T 17 K 17966 31951 T 17 L 45193 66035 T 17 M 16394 
26591 T 17 N 24237 32505 T 17 P 46699 28322 T 17 Q 16481 31164 T 17 R 57501 142948 T 17 S 60126 120151 T 17 V 42924 57540 T 17 W 16674 41918 T 17 Y 17209 33102 T 18 A 24941 47753 T 18 C 12780 26146 T 18 D 14900 7125 T 18 E 18261 8260 T 18 F 10364 16480 T 18 G 31188 34862 T 18 H 15556 27435 T 18 I 12371 18372 T 18 K 13299 26175 T 18 L 40817 41499 T 18 M 12954 8022 T 18 N 14761 20680 T 18 P 32959 2273 T 18 Q 17438 20418 T 18 R 40452 84029 T 18 S 43099 70606 T 18 V 28486 56671 T 18 W 11084 18827 T 18 Y 11917 19875 E 19 A 18824 47540 E 19 C 4896 11333 E 19 D 11420 23477 E 19 F 7277 19654 E 19 G 25998 63362 E 19 H 5505 14168 E 19 I 8639 23671 E 19 K 10812 29121 E 19 L 20503 54766 E 19 M 11552 23452 E 19 N 13257 34536 E 19 P 18245 48044 E 19 Q 9046 23201 E 19 R 17346 48488 E 19 S 23583 57158 E 19 T 20505 49629 E 19 V 11313 30847 E 19 W 2977 8035 E 19 Y 5374 14191 A 20 C 24537 37806 A 20 D 20563 9947 A 20 E 15652 10857 A 20 F 16676 7008 A 20 G 44945 71573 A 20 H 42209 43445 A 20 I 19267 11063 A 20 K 15365 43568 A 20 L 77574 53578 A 20 M 24103 22390 A 20 N 26540 36162 A 20 P 51963 27499 A 20 Q 28547 43826 A 20 R 64542 264673 A 20 S 51408 99798 A 20 T 41910 78941 A 20 V 44986 82564 A 20 W 11207 7651 A 20 Y 24835 13413 V 21 A 13650 23326 V 21 C 14099 26854 V 21 D 10031 12945 V 21 E 19264 21930 V 21 F 13024 30576 V 21 G 61067 118204 V 21 H 15286 28460 V 21 I 24563 48919 V 21 K 20400 35012 V 21 L 79943 141666 V 21 M 47124 62252 V 21 N 8333 15994 V 21 P 55694 88295 V 21 Q 33194 55950 V 21 R 65544 175477 V 21 S 46233 79777 V 21 T 56564 93099 V 21 W 30867 77314 V 21 Y 11617 27353 D 22 A 9047 25911 D 22 C 10972 27342 D 22 E 16317 44052 D 22 F 6210 15240 D 22 G 23209 59545 D 22 H 11342 27861 D 22 I 19813 49271 D 22 K 13479 24719 D 22 L 34282 88054 D 22 M 20338 40916 D 22 N 16267 35157 D 22 P 16807 29528 D 22 Q 13803 37575 D 22 R 28638 61579 D 22 S 32697 70745 D 22 T 20286 48870 D 22 V 24190 64599 D 22 W 13688 38811 D 22 Y 9194 22559 A 23 C 18057 10764 A 23 D 11942 8380 A 23 E 17648 
8102 A 23 F 21097 302 A 23 G 34128 30318 A 23 H 12892 1590 A 23 I 16994 2500 A 23 K 16460 1807 A 23 L 60733 8751 A 23 M 17224 1063 A 23 N 13924 13401 A 23 P 77434 143942 A 23 Q 15496 2201 A 23 R 47171 4880 A 23 S 56880 84047 A 23 T 36646 21766 A 23 V 39290 17221 A 23 W 19887 201 A 23 Y 13372 157 A 24 C 10754 36229 A 24 D 13144 24369 A 24 E 16991 103524 A 24 F 16392 103489 A 24 G 39536 168270 A 24 H 5846 37594 A 24 I 18686 47237 A 24 K 11269 3056 A 24 L 32451 73148 A 24 M 16651 61844 A 24 N 11510 48609 A 24 P 18147 108794 A 24 Q 7560 36593 A 24 R 24555 20139 A 24 S 34534 124522 A 24 T 26572 111754 A 24 V 53445 150842 A 24 W 9271 68725 A 24 Y 8272 71811 T 25 A 46590 93295 T 25 C 37737 91653 T 25 D 49759 32504 T 25 E 32850 20013 T 25 F 54190 167843 T 25 G 55253 88914 T 25 H 36621 89699 T 25 I 40464 105691 T 25 K 33520 71394 T 25 L 89027 172970 T 25 M 36011 127671 T 25 N 64696 106237 T 25 P 28742 59440 T 25 Q 23603 67274 T 25 R 62651 255882 T 25 S 85249 217570 T 25 V 64589 106108 T 25 W 20589 77262 T 25 Y 64615 129609 A 26 C 40939 52103 A 26 D 16721 171 A 26 E 21980 281 A 26 F 36840 476 A 26 G 55786 111192 A 26 H 36742 414 A 26 I 47218 12166 A 26 K 30044 275 A 26 L 153776 2683 A 26 M 77582 16353 A 26 N 17271 424 A 26 P 110496 44600 A 26 Q 59433 6474 A 26 R 112216 1777 A 26 S 97456 175161 A 26 T 89974 68569 A 26 V 47681 26746 A 26 W 65263 2807 A 26 Y 33011 463 E 27 A 33597 134 E 27 C 25543 227 E 27 D 27258 164 E 27 F 23278 87 E 27 G 62168 332 E 27 H 31349 152 E 27 I 41336 218 E 27 K 17150 73 E 27 L 88198 379 E 27 M 36178 152 E 27 N 28919 142 E 27 P 49182 300 E 27 Q 21151 90 E 27 R 83788 339 E 27 S 70701 295 E 27 T 41217 210 E 27 V 70819 345 E 27 W 17196 71 E 27 Y 25091 126 K 28 A 26191 2268 K 28 C 23792 5594 K 28 D 16711 145 K 28 E 16630 150 K 28 F 14273 3425 K 28 G 31664 2623 K 28 H 17790 3067 K 28 I 8669 4232 K 28 L 42503 14362 K 28 M 15445 3448 K 28 N 20952 1555 K 28 P 35132 4458 K 28 Q 21106 1918 K 28 R 49893 36868 K 28 S 52024 3631 K 28 T 33309 5330 K 28 V 20079 
5930 K 28 W 17099 1177 K 28 Y 20806 1597 V 29 A 39812 92273 V 29 C 19851 45232 V 29 D 23564 10442 V 29 E 25354 12525 V 29 F 15440 26765 V 29 G 68382 114448 V 29 H 15935 32754 V 29 I 17523 39268 V 29 K 13073 51292 V 29 L 52829 116555 V 29 M 26006 53730 V 29 N 14142 35376 V 29 P 38418 7455 V 29 Q 10223 23746 V 29 R 33965 158139 V 29 S 54027 118947 V 29 T 46829 102169 V 29 W 11810 15508 V 29 Y 20082 36706 F 30 A 18125 6748 F 30 C 10727 10717 F 30 D 9088 65 F 30 E 12864 105 F 30 G 28882 2388 F 30 H 12045 17363 F 30 I 14663 5071 F 30 K 13030 1544 F 30 L 61921 84159 F 30 M 21611 24474 F 30 N 13478 3499 F 30 P 14188 144 F 30 Q 9211 1261 F 30 R 27385 236 F 30 S 35649 21685 F 30 T 20023 3170 F 30 V 28681 20083 F 30 W 15150 11664 F 30 Y 15979 11560 K 31 A 55355 306 K 31 C 19770 223 K 31 D 22264 107 K 31 E 34179 166 K 31 F 25061 132 K 31 G 89526 447 K 31 H 28017 103 K 31 I 37026 239 K 31 L 59926 627 K 31 M 29810 558 K 31 N 19575 120 K 31 P 37140 233 K 31 Q 19850 122 K 31 R 96271 38897 K 31 S 58473 277 K 31 T 70080 283 K 31 V 58660 287 K 31 W 37743 306 K 31 Y 25248 95 Q 32 A 45737 62296 Q 32 C 25211 35086 Q 32 D 26730 2704 Q 32 E 16067 2097 Q 32 F 13104 5269 Q 32 G 41499 107395 Q 32 H 34805 69113 Q 32 I 23939 16270 Q 32 K 19405 43027 Q 32 L 53796 89371 Q 32 M 17386 24572 Q 32 N 27848 78606 Q 32 P 84178 2721 Q 32 R 68222 161777 Q 32 S 75023 117147 Q 32 T 59318 68119 Q 32 V 37714 27235 Q 32 W 15661 6587 Q 32 Y 23896 8581 Y 33 A 42937 14211 Y 33 C 13618 7227 Y 33 D 12938 450 Y 33 E 27209 2110 Y 33 F 14944 40764 Y 33 G 32478 17473 Y 33 H 27158 17449 Y 33 I 9098 3549 Y 33 K 15792 8926 Y 33 L 52029 59654 Y 33 M 12336 10217 Y 33 N 12552 4067 Y 33 P 121812 1734 Y 33 Q 27382 11808 Y 33 R 55133 28853 Y 33 S 49671 15665 Y 33 T 36843 10498 Y 33 V 30407 9053 Y 33 W 20801 23701 A 34 C 10374 18607 A 34 D 15003 109 A 34 E 22624 158 A 34 F 15606 339 A 34 G 45889 20105 A 34 H 12544 110 A 34 I 18933 1713 A 34 K 14995 92 A 34 L 49709 3575 A 34 M 21124 28072 A 34 N 15857 173 A 34 P 65070 440 A 34 
Q 14331 127 A 34 R 38446 770 A 34 S 66756 91257 A 34 T 63560 23065 A 34 V 41506 12859 A 34 W 12172 187 A 34 Y 13350 426 N 35 A 63720 1396 N 35 C 19456 1709 N 35 D 19194 226 N 35 E 23562 279 N 35 F 20382 650 N 35 G 53332 4777 N 35 H 10896 295 N 35 I 17748 977 N 35 K 20037 245 N 35 L 57502 1005 N 35 M 24975 530 N 35 P 25199 132 N 35 Q 17172 241 N 35 R 51659 1222 N 35 S 59069 5017 N 35 T 42623 7111 N 35 V 56124 1001 N 35 W 16332 844 N 35 Y 14304 375 D 36 A 37862 108638 D 36 C 13800 35933 D 36 E 24683 48688 D 36 F 14014 21786 D 36 G 40780 130232 D 36 H 20486 39282 D 36 I 17312 40222 D 36 K 16394 37048 D 36 L 54438 136708 D 36 M 22041 54362 D 36 N 14209 45759 D 36 P 61131 1491 D 36 Q 23100 53835 D 36 R 41259 121420 D 36 S 36216 104810 D 36 T 41883 105626 D 36 V 37504 107749 D 36 W 19650 29800 D 36 Y 17074 30458 N 37 A 10769 4178 N 37 C 6461 6480 N 37 D 5993 659 N 37 E 8814 2283 N 37 F 4969 492 N 37 G 13000 8525 N 37 H 14291 8514 N 37 I 13969 8967 N 37 K 27145 9793 N 37 L 30911 16367 N 37 M 16191 5886 N 37 P 18721 181 N 37 Q 12411 6013 N 37 R 37937 12594 N 37 S 32491 33449 N 37 T 9824 7775 N 37 V 10572 5024 N 37 W 17030 4636 N 37 Y 8329 1424 G 38 A 39224 67579 G 38 C 18699 17573 G 38 D 19563 19070 G 38 E 19515 15234 G 38 F 15474 15153 G 38 H 13944 11076 G 38 I 11316 4050 G 38 K 15497 11369 G 38 L 50251 53855 G 38 M 17289 13778 G 38 N 14985 12910 G 38 P 21785 259 G 38 Q 15375 12082 G 38 R 53827 39352 G 38 S 49899 51247 G 38 T 22571 18387 G 38 V 44184 31440 G 38 W 22393 23946 G 38 Y 11640 12772 V 39 A 50993 4710 V 39 C 29687 10729 V 39 D 27237 263 V 39 E 24644 1203 V 39 F 45582 25645 V 39 G 54685 677 V 39 H 10859 429 V 39 I 41684 89921 V 39 K 13498 406 V 39 L 59997 151290 V 39 M 22038 38487 V 39 N 21962 381 V 39 P 26222 350 V 39 Q 9603 1471 V 39 R 45394 1165 V 39 S 58447 2992 V 39 T 31568 11612 V 39 W 20952 325 V 39 Y 20930 2249 D 40 A 72913 261212 D 40 C 22805 62542 D 40 E 34701 44572 D 40 F 29759 135618 D 40 G 36838 47487 D 40 H 24747 113760 D 40 I 25622 68491 D 40 K 
24297 41248 D 40 L 71102 96247 D 40 M 30818 50279 D 40 N 13713 36886 D 40 P 47981 28789 D 40 Q 26309 43722 D 40 R 92738 227667 D 40 S 84297 272107 D 40 T 64079 111421 D 40 V 76641 222704 D 40 W 34963 203511 D 40 Y 23207 134736 G 41 A 40035 7202 G 41 C 27048 2450 G 41 D 28032 249 G 41 E 25625 246 G 41 F 28902 2890 G 41 H 22666 147 G 41 I 19771 101 G 41 K 22532 137 G 41 L 61881 902 G 41 M 34714 679 G 41 N 21696 186 G 41 P 40724 298 G 41 Q 24377 152 G 41 R 85007 552 G 41 S 49820 1032 G 41 T 45043 276 G 41 V 53408 1274 G 41 W 35914 403 G 41 Y 27320 559 E 42 A 41720 110713 E 42 C 34433 91108 E 42 D 28410 58190 E 42 F 11284 31141 E 42 G 48896 132453 E 42 H 79258 179793 E 42 I 215952 707845 E 42 K 31298 88839 E 42 L 107262 396377 E 42 M 118559 355384 E 42 N 43914 117273 E 42 P 78746 93537 E 42 Q 21762 80916 E 42 R 101401 333896 E 42 S 205734 590830 E 42 T 75886 240399 E 42 V 58682 185266 E 42 W 15388 55497 E 42 Y 39163 106077 W 43 A 34051 222 W 43 C 31817 256 W 43 D 16025 93 W 43 E 19799 111 W 43 F 31836 722 W 43 G 28923 234 W 43 H 33953 247 W 43 I 13577 75 W 43 K 23290 106 W 43 L 90390 568 W 43 M 19622 103 W 43 N 20699 101 W 43 P 102078 542 W 43 Q 24945 142 W 43 R 60402 512 W 43 S 71157 404 W 43 T 45303 279 W 43 V 26354 151 W 43 Y 38099 531 T 44 A 54538 137791 T 44 C 23389 38879 T 44 D 43206 51360 T 44 E 36333 61956 T 44 F 46475 82200 T 44 G 61901 149762 T 44 H 27996 54886 T 44 I 98732 98379 T 44 K 31904 68023 T 44 L 90253 157222 T 44 M 47645 96567 T 44 N 68354 97684 T 44 P 45159 733 T 44 Q 18368 45672 T 44 R 52296 110049 T 44 S 104291 210464 T 44 V 95580 80822 T 44 W 18350 40417 T 44 Y 20506 30793 Y 45 A 34745 3261 Y 45 C 19082 6808 Y 45 D 15906 261 Y 45 E 16013 349 Y 45 F 18795 29616 Y 45 G 38347 2409 Y 45 H 12601 5219 Y 45 I 14215 5785 Y 45 K 14572 1205 Y 45 L 62081 47234 Y 45 M 23118 11079 Y 45 N 11795 911 Y 45 P 26630 338 Y 45 Q 9616 423 Y 45 R 47758 3954 Y 45 S 39780 2840 Y 45 T 23175 1611 Y 45 V 51587 12146 Y 45 W 26206 81940 D 46 A 48346 102811 D 46 C 20557 45983 
D 46 E 37556 65024 D 46 F 18628 34090 D 46 G 23940 48100 D 46 H 27683 60059 D 46 I 21946 45693 D 46 K 32575 75213 D 46 L 64854 138921 D 46 M 26037 54534 D 46 N 20019 47544 D 46 P 39434 2273 D 46 Q 35722 76767 D 46 R 90713 214058 D 46 S 53436 128674 D 46 T 59465 160767 D 46 V 49576 109810 D 46 W 40239 79467 D 46 Y 21535 36622 D 47 A 7204 21366 D 47 C 13478 32851 D 47 E 12678 24553 D 47 F 6624 20154 D 47 G 51719 141201 D 47 H 26045 98148 D 47 I 77112 191499 D 47 K 36526 135127 D 47 L 67112 176415 D 47 M 39127 94955 D 47 N 32525 93407 D 47 P 19598 64620 D 47 Q 14347 42281 D 47 R 95155 459818 D 47 S 81857 253299 D 47 T 10792 28271 D 47 V 46400 122163 D 47 W 30408 112440 D 47 Y 12387 37249 A 48 C 25328 52729 A 48 D 10830 20547 A 48 E 19124 38164 A 48 F 14034 33455 A 48 G 45860 98142 A 48 H 9313 21925 A 48 I 20022 47453 A 48 K 37423 86560 A 48 L 105890 242246 A 48 M 42758 96305 A 48 N 19408 42082 A 48 P 18594 40783 A 48 Q 12222 27746 A 48 R 70326 162935 A 48 S 56835 124948 A 48 T 33682 75483 A 48 V 41059 95216 A 48 W 41855 94752 A 48 Y 23998 54383 T 49 A 23592 47971 T 49 C 22589 48249 T 49 D 16890 31419 T 49 E 10154 20579 T 49 F 64416 134447 T 49 G 44684 90121 T 49 H 7397 16284 T 49 I 43837 108528 T 49 K 9120 21107 T 49 L 60630 132230 T 49 M 22054 45431 T 49 N 20824 41777 T 49 P 11871 20258 T 49 Q 4755 10031 T 49 R 24029 54393 T 49 S 41183 92486 T 49 V 67111 133195 T 49 W 10081 22640 T 49 Y 20669 45721 K 50 A 20795 41734 K 50 C 16071 26863 K 50 D 7948 13072 K 50 E 10822 18816 K 50 F 21470 28018 K 50 G 44464 120012 K 50 H 7675 15678 K 50 I 13587 19713 K 50 L 45151 69429 K 50 M 17327 23878 K 50 N 11244 18577 K 50 P 7370 4866 K 50 Q 7663 14302 K 50 R 38243 88134 K 50 S 26578 50576 K 50 T 12098 21044 K 50 V 30878 56855 K 50 W 21825 35792 K 50 Y 12759 18361 T 51 A 26978 38976 T 51 C 18668 37878 T 51 D 13315 28365 T 51 E 18309 48531 T 51 F 14899 40786 T 51 G 46770 86314 T 51 H 11831 34308 T 51 I 15310 46622 T 51 K 11778 34328 T 51 L 37478 134802 T 51 M 18098 57337 T 51 N 11582 
27958 T 51 P 14245 493 T 51 Q 14867 43071 T 51 R 50483 147093 T 51 S 35977 54557 T 51 V 37307 115951 T 51 W 25156 74824 T 51 Y 15632 44003 F 52 A 23631 718 F 52 C 39559 1525 F 52 D 43358 275 F 52 E 85794 592 F 52 G 130332 1109 F 52 H 42029 1615 F 52 I 37954 3926 F 52 K 89853 605 F 52 L 102269 37343 F 52 M 44034 13994 F 52 N 52808 379 F 52 P 47858 336 F 52 Q 63866 494 F 52 R 179406 1251 F 52 S 98399 1048 F 52 T 62918 784 F 52 V 37470 2101 F 52 W 76426 948 F 52 Y 53880 9805 T 53 A 35249 61175 T 53 C 11727 18709 T 53 D 16704 18240 T 53 E 22498 37446 T 53 F 7020 6562 T 53 G 39615 121803 T 53 H 13525 15852 T 53 I 8221 10792 T 53 K 20005 32942 T 53 L 42622 40972 T 53 M 9002 12462 T 53 N 14090 18607 T 53 P 30936 462 T 53 Q 13143 23217 T 53 R 46367 72498 T 53 S 40744 82571 T 53 V 30573 56318 T 53 W 12247 21316 T 53 Y 11914 12078 V 54 A 25806 52860 V 54 C 8191 17279 V 54 D 8840 416 V 54 E 8258 435 V 54 F 13497 16528 V 54 G 23276 9231 V 54 H 4506 1088 V 54 I 7362 4142 V 54 K 4728 54 V 54 L 42347 24038 V 54 M 6622 5786 V 54 N 6557 2202 V 54 P 10201 115 V 54 Q 4880 905 V 54 R 19127 176 V 54 S 19097 14125 V 54 T 11354 10421 V 54 W 7260 7582 V 54 Y 6720 286 T 55 A 20582 24299 T 55 C 18726 22674 T 55 D 17959 15830 T 55 E 18307 19558 T 55 F 16398 17146 T 55 G 44442 34757 T 55 H 10879 11224 T 55 I 18174 17308 T 55 K 16615 24381 T 55 L 35437 35932 T 55 M 18530 19909 T 55 N 17278 18366 T 55 P 7685 436 T 55 Q 12523 14755 T 55 R 48640 79126 T 55 S 31279 45403 T 55 V 29999 34713 T 55 W 18644 25725 T 55 Y 17203 19707 E 56 A 32602 17262 E 56 C 13643 4884 E 56 D 16629 4912 E 56 F 12085 4330 E 56 G 23114 9238 E 56 H 17551 7403 E 56 I 10718 2529 E 56 K 18397 8191 E 56 L 36684 8700 E 56 M 16210 4810 E 56 N 16311 5976 E 56 P 27785 6802 E 56 Q 19970 14058 E 56 R 45467 20955 E 56 S 42231 26020 E 56 T 29883 21199 E 56 V 29251 12541 E 56 W 17351 9023 E 56 Y 16560 5434 -------------------------------------------------------------------------------- /GB1/dataset/Olson2014_TableS2_wildtype.txt: 
-------------------------------------------------------------------------------- 1 | Input Count Selection Count 1759616 3041819 -------------------------------------------------------------------------------- /GB1/dataset/PSIPRED/gb1.psipass2: -------------------------------------------------------------------------------- 1 | # PSIPRED HFORMAT (PSIPRED V3.3) 2 | 3 | Conf: 93799980512412445775088889999997554258740465407823688529 4 | Pred: CEEEEEEECCCCCEEEEEEEECHHHHHHHHHHHHHHCCCCEEEEEECCCEEEEEEC 5 | AA: MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE 6 | 10 20 30 40 50 7 | 8 | -------------------------------------------------------------------------------- /GB1/dataset/PSIPRED/gb1.psipred.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lehner-lab/DMS2structure/3c1976b78a743d4d4c73921f047648500e671c8e/GB1/dataset/PSIPRED/gb1.psipred.pdf -------------------------------------------------------------------------------- /GB1/dataset/PSIPRED/gb1.ss2: -------------------------------------------------------------------------------- 1 | # PSIPRED VFORMAT (PSIPRED V3.3) 2 | 3 | 1 M C 1.000 0.000 0.000 4 | 2 Q E 0.293 0.002 0.652 5 | 3 Y E 0.106 0.017 0.846 6 | 4 K E 0.028 0.003 0.946 7 | 5 L E 0.012 0.004 0.972 8 | 6 I E 0.016 0.006 0.976 9 | 7 L E 0.061 0.007 0.923 10 | 8 N E 0.418 0.006 0.515 11 | 9 G C 0.742 0.004 0.202 12 | 10 K C 0.575 0.007 0.425 13 | 11 T C 0.608 0.008 0.390 14 | 12 L C 0.680 0.008 0.266 15 | 13 K C 0.524 0.011 0.385 16 | 14 G E 0.320 0.009 0.612 17 | 15 E E 0.249 0.005 0.726 18 | 16 T E 0.192 0.046 0.656 19 | 17 T E 0.131 0.030 0.700 20 | 18 T E 0.064 0.069 0.847 21 | 19 E E 0.056 0.110 0.875 22 | 20 A E 0.192 0.086 0.711 23 | 21 V E 0.468 0.123 0.480 24 | 22 D C 0.899 0.063 0.068 25 | 23 A H 0.070 0.905 0.002 26 | 24 A H 0.076 0.885 0.004 27 | 25 T H 0.048 0.926 0.003 28 | 26 A H 0.017 0.975 0.001 29 | 27 E H 0.013 0.983 0.000 30 | 28 K H 0.010 0.982 0.000 31 | 29 V H 
0.008 0.985 0.000 32 | 30 F H 0.016 0.975 0.001 33 | 31 K H 0.032 0.947 0.005 34 | 32 Q H 0.095 0.851 0.032 35 | 33 Y H 0.218 0.750 0.023 36 | 34 A H 0.207 0.780 0.010 37 | 35 N H 0.247 0.736 0.008 38 | 36 D H 0.367 0.584 0.009 39 | 37 N C 0.741 0.196 0.021 40 | 38 G C 0.930 0.019 0.035 41 | 39 V C 0.858 0.008 0.122 42 | 40 D C 0.704 0.005 0.276 43 | 41 G E 0.453 0.002 0.523 44 | 42 E E 0.268 0.001 0.767 45 | 43 W E 0.173 0.000 0.847 46 | 44 T E 0.204 0.001 0.801 47 | 45 Y E 0.299 0.005 0.710 48 | 46 D E 0.493 0.012 0.501 49 | 47 D C 0.854 0.052 0.060 50 | 48 A C 0.907 0.029 0.042 51 | 49 T C 0.616 0.034 0.331 52 | 50 K E 0.344 0.008 0.661 53 | 51 T E 0.180 0.006 0.858 54 | 52 F E 0.097 0.001 0.920 55 | 53 T E 0.064 0.001 0.944 56 | 54 V E 0.215 0.001 0.748 57 | 55 T E 0.358 0.002 0.574 58 | 56 E C 0.977 0.000 0.001 59 | -------------------------------------------------------------------------------- /GB1_downsampling/dataset/GB1_CDS_nt: -------------------------------------------------------------------------------- 1 | >M13825.1:578-1924 Streptococcus sp. 
(Lancefield group G) spg gene encoding an immunoglobulin G binding protein 2 | ATGGAAAAAGAAAAAAAGGTAAAATACTTTTTACGTAAATCAGCTTTTGGGTTAGCATCCGTATCAGCTG 3 | CATTTTTAGTGGGATCAACGGTATTCGCTGTTGATTCACCAATCGAAGATACCCCAATTATTCGTAATGG 4 | TGGTGAATTAACTAATCTTCTGGGGAATTCAGAGACAACACTGGCTTTGCGTAATGAAGAGAGTGCTACA 5 | GCTGATTTGACAGCAGCAGCGGTAGCCGATACTGTGGCAGCAGCGGCAGCTGAAAATGCTGGGGCAGCAG 6 | CTTGGGAAGCAGCGGCAGCAGCAGATGCTCTAGCAAAAGCCAAAGCAGATGCCCTTAAAGAATTCAACAA 7 | ATATGGAGTAAGTGACTATTACAAGAATCTAATCAACAATGCCAAAACTGTTGAAGGCATAAAAGACCTT 8 | CAAGCACAAGTTGTTGAATCAGCGAAGAAAGCGCGTATTTCAGAAGCAACAGATGGCTTATCTGATTTCT 9 | TGAAATCGCAAACACCTGCTGAAGATACTGTTAAATCAATTGAATTAGCTGAAGCTAAAGTCTTAGCTAA 10 | CAGAGAACTTGACAAATATGGAGTAAGTGACTATCACAAGAACCTAATCAACAATGCCAAAACTGTTGAA 11 | GGTGTAAAAGAACTGATAGATGAAATTTTAGCTGCATTACCTAAGACTGACACTTACAAATTAATCCTTA 12 | ATGGTAAAACATTGAAAGGCGAAACAACTACTGAAGCTGTTGATGCTGCTACTGCAGAAAAAGTCTTCAA 13 | ACAATACGCTAACGACAACGGTGTTGACGGTGAATGGACTTACGACGATGCGACTAAGACCTTTACAGTT 14 | ACTGAAAAACCAGAAGTGATCGATGCGTCTGAATTAACACCAGCCGTGACAACTTACAAACTTGTTATTA 15 | ATGGTAAAACATTGAAAGGCGAAACAACTACTAAAGCAGTAGACGCAGAAACTGCAGAAAAAGCCTTCAA 16 | ACAATACGCTAACGACAACGGTGTTGATGGTGTTTGGACTTATGATGATGCGACTAAGACCTTTACGGTA 17 | ACTGAAATGGTTACAGAGGTTCCTGGTGATGCACCAACTGAACCAGAAAAACCAGAAGCAAGTATCCCTC 18 | TTGTTCCGTTAACTCCTGCAACTCCAATTGCTAAAGATGACGCTAAGAAAGACGATACTAAGAAAGAAGA 19 | TGCTAAAAAACCAGAAGCTAAGAAAGATGACGCTAAGAAAGCTGAAACTCTTCCTACAACTGGTGAAGGA 20 | AGCAACCCATTCTTCACAGCAGCTGCGCTTGCAGTAATGGCTGGTGCGGGTGCTTTGGCGGTCGCTTCAA 21 | AACGTAAAGAAGACTAA 22 | -------------------------------------------------------------------------------- /GB1_downsampling/dataset/amino_acid_codon_conversion.txt: -------------------------------------------------------------------------------- 1 | Amino Acid SLC DNA codons Isoleucine I ATT Isoleucine I ATC Isoleucine I ATA Leucine L CTT Leucine L CTC Leucine L CTA Leucine L CTG Leucine L TTA Leucine L TTG Valine V GTC Valine V GTA Valine V GTG Valine V GTT Phenylalanine F TTC 
Phenylalanine F TTT Methionine M ATG Cysteine C TGC Cysteine C TGT Alanine A GCC Alanine A GCA Alanine A GCG Alanine A GCT Glycine G GGG Glycine G GGA Glycine G GGC Glycine G GGT Proline P CCG Proline P CCA Proline P CCC Proline P CCT Threonine T ACT Threonine T ACC Threonine T ACA Threonine T ACG Serine S TCT Serine S TCC Serine S TCA Serine S TCG Serine S AGT Serine S AGC Tyrosine Y TAT Tyrosine Y TAC Tryptophan W TGG Glutamine Q CAG Glutamine Q CAA Asparagine N AAT Asparagine N AAC Histidine H CAT Histidine H CAC Glutamicacid E GAA Glutamicacid E GAG Asparticacid D GAT Asparticacid D GAC Lysine K AAA Lysine K AAG Arginine R CGT Arginine R CGC Arginine R CGA Arginine R CGG Arginine R AGA Arginine R AGG Stopcodons Stop TAG Stopcodons Stop TGA Stopcodons Stop TAA -------------------------------------------------------------------------------- /GB1_downsampling_pipeline.R: -------------------------------------------------------------------------------- 1 | ############################ 2 | ##### GB1 downsampling ##### 3 | ############################ 4 | 5 | #this is the pipeline used to analyse the downsampled versions of the GB1 data from Olson et al. 
2014 6 | 7 | #first, set the working directory to the DMS2structure folder 8 | setwd("/where/is/DMS2structure/") 9 | 10 | #source scripts 11 | filelist = list.files('scripts/') 12 | sapply(paste0('scripts/',filelist),source,.GlobalEnv) 13 | 14 | #create the necessary subfolder structure for all results and processed data 15 | dataset_dir = "GB1_downsampling/" 16 | create_directory_structure(dataset_dir) 17 | #then save this script in the dataset_dir 18 | #and paste all necessary source data into dataset_dir/dataset 19 | 20 | #load required packages 21 | require(data.table) 22 | require(ggplot2) 23 | require(cowplot) 24 | require(GGally) 25 | theme_set(theme_minimal()) 26 | require(seqinr) 27 | 28 | 29 | ############################### 30 | ##### READ downsampling ####### 31 | ############################### 32 | 33 | read_downsampling = c(0.25,0.1,0.025) 34 | 35 | for (RD in seq_along(read_downsampling)) { 36 | 37 | #load original data 38 | wildtype = fread("GB1/dataset/Olson2014_TableS2_wildtype.txt", sep = "\t", header = TRUE) 39 | singles = fread("GB1/dataset/Olson2014_TableS2_singles.txt", sep = "\t",header = TRUE) 40 | doubles = fread("GB1/dataset/Olson2014_TableS2_doubles.txt", sep = "\t",header = TRUE) 41 | doubles[,c("V11","V12","V13","V14","V15","V16","V17","V18") := NULL] 42 | 43 | # rename columns 44 | colnames(wildtype) = c("count_r1_t0","count_r1_t1") 45 | colnames(singles) = c("WT_AA","Pos","Mut","count_r1_t0","count_r1_t1") 46 | colnames(doubles) = c("WT_AA1","Pos1","Mut1","WT_AA2","Pos2","Mut2","count_r1_t0","count_r1_t1","fitness1","fitness2") 47 | 48 | 49 | ## rearrange doubles$GB1 such that always Pos1 < Pos2 50 | doubles[Pos1 > Pos2,':=' (Pos1=Pos2,WT_AA1 = WT_AA2,Mut1 = Mut2,fitness1 = fitness2, 51 | Pos2=Pos1,WT_AA2 = WT_AA1,Mut2 = Mut1,fitness2 = fitness1)] 52 | 53 | 54 | ######################## 55 | ##### downsample ####### 56 | ######################## 57 | set.seed(1603) 58 | wildtype[,count_r1_t0 := rbinom(1,size = count_r1_t0,prob = 
read_downsampling[RD])] 59 | wildtype[,count_r1_t1 := rbinom(1,size = count_r1_t1,prob = read_downsampling[RD])] 60 | 61 | singles[,count_r1_t0 := rbinom(1,size = count_r1_t0,prob = read_downsampling[RD]),.(Pos,Mut)] 62 | singles[,count_r1_t1 := rbinom(1,size = count_r1_t1,prob = read_downsampling[RD]),.(Pos,Mut)] 63 | 64 | doubles[,count_r1_t0 := rbinom(1,size = count_r1_t0,prob = read_downsampling[RD]),.(Pos1,Pos2,Mut1,Mut2)] 65 | doubles[,count_r1_t1 := rbinom(1,size = count_r1_t1,prob = read_downsampling[RD]),.(Pos1,Pos2,Mut1,Mut2)] 66 | 67 | 68 | ## calculate fitness 69 | wildtype[,fitness:=0] 70 | singles[,fitness := log(count_r1_t1/count_r1_t0 * (wildtype$count_r1_t0 / wildtype$count_r1_t1))] 71 | 72 | #wild-type correction factor 73 | xd=density(singles$fitness,bw=.15) 74 | # plot(xd) 75 | #fitness peak ~ wildtype peak 76 | ## both fits give similar result for upper mode, exp(0.193) and exp(0.169) 77 | correction_factor_wildtype =xd$x[xd$y==max(xd$y)] 78 | 79 | #correct fitness value for this factor 80 | singles[,fitness := NULL] 81 | singles[,fitness := log(count_r1_t1/count_r1_t0 * (wildtype$count_r1_t0 / wildtype$count_r1_t1)) - correction_factor_wildtype] 82 | doubles[,fitness := log(count_r1_t1/count_r1_t0 / (wildtype$count_r1_t1 / wildtype$count_r1_t0)) - correction_factor_wildtype] 83 | 84 | # calculate standard-error of fitness values given read counts 85 | wildtype[,sigma := sqrt(1/count_r1_t1 + 1/count_r1_t0)] 86 | singles[,sigma := sqrt(1/count_r1_t1 + 1/count_r1_t0 + 1/wildtype$count_r1_t1 + 1/wildtype$count_r1_t0)] 87 | doubles[,sigma := sqrt(1/count_r1_t1 + 1/count_r1_t0 + 1/wildtype$count_r1_t1 + 1/wildtype$count_r1_t0)] 88 | 89 | # transfer single fitness/error values to doubles data.table 90 | doubles[,fitness1 := singles[Pos == Pos1 & Mut == Mut1,fitness],.(Pos1,Mut1)] 91 | doubles[,fitness2 := singles[Pos == Pos2 & Mut == Mut2,fitness],.(Pos2,Mut2)] 92 | doubles[,sigma1 := singles$sigma[singles$Pos %in% Pos1 & singles$Mut %in% 
Mut1],by=.(Pos1,Mut1)] 93 | doubles[,sigma2 := singles$sigma[singles$Pos %in% Pos2 & singles$Mut %in% Mut2],by=.(Pos2,Mut2)] 94 | 95 | #mark variants with nonsensical fitness values 96 | wildtype[,is.fitness := fitness > -Inf & !is.na(fitness)] 97 | singles[,is.fitness := fitness > -Inf & !is.na(fitness)] 98 | doubles[,is.fitness := fitness > -Inf & !is.na(fitness) & fitness1 > -Inf & fitness2 > -Inf] 99 | 100 | # define which variants have enough reads 101 | wildtype[,is.reads0 := TRUE] 102 | singles[,is.reads0 := TRUE] 103 | # only throw away variants with zero output counts if unclear where above lower_fitness_bound their fitness would be 104 | # only applies to doubles 105 | lower_read_cut = 200 106 | doubles[,is.reads0 := count_r1_t0 > 10 & (count_r1_t1 >= 1 | count_r1_t0 >= lower_read_cut)] 107 | 108 | # rearrange doubles data.table 109 | doubles = doubles[,.SD,,.SDcols = c("Pos1","Pos2","Mut1","Mut2","WT_AA1","WT_AA2", 110 | "count_r1_t0","count_r1_t1","is.fitness","is.reads0", 111 | "fitness1","fitness2","sigma1","sigma2", 112 | "fitness","sigma")] 113 | 114 | # save doubles data.table 115 | write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE_RD",read_downsampling[RD],".txt"), 116 | quote = F,row.names = F, col.names = T) 117 | } 118 | 119 | 120 | 121 | ############################ 122 | ##### doped dataset ####### 123 | ############################ 124 | # only allow AA mutations 1nt hamming distance away from coding sequence 125 | # load a codon<>AA conversion table 126 | AA_codon_conversion = read.table(paste0(dataset_dir,"dataset/amino_acid_codon_conversion.txt"),sep = "\t",header = T) 127 | # load the coding sequence of G protein B1 domain 128 | Gprot_nuc_seq = read.fasta(paste0(dataset_dir,"dataset/GB1_CDS_nt")) 129 | ## 229:282 corresponds to 3:56 in GB1_Olson AA sequence 130 | # seqinr::translate(Gprot_nuc_seq$`M13825.1:578-1924`)[227:282] 131 | # seqinr::translate(Gprot_nuc_seq$`M13825.1:578-1924`[(227*3-2):(282*3)]) 
acgt = c("a","c","g","t")
# codons 227..282 of the CDS correspond to GB1 residues 1..56 of the Olson dataset
GB1_nuc_seq = Gprot_nuc_seq$`M13825.1:578-1924`[(227*3-2):(282*3)]

#fix position 2 to Q/Glutamine, Codon CAA
GB1_nuc_seq[4:6] = c("c","a","a")
# enumerate all single-nucleotide substitutions: 55 mutated residues x 3 codon positions x 4 nucleotides
GB1_doped_mut = data.table(Pos = rep(0,55*12), WTaa = rep("",55*12), Mut = rep("",55*12))
for (aa_pos in 2:56) {
  for (nt_pos in 1:3) {
    for (nt in 1:4) {
      GB1_doped_mut[(aa_pos-2)*12 + (nt_pos-1)*4 + nt,Pos:= aa_pos]
      # wild-type codon for this residue
      wt = GB1_nuc_seq[((aa_pos-1)*3+1) : ((aa_pos-1)*3+3)]
      GB1_doped_mut[(aa_pos-2)*12 + (nt_pos-1)*4 + nt,WTaa := seqinr::translate(wt)]
      # substitute one nucleotide within the codon and translate
      mutated = wt
      mutated[nt_pos] = acgt[nt]
      GB1_doped_mut[(aa_pos-2)*12 + (nt_pos-1)*4 + nt,Mut := seqinr::translate(mutated)]
    }
  }
}
print(paste("possible NT1 mutations:",nrow(GB1_doped_mut)))
#get rid of duplicate mutations
GB1_doped_mut = unique(GB1_doped_mut)
#get rid of mutations that don't change codons and those that give stop codons
GB1_doped_mut = GB1_doped_mut[WTaa != Mut & Mut != "*"]
print(paste("unique possible NT1 mutations, w/o PTC:",nrow(GB1_doped_mut)))
print(paste0("fraction unique possible NNS mutations (",55*19,"): ",round(nrow(GB1_doped_mut)/55/19*100,digits=1),"%"))
GB1_doped_mut[,PosMut := paste0(Pos,Mut)]

#create doped double data.tables by keeping only variant pairs where both
#mutations are reachable by a single nucleotide change
#full dataset
doubles = fread("GB1/processed_data/DMS_doubles_preE.txt")
doubles[,':=' (PosMut1 = paste0(Pos1,Mut1),PosMut2 = paste0(Pos2,Mut2))]
doubles = doubles[PosMut1 %in% GB1_doped_mut$PosMut & PosMut2 %in% GB1_doped_mut$PosMut]
write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE_doped.txt"),
            quote = F,row.names = F, col.names = T)

#from read-downsampled versions
for (RD in read_downsampling) {
  doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_preE_RD",RD,".txt"))
  doubles[,':=' (PosMut1 = paste0(Pos1,Mut1),PosMut2 = paste0(Pos2,Mut2))]
  doubles = doubles[PosMut1 %in% GB1_doped_mut$PosMut & PosMut2 %in% GB1_doped_mut$PosMut]
  write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE_doped_RD",RD,".txt"),
              quote = F,row.names = F, col.names = T)
}

##################################################################
### calculate epistasis null model and call pos./neg.epistasis ###
##################################################################

#read all downsampled data files
double_files = list.files(path = paste0(dataset_dir,"processed_data/"))[grep("DMS_doubles_preE_",list.files(path = paste0(dataset_dir,"processed_data/")))]
ID = sapply(double_files,FUN=function(X){gsub(".txt","",gsub("DMS_doubles_preE_","",X))})

for (idx in seq_along(ID)) {
  doubles = fread(paste0(dataset_dir,"processed_data/",double_files[idx]))
  doubles2 = copy(doubles[,.SD,,.SDcols = c(1:16)])
  #lower bound of fitness: median fitness of doubles whose expected additive
  #fitness (fitness1 + fitness2) is far below the detection limit
  lower_bound_F = doubles2[is.fitness == T & is.reads0 == T & fitness1 + fitness2 < -8,median(fitness,na.rm=T)]
  #call epistatic interactions
  doubles = call_epistasis_binary(doubles,
                                  lower_bound_F,
                                  dataset_dir = dataset_dir,
                                  output_filename = paste0("DMS_doubles_",ID[idx],".txt"),
                                  prefix = paste0("GB1_",ID[idx]))
}


#############################################
### calculate pairwise interaction scores ###
#############################################

double_files = list.files(path = paste0(dataset_dir,"processed_data/"))[grep("DMS_doubles_preE_",list.files(path = paste0(dataset_dir,"processed_data/")))]
ID = sapply(double_files,FUN=function(X){gsub(".txt","",gsub("DMS_doubles_preE_","",X))})
for (idx in seq_along(ID)) {
  doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_",ID[idx],".txt"))
  PWI = calculate_pairwise_interaction_scores(doubles,
                                              dataset_dir = dataset_dir,
                                              output_filename = paste0("DMS_PWI_",ID[idx],".txt"),
                                              detailed = F)
}

#### assemble all the combined scores from all PWI data.tables into one
PWI_complete = fread("GB1/processed_data/DMS_PWI.txt")[,.(Pos1,Pos2,WT_AA1,WT_AA2,combined_score)]
for (idx in seq_along(ID)) {
  PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_",ID[idx],".txt"))
  PWI_complete = merge(PWI_complete,
                       PWI[,.(Pos1,Pos2,V1=combined_score)],
                       by=c("Pos1","Pos2"),all=T)
  # the merged column comes in as V1; rename it after its source dataset
  names(PWI_complete)[ncol(PWI_complete)] = paste0("combined_score_",ID[idx])
}
write.table(PWI_complete,file=paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"),
            row.names=F,quote = F)


#########################################
### deep contact transform PWI scores ###
#########################################

prefix = "GB1_"

PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"))
PWI_transformed = deepcontact_transform_basic2d(PWI[,-grep("WT_AA",names(PWI)),with=F],
                                                dataset_dir = dataset_dir,
                                                output_filename = "DMS_PWI_complete_deepcontact.txt",
                                                deepcontact_dir = "where/is/deepcontact/",
                                                prefix = prefix)




###########################################################
######## predict secondary structure from PWI data ########
###########################################################

PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"))
#### predict secondary structure
predict_secondary_structure_elements(PWI,
                                     dataset_dir = dataset_dir,
                                     prefix = prefix,
                                     known_SS = "GB1/processed_data/PDB_secondary_structure_1pga_A.txt")

#### predict beta sheets
predict_beta_sheets(PWI,
                    input_ss0 = fread(paste0(dataset_dir,"processed_data/",prefix,"_secondary_structure_prediction.txt")),
                    dataset_dir = dataset_dir,
                    prefix = prefix,
                    known_ss_file = "GB1/processed_data/PDB_secondary_structure_1pga_A.txt",
                    known_bsi_file = "GB1/processed_data/PDB_beta_sheet_hbonds_1pga.txt")




##########################################################################################
##### evaluate predicted contacts (top scoring pairs) against reference structure ########
##########################################################################################

contactmap = fread("GB1/processed_data/PDB_contactmap_1pga_A.txt")

#### true positive rate of top contacts + contactmaps + eCDFs
evaluate_contacts_vs_PDB(contacts = PWI,
                         contactmap = contactmap[,.(Pos1,Pos2,scHAmin)],
                         secondary_structure=NA,
                         dataset_dir = dataset_dir,
                         lindist=5,
                         prefix = prefix)

#### minimal number of edges connecting top contacts versus all position pairs
# (function name typo "contatcs" matches the helper script's name — do not "fix" the call alone)
evaluate_contatcs_minimaledges(PWI,
                               contactmap = contactmap[,.(Pos1,Pos2,scHAmin)],
                               dataset_dir = dataset_dir,
                               prefix = prefix,
                               lindist=5,
                               N_contacts = 1,
                               dist_cutoff = 8)

#### interaction scores versus distance in reference structure
score_vs_distance_scatter(contacts = PWI,
                          contactmap = contactmap[,.(Pos1,Pos2,scHAmin)],
                          dataset_dir = dataset_dir,
                          prefix = prefix)


#########################################
##### XPLOR structure prediction ########
#########################################

#use predicted tertiary contacts and secondary structure elements to generate structural models

#### use DMS data to derive restraints for tertiary contacts & use secondary structure restraints derived from PSIPRED predictions (no beta sheet pairing restraints)
# +compare to deepcontact transformed scores

PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"))
PWI_DC = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete_deepcontact.txt"))
# suffix all deepcontact score columns with "_DC" so they stay distinct after the merge
names(PWI_DC)[names(PWI_DC) %in% setdiff(names(PWI_DC),c("Pos1","Pos2","WT_AA1","WT_AA2"))] = paste0(setdiff(names(PWI_DC),c("Pos1","Pos2","WT_AA1","WT_AA2")),"_DC")
PWI2 = merge(PWI,PWI_DC,by=c("Pos1","Pos2"),all=T)

XPLOR_wrapper(input_PWI = PWI2,
              SS_mode = "SSonly",
              input_SS_file = "PSIPRED_secondary_structure.txt", ### copy this from GB1 folder into the dataset folder
              prefix = "GB1_ds_PSIPRED_SSonly_",
              dataset_dir = dataset_dir,
              cores = 15,
              queue = "short-sl7,long-sl7",
              protein_sequence = scan("GB1/dataset/GB1_sequence.fasta",what = "character")[2],
              pdb_file = "GB1/dataset/PDB/g_xray.pdb",
              home_dir = "/where/to/temporarily/build/folderstructure/",
              cluster_dir = "cluster/directory/for/XPLOR/",
              login_serveraddress = "mylogin@serveraddress.com",
              debug_this = F)

#####################################
##### evaluate XPLOR results ########
#####################################

#this will analyse the "..._variables_results.RData" files generated by the XPLOR simulations
# (copy these back to the dataset directory, e.g. to results/XPLOR/my_prefix_used/, which is the "XPLOR_dir" variable)
# the script will give basic outputs into a "results" sub-directory, such as how models performed across stages, the RMSD/TMscore of the best models etc

analyse_XPLOR_results(XPLOR_dir = paste0(dataset_dir,"results/XPLOR/GB1_ds_PSIPRED_SSonly_/"),
                      contactmap = fread("GB1/processed_data/contactmap_1pga_A.txt"),
                      draw_contactmaps = F)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Jörn Schmiedel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DMS2structure 2 | 3 | These are scripts used for the analysis of deep mutational scanning data in Schmiedel & Lehner, "Determining protein structures using deep mutagenesis", Nature Genetics (2019) (https://www.nature.com/articles/s41588-019-0431-x) 4 | 5 | The DATASET_pipeline.R scripts do the complete analysis for one dataset, the necessary data are already deposited in the [dataset_folder]/dataset/ folder, except for the WW domain, for which the sequencing data has to be downloaded and processed separately (as described in the pipeline script). 6 | 7 | 8 | ## required software 9 | To run these scripts, you will need 10 | R (version 3.4) 11 | DeepContact (can be cloned from https://github.com/largelymfs/deepcontact ) 12 | XPLOR-NIH (can be downloaded from https://nmr.cit.nih.gov/xplor-nih/ ) 13 | TMscore (can be downloaded from https://zhanglab.ccmb.med.umich.edu/TM-score/ ) 14 | 15 | ## required R packages 16 | data.table 17 | ggplot2 18 | cowplot 19 | GGally 20 | mgcv 21 | caTools 22 | parallel 23 | stringr 24 | gdata 25 | corpcor 26 | Rpdb 27 | pdist 28 | metap 29 | RColorBrewer 30 | ssh.utils 31 | seqinr 32 | optparse 33 | pheatmap 34 | -------------------------------------------------------------------------------- /RRM/dataset/PSIPRED/PAB1.psipass2: -------------------------------------------------------------------------------- 1 | # PSIPRED HFORMAT (PSIPRED V3.3) 2 | 3 | Conf: 965521289453511233322102761444557689999500246660566999999998 4 | Pred: CCEEEECCCCCCCCHHHHHHHHCCCCEEEEEEEECCCCCCEEEEEEEECCHHHHHHHHHH 5 | AA: GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDA 6 | 10 20 30 40 50 60 7 | 8 | 9 | Conf: 505200693667489 10 | Pred: HHCCCCCCCEEEECC 11 | AA: LNGMLLNGQEIYVAP 12 | 70 13 | 14 | 
-------------------------------------------------------------------------------- /RRM/dataset/PSIPRED/PAB1.psipred.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lehner-lab/DMS2structure/3c1976b78a743d4d4c73921f047648500e671c8e/RRM/dataset/PSIPRED/PAB1.psipred.pdf -------------------------------------------------------------------------------- /RRM/dataset/PSIPRED/PAB1.ss2: -------------------------------------------------------------------------------- 1 | # PSIPRED VFORMAT (PSIPRED V3.3) 2 | 3 | 1 G C 1.000 0.000 0.000 4 | 2 N C 0.801 0.001 0.178 5 | 3 I E 0.230 0.004 0.773 6 | 4 F E 0.211 0.005 0.727 7 | 5 I E 0.312 0.024 0.585 8 | 6 K E 0.382 0.076 0.537 9 | 7 N C 0.647 0.037 0.361 10 | 8 L C 0.891 0.071 0.065 11 | 9 H C 0.975 0.038 0.013 12 | 10 P C 0.778 0.325 0.021 13 | 11 D C 0.784 0.236 0.031 14 | 12 I C 0.688 0.317 0.033 15 | 13 D C 0.742 0.235 0.041 16 | 14 N C 0.554 0.398 0.058 17 | 15 K H 0.388 0.512 0.084 18 | 16 A H 0.307 0.581 0.065 19 | 17 L H 0.302 0.611 0.057 20 | 18 Y H 0.262 0.632 0.103 21 | 19 D H 0.287 0.629 0.092 22 | 20 T H 0.322 0.578 0.052 23 | 21 F H 0.341 0.565 0.031 24 | 22 S H 0.371 0.544 0.016 25 | 23 V C 0.475 0.450 0.021 26 | 24 F C 0.603 0.357 0.037 27 | 25 G C 0.854 0.128 0.035 28 | 26 D C 0.764 0.077 0.129 29 | 27 I E 0.409 0.041 0.560 30 | 28 L E 0.250 0.055 0.692 31 | 29 S E 0.261 0.045 0.669 32 | 30 S E 0.230 0.014 0.699 33 | 31 K E 0.205 0.009 0.720 34 | 32 I E 0.194 0.005 0.776 35 | 33 A E 0.152 0.003 0.853 36 | 34 T E 0.205 0.002 0.827 37 | 35 D C 0.929 0.002 0.101 38 | 36 E C 0.966 0.011 0.029 39 | 37 N C 0.982 0.008 0.016 40 | 38 G C 0.980 0.008 0.015 41 | 39 K C 0.959 0.012 0.035 42 | 40 S C 0.764 0.008 0.217 43 | 41 K E 0.433 0.010 0.524 44 | 42 G E 0.448 0.006 0.507 45 | 43 F E 0.336 0.009 0.581 46 | 44 G E 0.219 0.009 0.694 47 | 45 F E 0.134 0.010 0.823 48 | 46 V E 0.177 0.007 0.780 49 | 47 H E 0.174 0.007 0.780 50 | 48 F E 0.450 0.006 0.545 51 
| 49 E C 0.767 0.033 0.174 52 | 50 E C 0.807 0.172 0.049 53 | 51 E H 0.172 0.867 0.003 54 | 52 G H 0.061 0.971 0.001 55 | 53 A H 0.037 0.980 0.001 56 | 54 A H 0.017 0.987 0.000 57 | 55 K H 0.014 0.982 0.000 58 | 56 E H 0.017 0.971 0.000 59 | 57 A H 0.029 0.957 0.000 60 | 58 I H 0.035 0.957 0.000 61 | 59 D H 0.037 0.952 0.001 62 | 60 A H 0.089 0.895 0.003 63 | 61 L H 0.198 0.792 0.003 64 | 62 N H 0.463 0.509 0.010 65 | 63 G C 0.754 0.195 0.079 66 | 64 M C 0.544 0.137 0.314 67 | 65 L C 0.310 0.307 0.357 68 | 66 L C 0.418 0.187 0.346 69 | 67 N C 0.804 0.027 0.113 70 | 68 G C 0.966 0.005 0.023 71 | 69 Q C 0.672 0.002 0.319 72 | 70 E E 0.203 0.002 0.851 73 | 71 I E 0.166 0.001 0.850 74 | 72 Y E 0.141 0.002 0.856 75 | 73 V E 0.275 0.002 0.684 76 | 74 A C 0.908 0.003 0.085 77 | 75 P C 0.985 0.000 0.000 78 | -------------------------------------------------------------------------------- /RRM/dataset/RRM_domain_sequence.fasta: -------------------------------------------------------------------------------- 1 | >RRM domain 2 | GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAP -------------------------------------------------------------------------------- /RRM_pipeline.R: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ##### RRM domain from Melamed2013 ##### of PAB1 yeast gene 3 | ####################################### 4 | 5 | #this is the pipeline used to analyse the RRM domain data from Melamed et al. 
# 2013

#first, set the working directory to the DMS2structure folder
setwd("/where/is/DMS2structure/")


#source scripts
filelist = list.files('scripts/')
sapply(paste0('scripts/',filelist),source,.GlobalEnv)

#create the necessary subfolder structure for all results and processed data
dataset_dir = "RRM/"
create_directory_structure(dataset_dir)
#then save this script in the dataset_dir
#and paste all necessary source data into dataset_dir/dataset

#load required packages
require(data.table)
require(ggplot2)
require(cowplot)

#############################################################################################
##### preprocess data (calculate fitness scores and errors, set quality thresholds etc) #####
#############################################################################################

################# read data from Supplementary Table 5 from Melamed2013
dataset = fread(paste0(dataset_dir,"dataset/Supplementary_Table_5_doubles.txt"), sep = "\t",header = TRUE)

# extract position and amino acids from "position-AA" identifiers like "130-A"
dataset[,Pos1:=sapply(strsplit(dataset[,as.character(seqID_X)],"-"),FUN = function(X){as.integer(X[1])})]
dataset[,Mut1:=sapply(strsplit(dataset[,as.character(seqID_X)],"-"),FUN = function(X){X[2]})]

dataset[,Pos2:=sapply(strsplit(dataset[,as.character(seqID_Y)],"-"),FUN = function(X){as.integer(X[1])})]
dataset[,Mut2:=sapply(strsplit(dataset[,as.character(seqID_Y)],"-"),FUN = function(X){X[2]})]

# indicate library (the domain was mutagenized in three separate sections/libraries)
dataset[Pos1<=150,section := 1]
dataset[between(Pos1,151,175),section := 2]
dataset[Pos1>175,section := 3]


#investigate dependency on input counts
ggplot(dataset,aes(Input_reads,log(XY_Enrichment_score))) +
  geom_hex() +
  scale_x_log10() +
  scale_fill_gradient(trans="log",breaks=c(1,10,100)) +
  facet_wrap(~section)
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_Inputreads_fitness.pdf"),width=8,height=3)

### should treat libraries separately, develop independent error estimates
### output count should be Input_reads * XY_enrichment_score
#define abbrev. variables
dataset = dataset[,.(Pos1,Pos2,Mut1,Mut2,section,dist = Physical_distance,
                     count_e1_s0 = Input_reads,count_e1_s1 = Input_reads * X_Enrichment_score,
                     fitness1=log(X_Enrichment_score),fitness2=log(Y_Enrichment_score),fitness=log(XY_Enrichment_score))]

# calculate Poissonian error from input and (reconstructed) output counts
dataset[,sigma := sqrt(1/count_e1_s0 + 1/count_e1_s1)]

ggplot(dataset,aes(fitness,sigma)) +
  geom_hex() +
  scale_y_log10() +
  scale_fill_continuous(trans = "log10") +
  facet_grid( ~ section)
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_fitness_sigma.pdf"),width=8,height=3)


# define STOP variants (premature termination in either position)
dataset[,STOP := Mut1 == "*" | Mut2 == "*"]

# plot fitness distribution + STOPs
ggplot(dataset,aes(fitness,color=factor(section),linetype=STOP)) +
  geom_density()
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_fitness_distribution.pdf"),width=5,height=3)
# >> all libraries are different

#estimate lower fitness bounds from STOP variants
#(inverse-variance weighted mean of STOP-variant fitness, per section)
dataset[,lower_bound_F := weighted.mean(x = dataset[section == unique(unlist(.SD)) & STOP == TRUE,fitness],
                                        w = dataset[section == unique(unlist(.SD)) & STOP == TRUE,1/sigma^2]),
        section,.SDcols = "section"]
unique(dataset[,.(lower_bound_F,section)][order(section)])

## mark STOP variants and variants with enough reads (all)
dataset[,is.fitness := !STOP]
dataset[,is.reads0 := T]

#give dummy sigmas for single mutants (no read counts for singles available)
dataset[,sigma1 := 0.01]
dataset[,sigma2 := 0.01]

#correct Positions (shift from PAB1 full-protein numbering to domain numbering)
dataset[,Pos1 := Pos1 - 125]
dataset[,Pos2 := Pos2 - 125]

#add WT_AA from the domain sequence fasta
WT_aaseq = scan(paste0(dataset_dir,"dataset/RRM_domain_sequence.fasta"),what="character",sep="\n")[2]
WT_aaseq_split = data.table(Pos = 1:75,WT_AA = strsplit(WT_aaseq,"")[[1]])

dataset[,WT_AA1 := WT_aaseq_split[Pos == Pos1,WT_AA],Pos1]
dataset[,WT_AA2 := WT_aaseq_split[Pos == Pos2,WT_AA],Pos2]


# reorder double table
doubles = dataset[,.(Pos1,Pos2,WT_AA1,WT_AA2,Mut1,Mut2,
                     section,count_e1_s0,count_e1_s1,STOP,is.fitness,is.reads0,
                     fitness,sigma,fitness1,sigma1,fitness2,sigma2,lower_bound_F)]

#### save data.tables
write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE.txt"),
            quote = F,row.names = F, col.names = T)


### also save extracted pair-distances from dataset table
ggplot(unique(dataset[,.(Pos1,Pos2,dist,section)]),aes(Pos1,Pos2,fill=dist<8)) +
  geom_raster() +
  scale_fill_manual(values=c("grey95","grey25"),na.value = "red")
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_contactmap_Melamed2013.pdf"),width=6,height=5)

#only section 2 has a decent number of off-diagonal contacts
# !!! this is missing position 14 of the second section (position 39 in absolute terms)

write.table(x = unique(dataset[section == 1,.(Pos1,Pos2,WT_AA1,WT_AA2,dist,section)]), file = paste0(dataset_dir,"processed_data/contactmap_RRM_sec1.txt"),
            quote = F,row.names = F, col.names = T)

write.table(x = unique(dataset[section == 2 & !is.na(dist),.(Pos1 = Pos1-25,Pos2 = Pos2-25,WT_AA1,WT_AA2,dist,section)]), file = paste0(dataset_dir,"processed_data/contactmap_RRM_sec2.txt"),
            quote = F,row.names = F, col.names = T)

write.table(x = unique(dataset[section == 3,.(Pos1 = Pos1-50,Pos2 = Pos2-50,WT_AA1,WT_AA2,dist,section)]), file = paste0(dataset_dir,"processed_data/contactmap_RRM_sec3.txt"),
            quote = F,row.names = F, col.names = T)



##################################################################
### calculate epistasis null model and call pos./neg.epistasis ###
##################################################################

## given that only section 2 has off-diagonal contacts, focus on this
doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_preE.txt"))
doubles2 = copy(doubles[section==2])
# re-index section 2 positions to start at 1
doubles2[,':=' (Pos1 = Pos1-25,Pos2 = Pos2-25)]
doubles = call_epistasis_binary(doubles2,
                                unique(doubles2$lower_bound_F),
                                dataset_dir = dataset_dir,
                                prefix = "RRM_sec2_",
                                output_filename = "DMS_doubles_sec2.txt",
                                epistasis_error_from_slopes = F)

#############################################
### calculate pairwise interaction scores ###
#############################################

doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_sec2.txt"))
PWI = calculate_pairwise_interaction_scores(doubles,
                                            dataset_dir = dataset_dir,
                                            output_filename = "DMS_PWI_sec2.txt",
                                            detailed = F)


#########################################
### deep contact
transform PWI scores ### 165 | ######################################### 166 | 167 | PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_sec2.txt")) 168 | PWI_transformed = deepcontact_transform_basic2d(PWI, 169 | dataset_dir = dataset_dir, 170 | deepcontact_dir = "where/is/deepcontact/", 171 | output_filename = "DMS_PWI_sec2_deepcontact.txt", 172 | prefix = "RRM_sec2_") 173 | 174 | ### negative controls for DeepContact 175 | # 3x permutate combined_scores, while keeping matrix symmetry 176 | set.seed(1603) 177 | PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_sec2.txt"))[Pos11: 199 | return False 200 | if potList['CDIH'].violations()>0: 201 | return False 202 | if potList['BOND'].violations()>0: 203 | return False 204 | if potList['ANGL'].violations()>0: 205 | return False 206 | if potList['IMPR'].violations()>1: 207 | return False 208 | 209 | return True 210 | 211 | def calcOneStructure(loopInfo): 212 | """ this function calculates a single structure, performs analysis on the 213 | structure, and then writes out a pdb file, with remarks. 214 | """ 215 | 216 | # initialize parameters for high temp dynamics. 217 | InitialParams( rampedParams ) 218 | # high-temp dynamics setup - only need to specify parameters which 219 | # differfrom initial values in rampedParams 220 | InitialParams( highTempParams ) 221 | 222 | # high temp dynamics 223 | # 224 | protocol.initDynamics(dyn, 225 | potList=potList, # potential terms to use 226 | bathTemp=init_t, 227 | initVelocities=1, 228 | finalTime=10, # stops at 10ps or 5000 steps 229 | numSteps=5000, # whichever comes first 230 | printInterval=100) 231 | 232 | dyn.setETolerance( init_t/100 ) #used to det. stepsize. 
default: t/1000 233 | dyn.run() 234 | 235 | # initialize parameters for cooling loop 236 | InitialParams( rampedParams ) 237 | 238 | 239 | # initialize integrator for simulated annealing 240 | # 241 | protocol.initDynamics(dyn, 242 | potList=potList, 243 | numSteps=100, #at each temp: 100 steps or 244 | finalTime=.2 , # .2ps, whichever is less 245 | printInterval=100) 246 | 247 | # perform simulated annealing 248 | # 249 | cool.run() 250 | 251 | 252 | # final torsion angle minimization 253 | # 254 | protocol.initMinimize(dyn, 255 | printInterval=50) 256 | dyn.run() 257 | 258 | # final all- atom minimization 259 | # 260 | protocol.initMinimize(minc, 261 | potList=potList, 262 | dEPred=10) 263 | minc.run() 264 | 265 | #do analysis and write structure when this function returns 266 | pass 267 | 268 | 269 | 270 | from simulationTools import StructureLoop, FinalParams 271 | StructureLoop(numStructures=numberOfStructures, 272 | structLoopAction=calcOneStructure, 273 | calcMissingStructs=True, #calculate only missing structures 274 | doWriteStructures=True, #analyze and write coords after calc 275 | pdbTemplate=outFilename, 276 | genViolationStats=True, 277 | averagePotList=potList, 278 | #averageCrossTerms=refRMSD, 279 | averageTopFraction=0.1, #report only on best 50% of structs 280 | averageAccept=accept, #only use structures which pass accept() 281 | averageContext=FinalParams(rampedParams), 282 | averageFilename="contacts55perfect_diheSSpsipred/SCRIPT_ave.pdb", #generate regularized ave structure 283 | averageFitSel="name CA", 284 | averageCompSel="not resname ANI and not name H*" ).run() 285 | 286 | -------------------------------------------------------------------------------- /scripts/XPLOR_wrapper.R: -------------------------------------------------------------------------------- 1 | 2 | XPLOR_wrapper = function(input_PWI, 3 | SS_mode = "SSonly", 4 | input_SS_file, 5 | L = c(0.5,1,1.5,2), 6 | predictorXlength = c(), 7 | prefix, 8 | dataset_dir, 9 | 
protein_sequence, 10 | pdb_file, 11 | cores = 12, 12 | queue = "long-sl7,short-sl7", 13 | numberOfStructures = c(500,500,500), 14 | top_avg_fraction = c(0.1,0.1,0.1), 15 | NOE_pot_soft = c(TRUE,TRUE,FALSE), 16 | home_dir, 17 | cluster_dir, 18 | reporting_email = NA, 19 | login_serveraddress, 20 | debug_this = F, 21 | linear_dist = 5, 22 | dist_restraint = 8) { 23 | 24 | #these scripts are multi-level wrapper functions to run XPLOR-NIH on a compute cluster 25 | #they locally create/modify the necessary scripts and folder structure, and scp this to the cluster, then remotely execute parallel jobs 26 | #this might take quite some work to adapt to other working environments 27 | 28 | ### variables 29 | # input_PWI: pairwise interaction score data.table; this should have Pos1, Pos2, WT_AA1 and WT_AA2 columns + all interscores that restraints should be derived from 30 | # SS_mode: - either "SSonly", in which case it only uses predicted secondary structure elements (from DMS data or e.g. PSIPRED) for restraints 31 | # - or "SSsheets", in which it also derives restraints for beta sheet pairing hbonding 32 | # input_SS_file: secondary structure element input file, 33 | # - if SS_mode == "SSonly" this is a table with position columns and either ONE ss index column to be used for all given interaction scores, or a SS index column per interaction score (with the interaction score as column name) 34 | # - if SS_mode == "SSsheets" this needs to be a secondary structure element RData file that also contains the beta sheet pairing table 35 | # L: number of top contacts * protein length used for tertiary contact restraints, can be a vector, a simulation per score x L combination will be started [additionally, if there is a "control" score, the script will create a negative control, i.e. 
L = 0] 36 | # predictorXlength: this can be a data.table with first column indicating interaction score, second column indicate L; this will rerun only the specific score:L combinations indicated instead of all scores versus all L + negative control 37 | # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/ 38 | # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc) 39 | # protein_sequence: amino acid sequence of the protein to be modeled, used by XPLOR 40 | # pdb_file (optional): reference structure pdb file to compare structural models to in terms of RMSD and template modeling score 41 | # cores: number of cores to request on the computing cluster, max is 16 42 | # queue: which queues to submit jobs to 43 | # numberOfStructures: how many structures to create in each of the three modeling stages 44 | # top_avg_fraction: fraction of top evaluated models (by XPLOR total energy) to use to decide restraint violations (stages 1+2) and calculate an average structural model (stage 2 as starting point for stage 3, and stage 3 as a final output) 45 | # NOE_pot_soft: for the three different stages; TRUE: use soft well potentials for distance restraints (if potentially many restraitns from false positive contacts) 46 | # home_dir: a directory on local machine in which a "tmp" folder is create to build the script/folder structure locally then copy to cluster; this is removed at the end of the script; could be "/Users/me/" 47 | # cluster_dir: (main) directory on the compute cluster into which the folder structure should be copied and where the simulations are executed 48 | # reporting_email: if not NA: email to report to about job status for qsub system 49 | # login_serveraddress: mylogin@serveraddress.com 50 | # debug_this: if TRUE, script will stop at certain points for debugging 51 | # linear_dist: minimum linear (residue) distance for defining top predicted contacts 52 | # 
dist_restraint: distance restraint (in Angstrom) for top predicted contacts 53 | 54 | require(data.table) 55 | require(ssh.utils) 56 | #load utility scripts 57 | filelist = list.files('scripts/') 58 | sapply(paste0('scripts/',filelist),source,.GlobalEnv) 59 | 60 | #create tmp directory for data structure 61 | system(command = paste0("mkdir ",home_dir,"tmp"),wait=T) 62 | system(command = paste0("mkdir ",home_dir,"tmp/",prefix),wait=T) 63 | 64 | #which interaction score to use as predictors? 65 | predictor = setdiff(names(input_PWI),c("Pos1","Pos2","WT_AA1","WT_AA2")) 66 | #create data.table with predictor x L combinations 67 | if (length(predictorXlength) == 0) { 68 | predictorXlength = data.table(expand.grid(predictor,L)) 69 | names(predictorXlength) = c("predictor","L") 70 | if (length(grep("control",predictor)) > 0) { #if preditors include controls, also create a negative control without tertiary distance restraints 71 | predictorXlength = rbind(predictorXlength,data.table(predictor = "control",L = 0)) 72 | } 73 | } else { #the table was supplied with specific combinations 74 | names(predictorXlength) = c("predictor","L") 75 | } 76 | predictorXlength[,predictor := as.character(predictor)] 77 | 78 | if (debug_this) {browser()} 79 | 80 | #set up list structure for variables and results 81 | varlist = list() 82 | varlist$cores = cores 83 | varlist$protein = prefix 84 | varlist$cluster_dir = cluster_dir 85 | varlist$numberOfStructures = numberOfStructures 86 | varlist$top_avg_fraction = top_avg_fraction 87 | varlist$NOE_pot_soft = NOE_pot_soft 88 | varlist$filename = c("anneal_stage1","anneal_stage2","refine") 89 | varlist$contacts_noe_file = "NOE_restraints" 90 | varlist$ss_dihe_file = "DIHE_restraints" 91 | 92 | 93 | varlist$protein_length = nchar(protein_sequence) 94 | ### define .seq file for XPLOR, 3letter AA names 95 | write(x = paste(toupper(sapply(strsplit(protein_sequence,"")[[1]],convert_AAabr_one_three)),collapse=" "), 96 | 
file=paste0(home_dir,"tmp/",prefix,"/protein.seq")) 97 | varlist$protein_seq = paste0(cluster_dir,prefix,"/protein.seq") #replace by fasta seq, then create .seq file and copy 98 | 99 | #copy pdb template file 100 | system(command = paste0("cp ",pdb_file," ",home_dir,"tmp/",prefix,"/template.pdb"),wait=T) 101 | varlist$pdb_file = paste0(cluster_dir,prefix,"/template.pdb") 102 | 103 | 104 | ### per combination of predictor and L, create restraint files etc 105 | for (idx in 1:nrow(predictorXlength)) { 106 | varlist$predictor = predictorXlength$predictor[idx] 107 | varlist$L = predictorXlength$L[idx] 108 | varlist$folder = paste0(cluster_dir,prefix,"/",varlist$predictor,"/L",varlist$L,"/") 109 | 110 | 111 | ### initiate lists 112 | #for restraints 113 | varlist$NOE_DT = list() 114 | varlist$NOE_DT[[1]] = data.table(weight = as.numeric(),Pos1 = as.numeric(), Pos2 = as.numeric(), 115 | atom1 = as.character(), atom2 = as.character(), 116 | dist = as.numeric(), lower_dist = as.numeric(), upper_dist = as.numeric(), 117 | Pos1_opt2 = as.numeric(), Pos2_opt2 = as.numeric(), 118 | atom1_opt2 = as.character(),atom2_opt2 = as.character(),type = as.character()) 119 | varlist$DIHE_DT = list() 120 | varlist$DIHE_DT[[1]] = data.table(weight = as.numeric(),position = as.numeric(),angle=as.numeric(), 121 | delta_angle=as.numeric(),type = as.character(),ss = as.character()) 122 | #for evaluation 123 | varlist$energy_XPLOR = list() 124 | varlist$violations = list() 125 | 126 | ##### secondary structure elements 127 | if (SS_mode == "SSonly") { 128 | ### only secondary structure 129 | if (varlist$predictor == "control") { 130 | files = list.files(path = paste0(dataset_dir,"processed_data/")) 131 | load(paste0(dataset_dir,"processed_data/",files[grep("secondary_structure_elements_control.*RData",files)])) 132 | input_SS = ss_elements 133 | } else { 134 | input_SS = fread(paste0(dataset_dir,"processed_data/",input_SS_file)) 135 | } 136 | ### define secondary structure restraints 137 | if 
(ncol(input_SS) > 2){ 138 | varlist$SS_pred = input_SS[,.(position = Pos,ss = .SD),,.SDcols = varlist$predictor] 139 | } else if (ncol(input_SS) == 2) { 140 | varlist$SS_pred = input_SS[,.(position = Pos,ss = .SD),,.SDcols = 2] 141 | } else { 142 | print("number secondary structure inputs doesn't match the number of input features") 143 | secondary_structure = error 144 | } 145 | varlist$SS_pred[,rleidx := rleid(ss)] 146 | } else if (SS_mode == "SSsheets") { ## also include beta sheet hbonding 147 | 148 | ### secondary structure and beta sheet hbonding 149 | if (varlist$predictor == "control") { 150 | files = list.files(path = paste0(dataset_dir,"processed_data/")) 151 | load(paste0(dataset_dir,"processed_data/",files[grep("secondary_structure_elements_control.*RData",files)])) 152 | beta_hbonds = hbonds_from_betasheetpairing(beta_sheet_pairing) 153 | } else { 154 | load(paste0(dataset_dir,"processed_data/",input_SS_file,"secondary_structure_elements_",varlist$predictor,".RData")) 155 | beta_hbonds = hbonds_from_betasheetpairing(beta_sheet_pairing,ss_data) 156 | } 157 | # secondary structure 158 | varlist$SS_pred = ss_elements 159 | names(varlist$SS_pred) = c("position","ss") 160 | varlist$SS_pred[,rleidx := rleid(ss)] 161 | # write beta sheet hbonding to restraints list 162 | if (nrow(beta_hbonds)>0) { 163 | varlist$NOE_DT[[1]] = rbind(varlist$NOE_DT[[1]], data.table(weight=1, Pos1=beta_hbonds$hn_opt1,Pos2=beta_hbonds$o_opt1, atom1="hn", atom2="o", 164 | dist=2,lower_dist=0.2, upper_dist = 0.1, 165 | Pos1_opt2=beta_hbonds$hn_opt2,Pos2_opt2=beta_hbonds$o_opt2, 166 | atom1_opt2="hn", atom2_opt2="o",type = "beta sheet hbond")) 167 | #XPLOR doesn't create an O for last position when using seq2PSF 168 | varlist$NOE_DT[[1]] = varlist$NOE_DT[[1]][(is.na(Pos2_opt2) & !(Pos2 == varlist$protein_length)) | 169 | (!is.na(Pos2_opt2) & !(Pos2 == varlist$protein_length | Pos2_opt2 == varlist$protein_length))] 170 | } 171 | } 172 | 173 | ##### write phi/psi dihedral angle 
restraints for secondary structure 174 | # angle values +- delta for PSI and PHI angles in secondary structure elements 175 | # these values are taken from Table1 of "CONFOLD: Residue-residue contact-guided ab initio protein folding", Adhikari et al. 2015 176 | #alpha 177 | phiH=-63.5 178 | dphiH=4.5 179 | psiH=-41.5 180 | dpsiH=5 181 | #beta 182 | phiE=-118 183 | dphiE=10.7 184 | psiE=134 185 | dpsiE=8.6 186 | 187 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="E" & position > 1, 188 | .(weight=1,position,angle = phiE,delta_angle = dphiE,type="phi",ss="beta")]) 189 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="E" & position < varlist$protein_length, 190 | .(weight=1,position,angle = psiE,delta_angle = dpsiE,type="psi",ss="beta")]) 191 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="H" & position > 1, 192 | .(weight=1,position,angle = phiH,delta_angle = dphiH,type="phi",ss="alpha")]) 193 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="H" & position < varlist$protein_length, 194 | .(weight=1,position,angle = psiH,delta_angle = dpsiH,type="psi",ss="alpha")]) 195 | 196 | # if (debug_this) {browser()} 197 | #write O-O restraints in beta strands 198 | pos = varlist$SS_pred[,.(ss,next_E = varlist$SS_pred[position == (unlist(.SD)+1),ss=="E"]), 199 | position,.SDcols = "position"][ss=="E" & next_E & position < varlist$protein_length-1,position] 200 | if (length(pos) > 0) { 201 | varlist$NOE_DT[[1]] = rbind(varlist$NOE_DT[[1]], data.table(weight=1,Pos1=pos,Pos2=pos+1,atom1 = "o", atom2 = "o", 202 | dist = 4.5,lower_dist=0.1, upper_dist=0.1, 203 | Pos1_opt2 = NA, Pos2_opt2 = NA, atom1_opt2 = NA,atom2_opt2 = NA, type = "beta strand")) 204 | } 205 | 206 | ###### define restraints from top predicted contacts (Cbeta distances) ###### 207 | # get scores for predictor (only those > linear_dist apart in linear sequence) 208 | PWI = input_PWI[Pos1 < 
##########################################################################
####### plot marginal distribution of the number of variants suitable ####
####### for epistasis classification over all position pairs #############
##########################################################################
epistasis_analytics_NumEvars_marginal = function(doubles,
                                                 dataset_dir,
                                                 prefix = "",
                                                 modus = "cis") {
  ### variables
  # doubles: doubles data.table
  # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/
  # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc)
  # modus: "cis" (single protein) or "trans" (protein-protein interaction)

  theme_set(theme_classic())

  # in cis, symmetrize the doubles table so every position pair is represented in both orientations;
  # in trans, the orientation is meaningful, so just copy the filtered table
  if (modus == "cis") {
    doubles_sym = switch_double_DT(doubles[is.reads0==T & is.fitness == T],
                                   list(c("Pos1","Pos2"),c("fitness1","fitness2")),
                                   c("pos_epistasis","neg_epistasis"))
  } else {
    doubles_sym = copy(doubles[is.reads0==T & is.fitness == T])
  }

  # per position pair: total number of variants and the counts usable
  # for positive / negative epistasis classification
  DT_numbervars = doubles_sym[,.(num_all = .N,
                                 num_posE = sum(pos_epistasis==T),
                                 num_negE = sum(neg_epistasis==T)),
                              .(Pos1,Pos2)]

  # long format for overlaying the three count distributions in one density plot
  DT_numbervars_melt = melt(DT_numbervars,
                            id.vars = "Pos1",
                            measure.vars = c("num_all","num_posE","num_negE"))

  ggplot(DT_numbervars_melt,aes(value,color=variable,..count..)) +
    geom_density(adjust=0.5) +
    scale_color_manual(breaks=c("num_all","num_posE","num_negE"),values = c("black","red","gold"),
                       labels = c(paste0("all, = ",DT_numbervars[,round(mean(num_all))]),
                                  paste0("pos.E, = ",DT_numbervars[,round(mean(num_posE))]),
                                  paste0("neg.E, = ",DT_numbervars[,round(mean(num_negE))]))) +
    scale_x_continuous(limits = c(0,361),breaks = seq(0,350,50),expand = c(0,0)) +
    scale_y_continuous(breaks = seq(0,350,50),expand = c(0,0)) +
    labs(x="number of double mutants per position pair",y="density [a.u.]",
         color = "data subset")
  ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"number_epistatic_variants.pdf"),width=5,height=3)
}

####################################################################
####### number of variants suitable for epistasis classification ###
####### versus single mutant fitness ##############################
epistasis_analytics_NumEvars_fitness = function(doubles,
                                                dataset_dir,
                                                prefix = "",
                                                modus = "cis") {
  ### variables
  # doubles: doubles data.table
  # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/
  # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc)
  # modus: "cis" (single protein) or "trans" (protein-protein interaction)

  theme_set(theme_classic(base_size = 9))

  if (modus == "cis") {
    # symmetrize so each position appears in Pos1; median single-mutant fitness per pair comes from fitness1
    doubles_sym = switch_double_DT(doubles[is.reads0==T & is.fitness == T],
                                   list(c("Pos1","Pos2"),c("fitness1","fitness2")),
                                   c("pos_epistasis","neg_epistasis"))
    DT_numbervars = doubles_sym[,.(num_all = .N,
                                   num_posE = sum(pos_epistasis==T),
                                   num_negE = sum(neg_epistasis==T),
                                   median_fitness = median(fitness1,na.rm=T),
                                   variants = "all"),
                                .(Pos1,Pos2)]

    ##### fitness versus number of variants, aggregated per position
    DT_mN_mF = DT_numbervars[,.(median_N = median(num_all),median_fitness = mean(median_fitness),variants = "all"),Pos1]
    DT_mN_mF = rbind(DT_mN_mF,DT_numbervars[,.(median_N = median(num_posE),median_fitness = mean(median_fitness),variants = "posE"),Pos1])
    DT_mN_mF = rbind(DT_mN_mF,DT_numbervars[,.(median_N = median(num_negE),median_fitness = mean(median_fitness),variants = "negE"),Pos1])
    DT_mN_mF[,variants := factor(variants,levels = c("all","posE","negE"))]

    ggplot(DT_mN_mF,aes(median_fitness,median_N,color = variants)) +
      geom_point() +
      scale_color_manual(values = c("black","red","gold")) +
      geom_smooth(se=F) +
      labs(x = "median single mutant fitness at position",
           y = "median # of double mutants in pairs involving position")
    ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"fitness_vs_numberdoublemutants.pdf"),width=5,height=4)

  } else {
    # trans: the two proteins are distinct, so aggregate (and plot) each side separately
    DT_numbervars = doubles[is.reads0==T & is.fitness == T,
                            .(num_all = .N,
                              num_posE = sum(pos_epistasis==T),
                              num_negE = sum(neg_epistasis==T),
                              median_fitness1 = median(fitness1,na.rm=T),
                              median_fitness2 = median(fitness2,na.rm=T),
                              variants = "all"),
                            .(Pos1,Pos2)]

    ##### fitness versus number of variants, protein 1 side
    DT_mN_mF1 = DT_numbervars[,.(median_N = median(num_all),median_fitness = mean(median_fitness1),variants = "all"),Pos1]
    DT_mN_mF1 = rbind(DT_mN_mF1,DT_numbervars[,.(median_N = median(num_posE),median_fitness = mean(median_fitness1),variants = "posE"),Pos1])
    DT_mN_mF1 = rbind(DT_mN_mF1,DT_numbervars[,.(median_N = median(num_negE),median_fitness = mean(median_fitness1),variants = "negE"),Pos1])
    DT_mN_mF1[,variants := factor(variants,levels = c("all","posE","negE"))]

    ggplot(DT_mN_mF1,aes(median_fitness,median_N,color = variants)) +
      geom_point() +
      scale_color_manual(values = c("black","red","gold")) +
      geom_smooth(se=F) +
      labs(x = "median single mutant fitness at position",
           y = "median number of double mutants in position pairs involving position")
    ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"fitness_vs_numberdoublemutants_protein1.pdf"),width=5,height=4)

    ##### fitness versus number of variants, protein 2 side
    DT_mN_mF2 = DT_numbervars[,.(median_N = median(num_all),median_fitness = mean(median_fitness2),variants = "all"),Pos2]
    DT_mN_mF2 = rbind(DT_mN_mF2,DT_numbervars[,.(median_N = median(num_posE),median_fitness = mean(median_fitness2),variants = "posE"),Pos2])
    DT_mN_mF2 = rbind(DT_mN_mF2,DT_numbervars[,.(median_N = median(num_negE),median_fitness = mean(median_fitness2),variants = "negE"),Pos2])
    DT_mN_mF2[,variants := factor(variants,levels = c("all","posE","negE"))]

    ggplot(DT_mN_mF2,aes(median_fitness,median_N,color = variants)) +
      geom_point() +
      scale_color_manual(values = c("black","red","gold")) +
      geom_smooth(se=F) +
      labs(x = "median single mutant fitness at position",
           y = "median number of double mutants in position pairs involving position")
    ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"fitness_vs_numberdoublemutants_protein2.pdf"),width=5,height=4)
  }
}
(beta_sheet_pairing[i,type=="anti-par"]) { #anti-parallel sheet 13 | 14 | if (L %% 2 == 1) { #if uneven, decide which side to keep; 15 | 16 | if (length(ss_data) > 0) { #use the one wiht lower p_value 17 | p_start = ss_data[Pos1 == beta_sheet_pairing[i,pos1_min] & Pos2 == beta_sheet_pairing[i,pos2_max],beta_antipar_p] 18 | p_end = ss_data[Pos1 == beta_sheet_pairing[i,pos1_max] & Pos2 == beta_sheet_pairing[i,pos2_min],beta_antipar_p] 19 | if (p_start < p_end) { 20 | beta_sheet_pairing[i,':=' (pos1_max = pos1_max - 1, pos2_min = pos2_min + 1)] 21 | } else { 22 | beta_sheet_pairing[i,':=' (pos2_max = pos2_max - 1, pos1_min = pos1_min + 1)] 23 | } 24 | } else { #without additional info, keep lower position 25 | beta_sheet_pairing[i,':=' (pos2_max = pos2_max - 1, pos1_min = pos1_min + 1)] 26 | } 27 | L=L-1 28 | } 29 | beta_hbonds = rbind(beta_hbonds,beta_sheet_pairing[i,.(hn_opt1 = c(seq(pos1_min,pos1_max,2),seq(pos2_max,pos2_min,-2)), 30 | o_opt1 = c(seq(pos2_max,pos2_min,-2),seq(pos1_min,pos1_max,2)), 31 | hn_opt2 = c(seq(pos1_min+1,pos1_max,2),seq(pos2_max-1,pos2_min,-2)), 32 | o_opt2 = c(seq(pos2_max-1,pos2_min,-2),seq(pos1_min+1,pos1_max,2)), 33 | sheet=i,hbond=nrow(beta_hbonds)+1:L)]) 34 | 35 | } else { #parallel sheet 36 | beta_hbonds = rbind(beta_hbonds,beta_sheet_pairing[i,.(hn_opt1 = c(seq(pos2_min+1,pos2_max,2),seq(pos1_min+2,pos1_max,2)), 37 | o_opt1 = c(seq(pos1_min,pos1_max-1,2),seq(pos2_min+1,pos2_max-1,2)), 38 | hn_opt2 = c(seq(pos1_min+1,pos1_max,2),seq(pos2_min+2,pos2_max,2)), 39 | o_opt2 = c(seq(pos2_min,pos2_max-1,2),seq(pos1_min+1,pos1_max-1,2)), 40 | sheet=i,hbond=nrow(beta_hbonds)+1:(L-1))]) 41 | } 42 | } 43 | if (nrow(beta_sheet_pairing) > 1) { 44 | #create all possible combinations of sheets 45 | require(combinat) 46 | sheet_comb = list() 47 | for (i in 1:(nrow(beta_sheet_pairing)-1)) { 48 | x=combn(1:nrow(beta_sheet_pairing),i) 49 | sheet_comb = c(sheet_comb,split(x, rep(1:ncol(x), each = nrow(x)))) 50 | } 51 | #swap options until hbonding is 
################################################################################################
############### function for expanding seeds and finding most significant stretches ############
################################################################################################
## subfunctions to predict_secondary_structure / sheets functions
identify_expand_seeds = function(any_strand,seed_size = 3,p_threshold = 0.05,bridge_dist = 1,max_extension = 1) {

  # greedy seed-and-extend: repeatedly take the most significant seed window,
  # grow it while the combined p-value improves, then record/merge the strand
  idx=1
  any_strand_temp = copy(any_strand)
  any_strand[,':=' (strand = as.integer(NA), p_strand = as.numeric(NA))]
  while (min(any_strand_temp$p_seed,na.rm=T) < p_threshold) {

    # best remaining seed window and the positions it covers
    psum = any_strand_temp[,min(p_seed,na.rm=T)]
    positions = any_strand_temp[which.min(p_seed),seq(pos-(seed_size-1)/2,pos+(seed_size-1)/2,1)]

    # candidate extensions by one position at either end
    positions_down = c(positions,min(positions)-1)
    positions_up = c(positions,max(positions)+1)
    extension = 0
    while (extension <= max_extension) {

      # combined p-value (Fisher's method via sumlog) of each candidate extension
      psum_down = any_strand_temp[pos %in% positions_down & !is.na(p_ind),if(.N>1){sumlog(p_ind)$p}else{p_ind}]
      psum_up = any_strand_temp[pos %in% positions_up & !is.na(p_ind),if(.N>1){sumlog(p_ind)$p}else{p_ind}]
      if (psum_down < min(psum,psum_up)) {
        positions = positions_down
        psum = psum_down
        extension = 0
      } else if (psum_up < min(psum,psum_down)) {
        positions = positions_up
        psum = psum_up
        extension = 0
      } else {
        # neither direct extension helps; look one step further before giving up
        extension = extension+1
      }
      positions_down = c(positions_down,min(positions)-1-extension)
      positions_up = c(positions_up,max(positions)+1+extension)
    }

    # record the strand
    if (idx == 1) {
      any_strand[pos %in% positions,':=' (strand = idx, p_strand = psum)]
      idx=idx+1
    } else {
      # merge with adjacent strands if closer than bridge_dist
      closeby_strands = unique(any_strand[!is.na(strand)][,.(min_dist=min(abs(pos-positions)),strand),pos][min_dist<= bridge_dist,strand])
      if (length(closeby_strands) > 0) {
        if (length(closeby_strands) > 1) {
          any_strand[strand %in% closeby_strands,strand := min(closeby_strands)]
        }
        strand_idx = min(closeby_strands)
        any_strand[pos %in% positions,':=' (strand = strand_idx)]
        any_strand[strand == strand_idx, p_strand := sumlog(p_ind[!is.na(p_ind)])$p, strand]
      } else {
        # otherwise open a new strand
        strand_idx = idx
        any_strand[pos %in% positions,':=' (strand = strand_idx)]
        any_strand[strand == strand_idx, p_strand := sumlog(p_ind[!is.na(p_ind)])$p, strand]
        idx=idx+1
      }
    }

    # mark consumed positions so they cannot seed or extend again
    any_strand_temp[pos %in% positions,p_ind := NA]
    any_strand_temp[pos %in% positions,p_seed:=NA]
  }

  # renumber strands consecutively from 1
  strand_vec = unique(any_strand[!is.na(strand),strand])
  if (length(strand_vec) > 1) {
    for (i in seq_along(strand_vec)) {any_strand[strand==strand_vec[i],strand_new:=i]}
    any_strand[,strand:=strand_new]
    any_strand[,strand_new:=NULL]
  }
  return(any_strand)
}
#################################################################
### a modified version of epistasis classification for "symmetrical" data with detrimental and beneficial variants ###
#################################################################

call_epistasis_symdata_v1 = function(double_data,lower_bound_F,upper_bound_F,output_dir,prefix = "",xsig=2,sym = T,Q = 0.05, epistasis_error_from_slopes = T) {

  #### version for "symmetrical" data with detrimental and beneficial variants
  # this has a modified version of epistasis classification and needs to know the lower and upper (!) bound of the fitness data

  # double_data: the doubles data.table
  # lower_bound_F: an estimate of the lower bound of the fitness assay
  # upper_bound_F: an estimate of the upper bound of the fitness assay
  # output_dir: base-output directory, like "GB1/", it will put results output_dir/results/epistasis/
  # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc)
  # xsig: significance threshold for calling epistasis, 2 is fine
  # sym: logical for whether the position-position fitness map is symmetrical (only FALSE for protein-protein interactions!!)
  # Q: the percentile used for upper (1-Q) and lower (Q) fitness surface calculation
  # epistasis_error_from_slopes: logical, should sigmaE (error for epistasis values) be calculated by taking slopes of median fitness surface into account

  set.seed(1603)

  require(data.table)
  require(mgcv)
  require(caTools)

  # symmetrize the double mutant table unless the map is asymmetric (trans / PPI data)
  if (sym == T) {
    DT = switch_double_DT(double_data[is.fitness == TRUE & is.reads0 == TRUE],
                          cols_switchdouble = list(c("fitness1","fitness2"),c("sigma1","sigma2")),
                          cols_double = c("fitness","sigma"))
  } else {
    DT = copy(double_data[is.fitness == TRUE & is.reads0 == TRUE])
  }

  ### use loess instead, 2d-gam is very inaccurate
  # fit on a subsample for speed, predict on the full table
  subDT = DT[sample(nrow(DT),min(c(100000,nrow(DT)))),.(fitness1,fitness2,fitness)]
  F_fit_loess_model = loess(fitness ~ fitness1 + fitness2,data=subDT,span=0.2)
  double_data[is.fitness == TRUE & is.reads0 == TRUE,
              F_fit_loess := predict(F_fit_loess_model,newdata = .SD),,.SDcols = c("fitness1","fitness2")]

  ### >> calculate A-B-AB surface as median surface of gam-corrected surface
  Nq = 100                               # grid points along one axis
  Nv = 500000                            # variants used in estimation of surface
  span = max(c(0.01,500/nrow(DT)))       # fraction of nearest neighbours to use for median calculation
  # Q: quantiles used are Q and 1-Q (e.g. 0.05 and 0.95)

  # predict loess surface fitness and correct fitness for it
  DT[,F_fit_loess := predict(F_fit_loess_model,newdata = .SD),,.SDcols = c("fitness1","fitness2")]
  DT = DT[,fitness_norm := fitness-F_fit_loess]

  # calculate quantile fitness surfaces on the loess-corrected data
  List = quantile_fitness_surface_adaptive(DT,Nq,Nv,span,Q)

  # add back the loess trend to express the surfaces on the original fitness scale
  double_data[is.fitness == TRUE & is.reads0==TRUE,
              F_fit_median := predict(List$F_median_fit,newdata = .SD) + F_fit_loess,,.SDcols = c("fitness1","fitness2")]
  double_data[is.fitness == TRUE & is.reads0==TRUE,
              F_fit_lower := predict(List$F_lower_fit,newdata = .SD) + F_fit_loess,,.SDcols = c("fitness1","fitness2")]
  double_data[is.fitness == TRUE & is.reads0==TRUE,
              F_fit_upper := predict(List$F_upper_fit,newdata = .SD) + F_fit_loess,,.SDcols = c("fitness1","fitness2")]

  if (epistasis_error_from_slopes) {
    # numerical slope of the median surface (finite difference, step 0.01)
    # along fitness1, for error propagation from the single mutant errors
    f1 = predict(List$F_median_fit,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1 = fitness1 + 0.01,fitness2)]) +
      predict(F_fit_loess_model,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1 = fitness1 + 0.01,fitness2)])
    double_data[is.fitness == TRUE & is.reads0==TRUE,slope1 := abs(F_fit_median - f1)/0.01]

    # same along fitness2
    f2 = predict(List$F_median_fit,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1,fitness2 = fitness2+ 0.01)]) +
      predict(F_fit_loess_model,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1,fitness2 = fitness2 + 0.01)])
    double_data[is.fitness == TRUE & is.reads0==TRUE,slope2 := abs(F_fit_median - f2)/0.01]

    # propagate single mutant errors through the surface slopes
    double_data[,sigmaE := sqrt(sigma^2 + slope1^2 * sigma1^2 + slope2^2 * sigma2^2)]
  } else {
    double_data[,sigmaE := sqrt(sigma^2 + sigma1^2 + sigma2^2)]
  }

  #####################################################################
  ######## define data subsets for positive/negative epistasis ########
  #####################################################################

  # estimate the width (95 percent quantile) of the lower limit "background"
  lowerlimit_background_cutoff=double_data[is.fitness==TRUE & is.reads0 == TRUE &
                                             (fitness1 + xsig*sigma1 + fitness2 + xsig*sigma2) < lower_bound_F,quantile(fitness,probs=0.95,na.rm=T)]

  # same for the upper limit background
  upperlimit_background_cutoff=double_data[is.fitness==TRUE & is.reads0 == TRUE &
                                             (fitness1 - xsig*sigma1 + fitness2 - xsig*sigma2) > upper_bound_F,quantile(fitness,probs=0.95,na.rm=T)]

  # mark variants for positive epistasis analysis:
  # only requirement is that the upper surface is not too high into the upper fitness background
  double_data[is.fitness==TRUE & is.reads0 == TRUE,pos_epistasis := FALSE]
  double_data[is.fitness==TRUE & is.reads0 == TRUE &
                F_fit_upper < upperlimit_background_cutoff,
              pos_epistasis := TRUE]

  # mark variants for negative epistasis analysis:
  # only requirement is that the lower surface is not too low into the lower fitness background
  double_data[is.fitness==TRUE & is.reads0 == TRUE,neg_epistasis := FALSE]
  double_data[is.fitness==TRUE & is.reads0 == TRUE &
                F_fit_lower > lowerlimit_background_cutoff,
              neg_epistasis := TRUE]

  ########################################################
  ################# plot fitness surfaces ################
  ########################################################
  plot_fitness_surface(double_data,F_fit_loess_model,List,output_dir,prefix)

  # epistasis score and significant positive/negative classifications
  double_data[is.fitness==TRUE & is.reads0 == TRUE,epistasis := fitness - F_fit_median]
  double_data[pos_epistasis == TRUE,pos_epistasis_sig := fitness-F_fit_upper > 0]
  double_data[neg_epistasis == TRUE,neg_epistasis_sig := fitness-F_fit_lower < 0]

  return(double_data)
}
#################################################################
### convert pair distances into contact and distance matrices ###
#################################################################

contact_matrix_from_pairdistances <- function(input_file,
                                              dataset_dir,
                                              idx_start = 1,
                                              dist_type = c("scHAmin", "HAmin", "CB"),
                                              dist_cutoff = 4.5,
                                              plot = T){

  ### variables
  # input_file: pairdistances table file (read with fread), needs Pos1, Pos2 and the dist_type column
  # dataset_dir: dataset directory, like "GB1/"
  # idx_start: index of first position in contact matrix (used only for axis labels)
  # dist_type: one of the following distance types: "scHAmin", "HAmin", "CB"
  # dist_cutoff: in Angstrom, used for determining the contact map
  # plot: whether to plot the heatmaps (TRUE, FALSE)
  #### > will deposit contact maps (one for distance matrix and one for contact matrix) to dataset_dir/

  require(data.table)
  require(ggplot2)

  # resolve dist_type to a single choice (first element when the default vector is passed)
  dist_type <- match.arg(dist_type)

  # read pair distances file
  contactmap <- fread(input_file)

  # build distance matrix; index by rank of position value so positions
  # need not start at 1 or be contiguous
  all_positions <- sort(unique(unlist(contactmap[,.(Pos1, Pos2)])))
  num_positions <- length(all_positions)
  all_positions_names <- idx_start:(idx_start + num_positions - 1)
  dist_mat <- matrix(NA_real_, nrow = num_positions, ncol = num_positions)
  for (i in seq_len(num_positions)) {
    for (j in seq_len(num_positions)) {
      d <- contactmap[Pos1 == all_positions[i] & Pos2 == all_positions[j], .SD, .SDcols = dist_type]
      # guard against missing pairs: assigning numeric(0) to a cell is an error
      if (nrow(d) > 0) {
        dist_mat[i, j] <- as.numeric(d[1])
      }
    }
  }
  rownames(dist_mat) <- all_positions_names
  colnames(dist_mat) <- rownames(dist_mat)

  # binarize into a contact matrix at dist_cutoff; zero distances (self pairs) become NA
  bin_mat <- dist_mat
  bin_mat[dist_mat < dist_cutoff] <- 1
  bin_mat[dist_mat >= dist_cutoff] <- 0
  bin_mat[dist_mat == 0] <- NA

  # plot heatmap matrices (rows reversed so the diagonal runs top-left to bottom-right)
  if(plot){
    tile_heatmap_wrapper(dist_mat[rev(1:dim(dist_mat)[1]),], file.path(dataset_dir, gsub(".txt$", "_distance_matrix.pdf", basename(input_file))), width=5, height=5, xlab = "Residue position", ylab = "Residue position", colour_clip=F, cluster='none', xaxis_size=10, yaxis_size=10, xaxis_angle=90, x_breaks = all_positions_names, y_breaks = all_positions_names)
    tile_heatmap_wrapper(bin_mat[rev(1:dim(bin_mat)[1]),], file.path(dataset_dir, gsub(".txt$", "_contact_matrix.pdf", basename(input_file))), width=5, height=5, xlab = "Residue position", ylab = "Residue position", colour_clip=F, cluster='none', xaxis_angle=90, xaxis_size=10, yaxis_size=10, x_breaks = all_positions_names, y_breaks = all_positions_names)
  }

  # return both matrices
  return(list(distance_matrix = dist_mat, contact_matrix = bin_mat))
}
# NOTE(review): this chunk is the interior of kernel_structure_propensity() (header on the
# preceding dump line, tail on the following one). The statement numbered 50 below,
#   ss_data[,within_kernel := (Pos1-i)=0 & (Pos2-i)=0]
# is not valid R and appears HTML-mangled (comparison operators stripped from the dump) --
# reconstruct it from the upstream repository before running. Note also that rand_strategy
# is compared against the (misspelled) literal "kernal_width", matching the function's
# argument default -- do not "fix" the spelling in only one of the two places.
29 | eval_cols = setdiff(names(PWI),c("Pos1","Pos2","WT_AA1","WT_AA2","NposE","NnegE")) 30 | 31 | for (eval_cols_idx in seq_along(eval_cols)) { 32 | print(eval_cols[eval_cols_idx]) 33 | ss_data = copy(PWI[Pos1<=Pos2,.(Pos1,Pos2,input = .SD),,.SDcols = eval_cols[eval_cols_idx]]) 34 | setkey(ss_data,Pos1,Pos2) 35 | #Perpendicular distance from diagonal 36 | ss_data[,pos_perp := abs(Pos1-Pos2)] 37 | #position range for prediction 38 | kernel_length <- dim(kernel)[1] 39 | data_range = c(min(c(ss_data$Pos1,ss_data$Pos2)),max(c(ss_data$Pos1,ss_data$Pos2))) 40 | data_length = length(data_range[1]:data_range[2]) 41 | pos_range = c(min(c(ss_data$Pos1,ss_data$Pos2))-(kernel_length-2),max(c(ss_data$Pos1,ss_data$Pos2))-1) 42 | 43 | if (debug_this) {browser()} 44 | 45 | #Kernel structure propensities 46 | set.seed(1603) 47 | for (i in pos_range[1]:pos_range[2]) { 48 | 49 | #Square distances from center position 50 | ss_data[,within_kernel := (Pos1-i)=0 & (Pos2-i)=0] 51 | 52 | #Construct kernel matrix 53 | kernel_mat <- matrix(NA, nrow=data_length, ncol=data_length) 54 | colnames(kernel_mat) <- data_range[1]:data_range[2] 55 | rownames(kernel_mat) <- colnames(kernel_mat) 56 | within_kernel_mat <- kernel_mat 57 | within_kernel_mat[is.na(within_kernel_mat)] <- FALSE 58 | for(j in 1:kernel_length){ 59 | for(k in 1:kernel_length){ 60 | j_shift <- j + i - 1 61 | k_shift <- k + i - 1 62 | if(j_shift>=data_range[1] & j_shift<=data_range[2] & k_shift>=data_range[1] & k_shift<=data_range[2]){ 63 | kernel_mat[j_shift, k_shift] <- kernel[j, k] 64 | if(j_shift!=k_shift){ 65 | within_kernel_mat[j_shift, k_shift] <- TRUE 66 | } 67 | } 68 | } 69 | } 70 | 71 | #Determine kernel weights 72 | ss_data[Pos1 %in% data_range[1]:data_range[2] & Pos2 %in% data_range[1]:data_range[2],within_data := T] 73 | ss_data[!is.na(ss_data$within_data), kernel_weight := kernel_mat[cbind(Pos1,Pos2)]] 74 | ss_data[!is.na(ss_data$within_data), within_kernel := within_kernel_mat[cbind(Pos1,Pos2)]] 75 | 76 | #Calculate
kernel smoothed value for true data 77 | if(dim(ss_data[Pos1==i & Pos2==i])[1]==0){ 78 | ss_data <- rbind(ss_data, list("Pos1"=i, "Pos2"=i, "within_kernel"=F, "kernel_score" = ss_data[within_kernel==T,sum(input*kernel_weight,na.rm=T)]), fill = T) 79 | }else{ 80 | ss_data[Pos1==i & Pos2==i,kernel_score := ss_data[within_kernel==T,sum(input*kernel_weight,na.rm=T)]] 81 | } 82 | 83 | #Calculate kernel smoothed value for random distributions 84 | B = copy(ss_data[within_kernel==T,.(kernel_weight,input)]) 85 | if(rand_strategy=="all_data"){ 86 | sample_matrix = matrix(sample(ss_data[Pos1!=Pos2,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples) 87 | } 88 | if(rand_strategy=="kernal_width"){ 89 | sample_matrix = matrix(sample(ss_data[Pos1!=Pos2 & pos_perp<=max(ss_data[within_kernel==T,pos_perp]),c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples) 90 | } 91 | if(rand_strategy=="within_kernel"){ 92 | sample_matrix = matrix(sample(ss_data[Pos1!=Pos2 & within_kernel==T,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples) 93 | } 94 | kernel_sampled = colSums(sample_matrix * matrix(rep(t(B[,kernel_weight]),Nsamples),nrow=nrow(B),ncol=Nsamples),na.rm=T) 95 | 96 | #P-value for true value 97 | ss_data[Pos1==i & Pos2==i,kernel_p := sum(kernel_sampled >= kernel_score)/Nsamples] 98 | } 99 | #Avoid -Inf if logging p values by setting those positions smaller than all random samples to smallest non-zero pvalue 100 | ss_data[kernel_p == 0 ,kernel_p := 1/Nsamples] 101 | #Save 102 | ss_data[,(paste0(eval_cols[eval_cols_idx], "_kernel_score")) := kernel_score] 103 | ss_data[,(paste0(eval_cols[eval_cols_idx], "_kernel_p")) := kernel_p] 104 | ss_data[,Pos := Pos1] 105 | #Restrict to desired rows and columns, sort, save 106 | ss_data <- ss_data[Pos1==Pos2,.SD,.SDcols=names(ss_data)[grep("^Pos$|_kernel_score$|_kernel_p$", names(ss_data))]] 107 | setkey(ss_data, Pos) 108 | ss_data_list[[eval_cols[eval_cols_idx]]] <- ss_data 109 | } 110 |
111 | #Merge DT lists 112 | ss_data_merge <- Reduce(function(...) merge(..., all = T), ss_data_list) 113 | #Write to file 114 | write.table(paste0(dataset_dir,"processed_data/",prefix,"kernel_structure_propensity.txt"), 115 | x = ss_data_merge,quote = F,row.names = F,col.names = T) 116 | return(ss_data_merge) 117 | } -------------------------------------------------------------------------------- /scripts/misc/pairdistances_from_PDB_crystal.R: -------------------------------------------------------------------------------- 1 | 2 | ################################################################# 3 | ### extract position-pair distances for polymer from PDB file ### 4 | #################################################################

pairdistances_from_PDB_crystal = function(input_file,
                                          dataset_dir,
                                          aa_seq,
                                          idx_pdb_start = 1,
                                          idx_DMS_start = 1,
                                          idx_DMS_end = NA,
                                          dist_cutoff = 8,
                                          debug_this=F,
                                          suffix = ""){

  ### variables
  # input_file: PDB file of a crystal structure containing one or more copies of the monomer
  # dataset_dir: dataset directory, like "GB1/",
  #### > will deposit three .txt files (one for pair distances and one for secondary structure of pseudomonomer, one for pair distances of polymer) to dataset_dir/processed_data/ and plot contactmaps to /dataset_dir/results/preprocessing/
  # aa_seq: amino acid sequence of reference (monomer) structure (in DMS data) -- this is compared to inferred monomer sequence from PDB file and produces error if they do not agree
  # idx_pdb_start: first position in PDB to consider (WARNING: argument passed to pairdistances_from_PDB)
  # idx_DMS_start: first position in reference sequence to consider (WARNING: argument passed to pairdistances_from_PDB)
  # idx_DMS_end: last position in reference sequence to consider; if NA (default), the full reference sequence is compared to the PDB file sequence (WARNING: argument passed to pairdistances_from_PDB)
  # dist_cutoff: in Angstrom, used for plotting the contact map (WARNING: argument passed to pairdistances_from_PDB)
  # debug_this: if TRUE, the function will stop after printing comparison between PDB seq and DMS seq to adjust position indices if necessary (WARNING: argument passed to pairdistances_from_PDB)
  # suffix: to be added to output file names (WARNING: argument passed to pairdistances_from_PDB)

  require(data.table)
  #read.pdb()/write.pdb() below come from Rpdb; attach it here instead of relying on the caller
  require(Rpdb)

  #Record types treated as atoms
  atom_lns <- c("ATOM")

  ## Reformat PDB file into a single-chain "pseudomonomer"
  #Read PDB file
  pdb_tab <- read.pdb(input_file)
  #Get all atoms
  temp_atoms <- pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,]
  #Renumber residues strictly ascending and collapse everything onto chain "A",
  #so that all monomer copies form one continuous chain
  resid_rle <- rle(temp_atoms$resid)
  pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,]$resid <- rep(resid_rle$values[1]:(resid_rle$values[1]+length(resid_rle$values)-1), times = resid_rle$lengths)
  pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,]$chainid <- "A"
  #Amino acid sequence of pseudomonomer: repeat the monomer sequence once per copy in the structure
  pseudomonomer_length <- length(unique(pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,c("resname", "resid")])$resname)
  aa_seq_pseudomonomer <- paste0(rep(aa_seq, pseudomonomer_length/nchar(aa_seq)), collapse = "")
  #Write pseudomonomer to PDB file
  input_file_pseudomonomer <- file.path(dataset_dir, "processed_data", paste0(strsplit(basename(input_file), "\\.")[[1]][1], "_pseudomonomer", suffix, ".pdb"))
  write.pdb(pdb_tab, file = input_file_pseudomonomer)

  #Get pair distances for the pseudomonomer (writes its own output files)
  pairdistances_from_PDB(input_file_pseudomonomer, dataset_dir = dataset_dir, aa_seq = aa_seq_pseudomonomer,
                         idx_pdb_start = idx_pdb_start, idx_DMS_start = idx_DMS_start, idx_DMS_end = idx_DMS_end, dist_cutoff = dist_cutoff, debug_this = debug_this, suffix = suffix)
  contactmap <- fread(file.path(dataset_dir, "processed_data", paste0("PDB_contactmap_", strsplit(basename(input_file), "\\.")[[1]][1], "_pseudomonomer_A", suffix, ".txt")))

  #Translate positions back to monomer positions (all chain copies map onto the same monomer position)
  contactmap[, Pos1 := (Pos1-1)%%nchar(aa_seq)+1]
  contactmap[, Pos2 := (Pos2-1)%%nchar(aa_seq)+1]

  ## For each distance type keep, per position pair, the minimum over all chain copies:
  ## sorting by the distance column and dropping duplicated (Pos1,Pos2) pairs retains the smallest value
  #HAmin
  setkey(contactmap, HAmin)
  contactmap_HAmin <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, HAmin, HAmin_sd)]
  #scHAmin
  setkey(contactmap, scHAmin)
  contactmap_scHAmin <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, scHAmin, scHAmin_sd)]
  #CB
  setkey(contactmap, CB)
  contactmap_CB <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, CB, CB_sd)]

  #Merge the per-type minima back into one table
  setkey(contactmap, Pos1, Pos2)
  contactmap <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, WT_AA1, WT_AA2, chainids)]
  setkey(contactmap_HAmin, Pos1, Pos2)
  setkey(contactmap_scHAmin, Pos1, Pos2)
  setkey(contactmap_CB, Pos1, Pos2)
  contactmap <- contactmap[contactmap_HAmin,][contactmap_scHAmin,][contactmap_CB,][,.(Pos1, Pos2, WT_AA1, WT_AA2, chainids, HAmin, scHAmin, CB, HAmin_sd, scHAmin_sd, CB_sd)]

  #Save pairwise distance table
  pdb_filename = strsplit(strsplit(input_file,"/")[[1]][length(strsplit(input_file,"/")[[1]])],"\\.")[[1]][1]
  write.table(file = paste0(dataset_dir,'processed_data/PDB_contactmap_',pdb_filename,'_A',suffix,".txt",collapse = ""),
              x = contactmap,quote = F,row.names = F,col.names = T)
}

-------------------------------------------------------------------------------- /scripts/misc/tau_specificity_score.R: -------------------------------------------------------------------------------- 1 | 2 | ############################################################# 3 | ##### Tau pattern-specificity score (Yanai, et al 2004) ### 4 | ############################################################# 5 | 6 | tau_specificity_score <- function(x, 7 | min_length=3){ 8 | 9 | ### variables 10 | # x: vector of scores 11 | # min_length: minimum number of scores (otherwise returns NA) 12 | 13 | x <- x[!is.na(x)] 14 |
if(length(x)>=min_length){ 15 | if(max(x)==0){ 16 | return(NA) 17 | }else{ 18 | return(sum(1-x/max(x))/(length(x)-1)) 19 | } 20 | }else{ 21 | return(NA) 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/misc/tile_heatmap_wrapper.R: -------------------------------------------------------------------------------- 1 | 2 | ################################### 3 | ### ggplot tile heatmap wrapper ### 4 | ################################### 5 |

tile_heatmap_wrapper<-function(input_matrix,
                               output_file,
                               width=10,
                               height=4,
                               units="in",
                               colour_clip=4,
                               cluster='both',
                               xlab='x',
                               ylab='y',
                               xtick_labels=NULL,
                               ytick_labels=NULL,
                               colour_type='continuous',
                               colour_low='blue',
                               colour_high='red',
                               colour_mid='white',
                               colour_limits=NULL,
                               mono=F,
                               na_colour="grey50",
                               xaxis_angle=330,
                               xaxis_hjust=0,
                               xaxis_vjust=NULL,
                               xaxis_size=5,
                               omit_xtext=F,
                               omit_xticks=F,
                               yaxis_angle=NULL,
                               yaxis_hjust=NULL,
                               yaxis_vjust=NULL,
                               yaxis_size=NULL,
                               omit_ytext=F,
                               omit_yticks=F,
                               plot_title='',
                               input_matrix_text=NULL,
                               text_size=0.25,
                               highlight_regions=NULL,
                               x_breaks=waiver(),
                               y_breaks=waiver(),
                               plot = T){

  ### variables
  # input_matrix: matrix of heatmap values (required)
  # output_file: plot output file path
  # width: plot width in "units"
  # height: plot height in "units"
  # units: plot size units ("in", "cm", or "mm")
  # colour_clip: maximum absolute value of colour scale (F disables clipping)
  # cluster: hierarchically cluster ("none", "row", "column", "both")
  # xlab / ylab: axis labels
  # xtick_labels / ytick_labels: display labels for x/y ticks
  # colour_type: colour scale type ("continuous", "categorical")
  # colour_low / colour_high / colour_mid: colours for low/high/zero -- passed to scale_fill_gradient2
  # colour_limits: upper and lower value limits of colour scale -- passed to scale_fill_gradient2
  # mono: use monotype font (True, False)
  # na_colour: colour to use for NA values
  # xaxis_angle / xaxis_hjust / xaxis_vjust / xaxis_size: x tick label styling -- passed to element_text
  # omit_xtext / omit_xticks: omit x tick labels / ticks (True, False)
  # yaxis_angle / yaxis_hjust / yaxis_vjust / yaxis_size: y tick label styling -- passed to element_text
  # omit_ytext / omit_yticks: omit y tick labels / ticks (True, False)
  # plot_title: main title for plot
  # input_matrix_text: matrix of heatmap text
  # text_size: size of heatmap text
  # highlight_regions: list of highlighted regions of form: list("red" = list("region1" = c(_min_, _max_), ...), ...)
  # x_breaks / y_breaks: axis breaks (for displaying tick labels)
  # plot: whether to save the heatmap to output_file (True, False)
  #### > returns the ggplot object

  require(ggplot2)

  #Row/column orderings (default: rows reversed so the first row plots at the top)
  order_row<-rev(1:dim(input_matrix)[1])
  order_col<-1:dim(input_matrix)[2]
  if(cluster %in% c('both', 'row')){
    d <- dist(input_matrix, method = "euclidean") # distance matrix
    #"ward" is no longer accepted by hclust (renamed in R >= 3.1.0); "ward.D" reproduces the old behaviour
    order_row <- hclust(d, method="ward.D")$order
  }
  if(cluster %in% c('both', 'column')){
    d <- dist(t(input_matrix), method = "euclidean") # distance matrix
    order_col<- hclust(d, method="ward.D")$order
  }
  #melt() on a matrix comes from reshape2/data.table -- assumed attached by the calling pipeline (TODO confirm)
  plot_df<-melt(input_matrix[order_row,order_col])
  colnames(plot_df)<-c('y', 'x', 'value')
  plot_df$label<-""
  if(!is.null(input_matrix_text)){
    plot_df_text<-melt(input_matrix_text[order_row,order_col])
    colnames(plot_df_text)<-c('y', 'x', 'label')
    plot_df$label<-plot_df_text$label
  }
  #Clip extreme values so a few outliers do not dominate the colour scale
  if(colour_type=='continuous' & colour_clip){
    plot_df$value[plot_df$value>colour_clip]<-colour_clip
    plot_df$value[plot_df$value<(-colour_clip)]<-(-colour_clip)
  }
  p <- ggplot(plot_df, aes(x, y)) + geom_tile(aes(fill = value)) + geom_text(aes(label = label), size=text_size) +
    # theme_bw() +
    theme(axis.text.x=list(element_text(angle = xaxis_angle, hjust = xaxis_hjust, vjust = xaxis_vjust, size = xaxis_size, family=c('', 'mono')[as.numeric(mono)+1]), element_blank())[[as.numeric(omit_xtext)+1]],
          axis.text.y=list(element_text(angle = yaxis_angle, hjust = yaxis_hjust, vjust = yaxis_vjust, size = yaxis_size, family=c('', 'mono')[as.numeric(mono)+1]), element_blank())[[as.numeric(omit_ytext)+1]],
          axis.ticks.x=list(element_line(), element_blank())[[as.numeric(omit_xticks)+1]],
          axis.ticks.y=list(element_line(), element_blank())[[as.numeric(omit_yticks)+1]]) +
    xlab(xlab) + ylab(ylab) + labs(title = plot_title)
  #Optionally outline square regions on the diagonal
  if(!is.null(highlight_regions)){
    for(i in names(highlight_regions)){
      for(j in names(highlight_regions[[i]])){
        #NOTE(review): aes_now() is not a ggplot2 function -- presumably a project helper that forces
        #immediate evaluation of the aesthetics; confirm it is defined wherever this branch is used
        p <- p + geom_rect(data = NULL, mapping = aes_now(xmin=highlight_regions[[i]][[j]][1]-0.5, xmax=highlight_regions[[i]][[j]][2]+0.5, ymin=highlight_regions[[i]][[j]][1]-0.5, ymax=highlight_regions[[i]][[j]][2]+0.5), fill = NA, colour = i)
      }
    }
  }
  #xtick labels specified
  if(!is.null(xtick_labels)){
    if(is.numeric(plot_df$x)){
      #X is numeric
      p <- p + scale_x_continuous(breaks=x_breaks, labels=xtick_labels)
    }else{
      #X is discrete
      p <- p + scale_x_discrete(breaks=x_breaks, labels=xtick_labels)
    }
  }else{
    if(is.numeric(plot_df$x)){
      p <- p + scale_x_continuous(breaks=x_breaks)
    }
  }
  #ytick labels specified
  if(!is.null(ytick_labels)){
    if(is.numeric(plot_df$y)){
      #Y is numeric
      p <- p + scale_y_continuous(breaks=y_breaks, labels = ytick_labels)
    }else{
      #Y is discrete
      p <- p + scale_y_discrete(breaks=y_breaks, labels=ytick_labels)
    }
  }else{
    if(is.numeric(plot_df$y)){
      p <- p + scale_y_continuous(breaks=y_breaks)
    }
  }
  if(colour_type=='continuous'){
    p <- p + scale_fill_gradient2(low = colour_low, high = colour_high, mid = colour_mid, midpoint = 0, limits=colour_limits, na.value=na_colour)
  }
  if(colour_type=='categorical'){
    p <- p + scale_fill_brewer(palette='Set1')
  }
  if(plot){
    #Pass the plot explicitly: ggsave() otherwise saves last_plot(), which is not
    #necessarily p because p is never printed inside this function
    ggsave(file=output_file, plot=p, width=width, units=units, height=height)
  }
  return(p)
}

-------------------------------------------------------------------------------- /scripts/pairdistances_from_PDB.R: -------------------------------------------------------------------------------- 1 | 2 | ###################################################################################### 3 | ### extract position-pair distances and secondary structure elements from PDB file ### 4 |
###################################################################################### 5 | 6 | pairdistances_from_PDB = function(input_file, 7 | dataset_dir, 8 | given_chainids = "A", 9 | aa_seq, 10 | idx_pdb_start = 1, 11 | idx_DMS_start = 1, 12 | idx_DMS_end = NA, 13 | dist_cutoff = 8, 14 | debug_this=F, 15 | suffix = "") { 16 | 17 | ### variables 18 | # input_file: PDB file 19 | # dataset_dir: dataset directory, like "GB1/", 20 | #### > will deposit two .txt file (on for pair distances and one for secondary structure) to dataset_dir/processed_data/ and plot contactmaps to /dataset_dir/results/preprocessing/ 21 | # given_chainids: which chain in the PDB file to extract distances from; in case of protein-protein interactions needs to be a vector with both chains that trans-distances should be calculated over, e.g. c("A","C") 22 | # aa_seq: amino acid sequence of reference structure (in DMS data) 23 | # idx_pdb_start: first position in PDB to consider 24 | # idx_DMS_start: first position in reference sequence to consider 25 | # idx_DMS_end: first position in reference sequence to consider, if NA (default), it will compare the full reference sequence to the PDB file sequence 26 | # dist_cutoff: in Angstrom, used for plotting the contact map 27 | # debug_this: if TRUE, the function will stop after printing comparision between PDB seq and DMS seq to adjust position indicies if necessary 28 | # suffix: to be added to 29 | 30 | 31 | require(data.table) 32 | require(Rpdb) 33 | require(pdist) 34 | require(ggplot2) 35 | require(cowplot) 36 | 37 | #if idx_DMS_end is not given, compare across full sequence length 38 | if (is.na(idx_DMS_end)) { 39 | idx_DMS_end = c(0) 40 | for (c in seq_along(given_chainids)) { 41 | idx_DMS_end[c] = length(strsplit(aa_seq[[c]],"")[[1]]) 42 | } 43 | } 44 | 45 | #load PDB structure 46 | PDB_structure = read.pdb(input_file,MODEL=NULL) 47 | 48 | #for PDB files with NMR ensembles evaluate each model 49 | M = 
length(grep("MODEL",names(PDB_structure))) 50 | if (M==0) {M=1} 51 | for (m in 1:M) { 52 | 53 | #load model to structure data.table 54 | if (length(grep("MODEL",names(PDB_structure)))==0) { 55 | structure = data.table(eval(parse(text=paste0("PDB_structure$atoms")))) 56 | } else { 57 | structure = data.table(eval(parse(text=paste0("PDB_structure$MODEL.",m,"$atoms")))) 58 | } 59 | 60 | #restrict to ATOM entries 61 | structure = structure[recname=="ATOM" & chainid %in% given_chainids] 62 | 63 | #extract amino acid sequence from PDB file 64 | aaseq_PDB = unique(structure[,.(AA = convert_AAabr_one_three(as.character(unique(resname))),chainid),by=resid]) 65 | setkey(aaseq_PDB,resid) 66 | 67 | ## compare given DMS aaseq and aaseq from PDB file 68 | DMS_aa_seq = list() 69 | PDB_aa_seq = list() 70 | for (c in seq_along(given_chainids)) { 71 | DMS_aa_seq[[c]] = strsplit(aa_seq[[c]],"")[[1]][idx_DMS_start[c]:idx_DMS_end[c]] 72 | PDB_aa_seq[[c]] = aaseq_PDB[chainid == given_chainids[c]][.(idx_pdb_start[c]:(idx_pdb_start[c]+length(DMS_aa_seq[[c]])-1)),AA] 73 | PDB_aa_seq[[c]][is.na(PDB_aa_seq)] = "X" 74 | 75 | if (m==1) { 76 | print(paste0(input_file," chain ",given_chainids[c])) 77 | print(paste0('DMS seq [',idx_DMS_start[c],':',idx_DMS_end[c],'] ',paste0(DMS_aa_seq[[c]],collapse=""))) 78 | print(paste0(sum(DMS_aa_seq[[c]]==PDB_aa_seq[[c]]),'/',length(DMS_aa_seq[[c]]),' ',paste0(as.numeric(DMS_aa_seq[[c]] == PDB_aa_seq[[c]]),collapse = ""))) 79 | print(paste0('PDB seq [',idx_pdb_start[c],':',(idx_pdb_start[c]+length(DMS_aa_seq[[c]])-1),'] ',paste0(PDB_aa_seq[[c]],collapse=""))) 80 | } 81 | } 82 | 83 | #if debug_this ==T function will stop here to make adjustments to position indicies 84 | if (debug_this) { 85 | browser() 86 | } 87 | 88 | #initialize indicies for distance calcualtions 89 | if (length(given_chainids) == 1) { 90 | two_chainids = rep(given_chainids,2) 91 | two_starts_DMS = rep(idx_DMS_start,2) 92 | two_starts = rep(idx_pdb_start,2) 93 | two_ends = 
rep(idx_pdb_start + (idx_DMS_end-idx_DMS_start),2) 94 | } else { 95 | two_chainids = given_chainids 96 | two_starts_DMS = idx_DMS_start 97 | two_starts = idx_pdb_start 98 | two_ends = idx_pdb_start + (idx_DMS_end-idx_DMS_start) 99 | } 100 | 101 | #initialize distance table 102 | if (m==1) { 103 | distance = data.table(Pos1=rep(two_starts[1]:two_ends[1],two_ends[2]-two_starts[2]+1), 104 | Pos2=rep(two_starts[2]:two_ends[2],each=two_ends[1]-two_starts[1]+1)) 105 | setkey(distance,Pos1,Pos2) 106 | distance[,WT_AA1 := convert_AAabr_one_three(as.character(unique(structure[chainid == two_chainids[1] & resid == Pos1,resname]))),Pos1] 107 | distance[,WT_AA2 := convert_AAabr_one_three(as.character(unique(structure[chainid == two_chainids[2] & resid == Pos2,resname]))),Pos2] 108 | distance[,chainids := paste0(given_chainids)] 109 | } 110 | 111 | #calculate minimal side-chain heavy atom distance 112 | structure_HA = structure[union(intersect(grep(pattern="^[COSN][B-Z]$",elename), 113 | which(resname != "GLY")),intersect(grep(pattern="^CA$",elename),which(resname == "GLY")))] 114 | distance[,paste0("scHAmin",m):=min(as.matrix(pdist(as.matrix(structure_HA[chainid == two_chainids[1] & resid == Pos1,.(x1,x2,x3)]), 115 | as.matrix(structure_HA[chainid == two_chainids[2] & resid == Pos2,.(x1,x2,x3)])))), 116 | by=.(Pos1,Pos2)] 117 | 118 | #calculate minimal all heavy atom distance 119 | structure_HA = structure[!grepl(pattern="H",elename)] 120 | distance[,paste0("HAmin",m):=min(as.matrix(pdist(as.matrix(structure_HA[chainid == two_chainids[1] & resid == Pos1,.(x1,x2,x3)]), 121 | as.matrix(structure_HA[chainid == two_chainids[2] & resid == Pos2,.(x1,x2,x3)])))), 122 | by=.(Pos1,Pos2)] 123 | 124 | #calculate CB distances (use CA in case of Glycine) 125 | structure_CB = structure[elename == "CB" | (elename == "CA" & resname == "GLY"),.(chainid,resid,x1,x2,x3)] 126 | distance[,paste0("CB",m):=min(as.matrix(pdist(as.matrix(structure_CB[chainid == two_chainids[1] & resid == 
Pos1,.(x1,x2,x3)]), 127 | as.matrix(structure_CB[chainid == two_chainids[2] & resid == Pos2,.(x1,x2,x3)])))), 128 | by=.(Pos1,Pos2)] 129 | } 130 | 131 | ## average over distances 132 | distance[,scHAmin := rowMeans(.SD),by=.(Pos1,Pos2),.SDcols = grep("scHAmin[0-9]",names(distance))] 133 | distance[,HAmin := rowMeans(.SD),by=.(Pos1,Pos2),.SDcols = grep("HAmin[0-9]",names(distance))] 134 | distance[,CB := rowMeans(.SD),by=.(Pos1,Pos2),.SDcols = grep("CB[0-9]",names(distance))] 135 | 136 | 137 | #if there's multiple structural models, average over all and also calculate uncertainity 138 | if (length(grep("MODEL",names(PDB_structure)))==0) { 139 | distance[,scHAmin_sd := 0] 140 | distance[,HAmin_sd := 0] 141 | distance[,CB_sd := 0] 142 | } else { 143 | distance[,scHAmin_sd := stats::sd(.SD),by=.(Pos1,Pos2),.SDcols = grep("scHAmin[0-9]",names(distance))] 144 | distance[,HAmin_sd := stats::sd(.SD),by=.(Pos1,Pos2),.SDcols = grep("HAmin[0-9]",names(distance))] 145 | distance[,CB_sd := stats::sd(.SD),by=.(Pos1,Pos2),.SDcols = grep("CB[0-9]",names(distance))] 146 | } 147 | 148 | ##adjust positions to positions in alignment DMS/PDB sites 149 | contactmap = distance[between(Pos1,two_starts[1],two_ends[1]) & 150 | between(Pos2,two_starts[1],two_ends[2]), 151 | .(Pos1 = Pos1 - (two_starts[1] - two_starts_DMS[1]), 152 | Pos2 = Pos2 - (two_starts[2] - two_starts_DMS[2]), 153 | WT_AA1,WT_AA2,chainids, 154 | HAmin,scHAmin,CB,HAmin_sd,scHAmin_sd,CB_sd)] 155 | 156 | #save pairwise distance table 157 | pdb_filename = strsplit(strsplit(input_file,"/")[[1]][length(strsplit(input_file,"/")[[1]])],"\\.")[[1]][1] 158 | write.table(file = paste0(dataset_dir,'processed_data/PDB_contactmap_',pdb_filename,'_',paste0(given_chainids,collapse=""),suffix,".txt",collapse = ""), 159 | x = contactmap,quote = F,row.names = F,col.names = T) 160 | 161 | 162 | ################################################# 163 | ### extract secondary structure from PDB file ### 164 | 
################################################# 165 | if (length(given_chainids) == 1) { #if looking at a single chain 166 | secondary_structure = data.table(Pos = idx_DMS_start:idx_DMS_end,ss = "C") 167 | 168 | output = scan(file=input_file,what="character",sep="\n") 169 | 170 | helix = output[grep(output,pattern="^HELIX")] 171 | if (length(helix)>0) { 172 | helix1 = sapply(X=1:length(helix),FUN = function(X){strsplit(helix[X],split="\\s+")[[1]]}) 173 | helix2 = data.table(t(helix1[4:9,])) 174 | names(helix2) = c("aa1","chainid1","pos1","aa2","chainid2","pos2") 175 | helix3 = helix2[chainid1 == given_chainids] 176 | if (nrow(helix3)>0) { 177 | for (i in 1:nrow(helix3)) { 178 | secondary_structure[between(Pos, 179 | helix3[i,as.numeric(pos1) - (two_starts[1] - two_starts_DMS[1])], 180 | helix3[i,as.numeric(pos2) - (two_starts[1] - two_starts_DMS[1])]), 181 | ss := "H"] 182 | } 183 | } 184 | } 185 | 186 | strand = output[grep(output,pattern="^SHEET")] 187 | if (length(strand)>0) { 188 | strand1 = sapply(X=1:length(strand),FUN = function(X){strsplit(strand[X],split="\\s+")[[1]]}) 189 | if (is.list(strand1)) { 190 | strand2 = data.table(t(strand1[[1]][5:10])) 191 | for (l in 2:length(strand1)) { 192 | strand2 = rbind(strand2,data.table(t(strand1[[l]][5:10]))) 193 | } 194 | } else { 195 | strand2 = data.table(t(strand1[5:10,])) 196 | } 197 | 198 | names(strand2) = c("aa1","chainid1","pos1","aa2","chainid2","pos2") 199 | strand3 = strand2[chainid1 == given_chainids] 200 | if (nrow(strand3) > 0) { 201 | for (i in 1:nrow(strand3)) { 202 | secondary_structure[between(Pos, 203 | strand3[i,as.numeric(pos1) - (two_starts[1] - two_starts_DMS[1])], 204 | strand3[i,as.numeric(pos2) - (two_starts[1] - two_starts_DMS[1])]), 205 | ss := "E"] 206 | } 207 | } 208 | } 209 | names(secondary_structure)[2] = "PDB" 210 | 211 | write.table(paste0(dataset_dir,"processed_data/PDB_secondary_structure_",pdb_filename,"_",given_chainids,suffix,".txt"), 212 | x = secondary_structure,quote = 
F,row.names = F,col.names = T) 213 | 214 | #for plotting 215 | secondary_structure[,rleidx := rleid(PDB)] 216 | } 217 | 218 | 219 | 220 | theme_set(theme_classic(base_size=9)) 221 | #plot contact map 222 | P1=ggplot() + 223 | geom_raster(data=contactmap,aes(x=Pos1,y=Pos2,fill=HAminF_fit_upper)] 19 | 20 | #number of points to plot, 10k is sufficient, otherwise PDF becomes very large 21 | r = sample(x = nrow(xyz),size = min(c(10000,nrow(xyz)))) 22 | x=xyz$x 23 | y=xyz$y 24 | z=xyz$z 25 | 26 | #axis limits, adjust 27 | xlim_plot = ylim_plot = c(xyrange[1] - 0.1*diff(xyrange),xyrange[2] + 0.1*diff(xyrange)) 28 | zlim_plot = quantile(xyz$z,probs = c(0.005,0.995),na.rm = T) 29 | #plot angles 30 | theta_plot = c(15,55) 31 | phi_plot = 15 32 | for (idx in 1:length(theta_plot)) { 33 | #### upper and lower surface with points 34 | pdf(paste0(dataset_dir, "results/epistasis/",prefix,"epistasis_surface",idx,".pdf"), useDingbats=FALSE) 35 | a=persp(xy,xy,matrix(Fd_pred2+Fd_pred_q05,nrow=length(xy),ncol=length(xy)), 36 | xlab="single mutant fitness 1",ylab="single mutant fitness 2",zlab="double mutant fitness", 37 | xlim = xlim_plot,ylim = ylim_plot,zlim = zlim_plot, 38 | theta = theta_plot[idx], phi = phi_plot, 39 | col=NA, nticks=5,ticktype="detailed",expand=0.75, box=TRUE) 40 | b=trans3d(xyz[intersect(r,which(below_q05==T))]$x, 41 | xyz[intersect(r,which(below_q05==T))]$y, 42 | xyz[intersect(r,which(below_q05==T))]$z,a) 43 | points(b$x,b$y,col=rgb(1,0.1,0.1),pch=16,cex=0.75) 44 | par(new=TRUE) 45 | a=persp(xy,xy,matrix(Fd_pred2+Fd_pred_q05,nrow=length(xy),ncol=length(xy)), 46 | xlab="single mutant fitness 1",ylab="single mutant fitness 2",zlab="double mutant fitness", 47 | xlim = xlim_plot,ylim = ylim_plot,zlim = zlim_plot, 48 | theta = theta_plot[idx], phi = phi_plot, 49 | col=NA, nticks=5,ticktype="detailed",expand=0.75, box=TRUE) 50 | par(new=TRUE) 51 | b=trans3d(xyz[intersect(r,which(below_q05==F & above_q95==F))]$x, 52 | xyz[intersect(r,which(below_q05==F & 
above_q95==F))]$y, 53 | xyz[intersect(r,which(below_q05==F & above_q95==F))]$z,a) 54 | points(b$x,b$y,col=rgb(1,0.5,0.5),pch=16,cex=0.75) 55 | par(new=TRUE) 56 | a2=persp(xy,xy,matrix(Fd_pred2+Fd_pred_q95,nrow=length(xy),ncol=length(xy)), 57 | xlab="single mutant fitness 1",ylab="single mutant fitness 2",zlab="double mutant fitness", 58 | xlim = xlim_plot,ylim = ylim_plot,zlim = zlim_plot, 59 | theta = theta_plot[idx], phi = phi_plot, 60 | col=NA, nticks=5,ticktype="detailed",expand=0.75, box=TRUE) 61 | par(new=TRUE) 62 | b=trans3d(xyz[intersect(r,which(above_q95==T))]$x, 63 | xyz[intersect(r,which(above_q95==T))]$y, 64 | xyz[intersect(r,which(above_q95==T))]$z,a) 65 | points(b$x,b$y,col=rgb(0.5,0.9,0.1),pch=16,cex=0.75) 66 | dev.off() 67 | } 68 | } -------------------------------------------------------------------------------- /scripts/predict_secondary_structure_elements.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ##### predict alpha helices and beta strands with kernel smoothing ### 3 | ###################################################################### 4 | predict_secondary_structure_elements = function(PWI, 5 | dataset_dir, 6 | prefix = "", 7 | known_SS = c(), 8 | scale_long = 1/4^2, 9 | seed_size=3, 10 | p_detection_threshold = 0.05, 11 | Nsamples = 10000, 12 | debug_this = F, 13 | return_list = F) { 14 | 15 | ### variables 16 | # PWI: pairwise interaction score data.table; except for Pos1 and Pos2 this should only contain the scores that SS elements should be predicted from 17 | # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/ 18 | # prefix: to be added to results files (in case of running diff. 
  # versions of data from same dataset etc)
  # known_SS (optional): filepointer to a file with known secondary structure elements (from a PDB file) [to plot as comparison], a table with a position and a SS classifier column
  # scale_long: length scale for gaussian smoothing kernel
  # seed_size: number of positions the SSpropensities are initially aggregated over; must be an odd value !!!
  # p_detection_threshold: p-value threshold for calling a SS element
  # Nsamples: number of randomized controls to compare SS propensity against
  # debug_this: if TRUE, function will stop at certain points in scripts in order to understand bugs
  # return_list: if TRUE, ggplot2 and ss_data objects returned in addition to predicted secondary_structure (in named list)


  require(data.table)
  require(ggplot2)
  require(metap)

  # which scores should be used for prediction?
  # every column except the position/identity/count bookkeeping columns is treated as a score track
  eval_cols = setdiff(names(PWI),c("Pos1","Pos2","WT_AA1","WT_AA2","NposE","NnegE"))

  # Initialise list of returned data (only populated/returned when return_list == TRUE)
  saved_objects <- list(
    "plot_objects" = list(),
    "secondary_structure_score" = list())

  # one full prediction pass per score column
  for (eval_cols_idx in seq_along(eval_cols)) {
    print(eval_cols[eval_cols_idx])
    # upper-triangle pairs only; the current score column is renamed to the generic "input"
    ss_data = copy(PWI[Pos1<=Pos2,.(Pos1,Pos2,input = .SD),,.SDcols = eval_cols[eval_cols_idx]])
    setkey(ss_data,Pos1,Pos2)
    # position range for prediction
    pos_range = c(min(c(ss_data$Pos1,ss_data$Pos2)),max(c(ss_data$Pos1,ss_data$Pos2)))

    # compute diagonal/perpendicular coordinates (rotated 45deg coordinate system of the contact map)
    ss_data[,pos_diag := (Pos1+Pos2)/2]
    ss_data[,pos_perp := abs(Pos1-Pos2)/2] # this is half the actual distance between positions in a pair (for consistency with pos_diag)
    if (debug_this) {browser()}

    ### secondary structure propensities
    # fixed seed so the randomized null distributions are reproducible
    set.seed(1603)
    for (i in pos_range[1]:pos_range[2]) {
      j = i

      # hamming distances from center position (i,j) on the diagonal
      ss_data[,ham := abs(Pos1-i) + abs(Pos2-j)]
      ss_data[,ham_perp := abs(Pos1-i - (Pos2-j))]
      ss_data[,ham_diag := abs(Pos1-i + Pos2-j)]

      ####################################
      ####### detect alpha helices #######
      ####################################

      # compute kernel weights: cosine with 3.6-residue period (helix periodicity)
      # damped by a gaussian along the diagonal
      if (i > pos_range[1]) {ss_data[,alpha_weight := NULL]}
      ss_data[ham_perp <= 5 & ham_diag < 12,alpha_weight := (cos(ham_perp*2*pi/3.6)+1/3) * exp(-scale_long*ham_diag^2)]
      ss_data[Pos1==Pos2,alpha_weight := NA]
      ss_data[is.na(input),alpha_weight := NA]

      # calculate kernel smoothed value for true data
      ss_data[Pos1==i & Pos2==i,alpha_score := ss_data[ham_perp <= 5 & ham_diag < 12,sum(input*alpha_weight,na.rm=T)]]

      # calculate kernel smoothed value for random distributions
      # NOTE(review): the null uses ham_diag <= 12 while the true score uses ham_diag < 12 above --
      # boundary cells carry NA weight so sums should agree, but confirm this asymmetry is intended
      B = copy(ss_data[ham_perp <= 5 & ham_diag <= 12,.(ham,alpha_weight,input)])
      setkey(B,ham)
      # draw Nsamples random score vectors (with replacement) from all off-diagonal scores
      sample_matrix = matrix(sample(ss_data[Pos1!=Pos2,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples)
      alpha_sampled = colSums(sample_matrix * matrix(rep(t(B[,alpha_weight]),Nsamples),nrow=nrow(B),ncol=Nsamples),na.rm=T)

      # empirical p value for true value (one-sided; resolution limited to 1/Nsamples)
      ss_data[Pos1==i & Pos2==i,alpha_p := sum(alpha_sampled >= alpha_score)/Nsamples]

      ####################################
      ####### detect beta strands  #######
      ####################################

      # compute kernel weights: alternating +2/3,-1/3 pattern (strand periodicity of 2)
      # damped by a gaussian along the diagonal
      if (i > pos_range[1]) {ss_data[,beta_weight := NULL]}
      ss_data[ham_perp <= 2,beta_weight := ((ham_perp+1) %% 2 - 1/3)*exp(-scale_long*ham_diag^2)]
      ss_data[ham_perp == 0,beta_weight := beta_weight * 2]
      ss_data[Pos1==Pos2,beta_weight := NA]
      ss_data[is.na(input),beta_weight := NA]

      # calculate kernel smoothed value for true data
      ss_data[Pos1==i & Pos2==i,beta_score := ss_data[ham_perp <= 2 & ham_diag <= 12,sum(input*beta_weight,na.rm=T)]]

      # calculate kernel smoothed value for random distributions
      B = copy(ss_data[ham_perp <= 2 & ham_diag <= 12,.(ham,beta_weight,input)])
      setkey(B,ham)
      sample_matrix = matrix(sample(ss_data[Pos1!=Pos2,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples)
      beta_sampled = colSums(sample_matrix * matrix(rep(t(B[,beta_weight]),Nsamples),nrow=nrow(B),ncol=Nsamples),na.rm=T)

      # empirical p value for true value
      ss_data[Pos1==i & Pos2==i,beta_p := sum(beta_sampled >= beta_score)/Nsamples]

    }

    # avoid -Inf if logging p values by setting those positions smaller than all random samples
    # to smallest non-zero pvalue (= the resolution of the permutation test)
    ss_data[alpha_p == 0 ,alpha_p := 1/Nsamples]
    ss_data[beta_p == 0 ,beta_p := 1/Nsamples]

    ###########################################################
    ### call secondary structure elements from propensities ###
    ###########################################################

    ### get alpha helices and beta strand p values (per-position table: diagonal entries only)
    setkey(ss_data,Pos1)
    ss_strands = ss_data[Pos1==Pos2,.(Pos1,alpha_p,beta_p)]

    # compute sumlog (Fisher's method, combined p-values) for seeds of alpha helix
    # each seed combines the p values of seed_size consecutive positions centered on mid_idx
    if (seed_size > 1) {
      for (mid_idx in ss_strands[!is.na(alpha_p),Pos1]) {
        ss_strands[Pos1==mid_idx,alpha_p_seed := ss_strands[between(Pos1,mid_idx-(seed_size-1)/2,mid_idx+(seed_size-1)/2),
                                                            ifelse(sum(!is.na(alpha_p)) > 1,sumlog(alpha_p[!is.na(alpha_p)])$p,alpha_p[!is.na(alpha_p)])]]
      }
    } else {
      ss_strands[,alpha_p_seed := alpha_p]
    }

    # compute sumlog for seeds of beta strands
    if (seed_size > 1) {
      for (mid_idx in ss_strands[!is.na(beta_p),Pos1]) {
        ss_strands[Pos1==mid_idx,beta_p_seed := ss_strands[between(Pos1,mid_idx-(seed_size-1)/2,mid_idx+(seed_size-1)/2),
                                                           ifelse(sum(!is.na(beta_p)) > 1,sumlog(beta_p[!is.na(beta_p)])$p,beta_p[!is.na(beta_p)])]]
      }
    } else {
      ss_strands[,beta_p_seed := beta_p]
    }

    # set p-values NA if other structure is more probable
    # *_p0 columns keep the pre-masking values for plotting (dashed lines below)
    ss_strands[,beta_p0 := beta_p]
    ss_strands[,alpha_p0 := alpha_p]
    ss_strands[,beta_p_seed0 := beta_p_seed]
    ss_strands[,alpha_p_seed0 := alpha_p_seed]
    # terminal positions have no centered seed; decide them from the nearest full seed
    if (seed_size > 1) {
      if (ss_strands[1+(seed_size-1)/2]$alpha_p_seed < ss_strands[1+(seed_size-1)/2]$beta_p_seed) {
        ss_strands[1,':=' (beta_p = NA, beta_p_seed = NA)] } else {
          ss_strands[1,':=' (alpha_p = NA, alpha_p_seed = NA)] }
      if (ss_strands[.N-(seed_size-1)/2]$alpha_p_seed < ss_strands[.N-(seed_size-1)/2]$beta_p_seed) {
        ss_strands[.N,':=' (beta_p = NA, beta_p_seed = NA)] } else {
          ss_strands[.N,':=' (alpha_p = NA, alpha_p_seed = NA)] }
    }
    ss_strands[alpha_p_seed < beta_p_seed,':=' (beta_p_seed = NA, beta_p = NA)]
    ss_strands[beta_p_seed < alpha_p_seed,':=' (alpha_p_seed = NA, alpha_p = NA)]

    # delete stretches smaller 5 for alpha helices, set NA
    # (minimum helix length; rleid groups consecutive non-NA runs)
    ss_strands[ss_strands[,.(stretch = rleid(!is.na(alpha_p_seed)),not_na = !is.na(alpha_p_seed))][,.(short = .N<5 & not_na == T),stretch][,short],
               ':=' (alpha_p =NA,alpha_p_seed = NA)]
    # delete stretches smaller 3 for beta strands, set NA
    ss_strands[ss_strands[,.(stretch = rleid(!is.na(beta_p_seed)),not_na = !is.na(beta_p_seed))][,.(short = .N<3 & not_na == T),stretch][,short],
               ':=' (beta_p = NA,beta_p_seed = NA)]



    ## alpha helices:
    # identify most significant stretches from seeds and expand them outwards
    helper = identify_expand_seeds(ss_strands[,.(pos=Pos1,p_ind=alpha_p,p_seed=alpha_p_seed)],seed_size,p_detection_threshold)
    # merge
    ss_strands = merge(ss_strands,helper[,.(Pos1=pos,alpha_strand=strand,alpha_strand_p=p_strand)],by="Pos1")

    ## beta strands:
    # identify most significant stretches from seeds
    helper = identify_expand_seeds(ss_strands[,.(pos=Pos1,p_ind=beta_p,p_seed = beta_p_seed)],seed_size,p_detection_threshold)
    # merge
    ss_strands = merge(ss_strands,helper[,.(Pos1=pos,beta_strand=strand,beta_strand_p=p_strand)],by="Pos1")
    # Save
    if(return_list){
      saved_objects[["secondary_structure_score"]][[eval_cols[eval_cols_idx]]] <- copy(ss_strands)
    }

    ### record predictions across input data
    # per-position classification: H = helix, E = strand, C = coil (neither)
    if (eval_cols_idx == 1) {
      if (debug_this) {browser()}
      secondary_structure = ss_strands[,.(Pos = Pos1,ss = ifelse(is.na(alpha_strand) & is.na(beta_strand),"C",ifelse(!is.na(alpha_strand),"H","E")))]
      names(secondary_structure)[2] = eval_cols[1]
    } else {
      secondary_structure = merge(secondary_structure,
                                  ss_strands[,.(Pos = Pos1,ss = ifelse(is.na(alpha_strand) & is.na(beta_strand),"C",ifelse(!is.na(alpha_strand),"H","E")))],
                                  by = "Pos")
      names(secondary_structure)[eval_cols_idx+1] = eval_cols[eval_cols_idx]
    }


    #############
    ## compare to known_SS
    #############
    if (!is.null(known_SS)) {
      known_ss_DT = fread(known_SS)
      names(known_ss_DT)[2] = "SS"
      known_ss_DT[,rleidx := rleid(SS)]
      # append one dummy row per SS class so all three legend levels exist in the plot
      known_ss_DT = rbind(known_ss_DT,data.table(Pos = nrow(known_ss_DT)+1, SS = "C", rleidx = max(known_ss_DT$rleidx)+1))
      known_ss_DT = rbind(known_ss_DT,data.table(Pos = nrow(known_ss_DT)+1, SS = "E", rleidx = max(known_ss_DT$rleidx)+1))
      known_ss_DT = rbind(known_ss_DT,data.table(Pos = nrow(known_ss_DT)+1, SS = "H", rleidx = max(known_ss_DT$rleidx)+1))
    }

    if (debug_this) {browser()}
    ##################################################
    ##### plot secondary structure element predictions
    require(cowplot)
    theme_set(theme_classic())
    # solid lines: masked seed p values; dashed lines: unmasked (pre-competition) seed p values
    P1 = ggplot(data=ss_strands) +
      geom_line(aes(Pos1,y=alpha_p_seed0),color="darkgreen",linetype=2) +
      geom_line(aes(Pos1,y=alpha_p_seed),color="darkgreen") +

      geom_line(aes(Pos1,y=beta_p_seed0),color="red",linetype=2) +
      geom_line(aes(Pos1,y=beta_p_seed),color="red") +

      geom_hline(yintercept=0.05,linetype=3) +
      geom_rect(data=unique(ss_strands[!is.na(alpha_strand),.(xmin=min(Pos1)-0.5,xmax=max(Pos1)+0.5,ymin=max(alpha_strand_p,10^-4),ymax=1),alpha_strand]),
                inherit.aes = F,
                aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax,group=alpha_strand),fill="green",alpha=0.2) +
      geom_rect(data=unique(ss_strands[!is.na(beta_strand),.(xmin=min(Pos1)-0.5,xmax=max(Pos1)+0.5,ymin=max(beta_strand_p,10^-4),ymax=1),beta_strand]),
                inherit.aes = F,
                aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax,group=beta_strand),fill="orange",alpha=0.2) +
      scale_x_continuous(breaks=seq(5,pos_range[2],5),expand = c(0,0)) +
      coord_cartesian(xlim = c(pos_range[1]-0.5,pos_range[2]+0.5)) +
      # scale_y_log10(breaks = c(10^-seq(-10,0))) +
      scale_y_log10(breaks = c(10^-seq(0,ss_strands[,-log10(min(c(alpha_p_seed,beta_p_seed,10^-4),na.rm=T))])),expand = c(0.01,0)) +
      labs(y="p value",title = eval_cols[eval_cols_idx])

    if (!is.null(known_SS)) { # add known secondary structure as a colored track at y = 1.5
      P1 = P1 +
        geom_segment(data = known_ss_DT[,.(start = min(Pos)-0.5,end = max(Pos)+0.5,ss=unique(SS)),rleidx],
                     aes(x=start,y=1.5,xend=end,yend=1.5,color=ss,size=ss),show.legend = F) +
        scale_size_manual(breaks = c("C","E","H"),values = c(0.5,1.5,1.5)) +
        scale_color_manual(breaks = c("C","E","H"),values = c("black","red","darkgreen"))
    }

    ##################################################
    ###### plot smoothed data around diagonal
    setkey(ss_data,Pos1,Pos2)

    # gaussian smoothing along the diagonal direction only (ham_perp == 0)
    for (i in pos_range[1]:pos_range[2]) {
      for (j in i:pos_range[2]) {
        ss_data[,ham := abs(Pos1-i) + abs(Pos2-j)]
        ss_data[,ham_perp := abs(Pos1-i - (Pos2-j))]
        ss_data[,ham_diag := abs(Pos1-i + Pos2-j)]

        ss_data[,weight := as.double(NA)]
        ss_data[ham_perp==0,weight := exp(-scale_long*ham_diag^2)]
        ss_data[Pos1==Pos2,weight := NA]
        ss_data[is.na(input),weight := NA]
        # calculate true value
        ss_data[Pos1==i & Pos2==j,score := ss_data[ham_perp==0,sum(input*weight,na.rm=T)/sum(weight,na.rm=T)]]
      }
    }
    # normalize scores per anti-diagonal band by subtracting the local band average
    avg = 1.5
    for (i in ss_data[,unique(ham_diag)]) {
      ss_data[ham_diag == i & ham_perp < 8,score_norm := score-ss_data[between(ham_diag,i-avg,i+avg) & ham_perp < 10,mean(score,na.rm=T)]]
      ss_data[ham_diag == i & ham_perp < 8,input_norm := input-ss_data[between(ham_diag,i-avg,i+avg) & ham_perp < 10 & abs(input) != Inf,mean(input,na.rm=T)]]
    }
    # limit data range for better comparability (clip symmetric outliers)
    # NOTE(review): the next statement was garbled in extraction; reconstructed as a symmetric
    # clipping threshold from the normalized values -- TODO confirm against the original source
    cutoff = quantile(abs(c(ss_data[Pos1<Pos2,input_norm],ss_data[Pos1<Pos2,score_norm])),probs = 0.95,na.rm=T)
    ss_data[input_norm > cutoff, input_norm := cutoff]
    ss_data[input_norm < -cutoff, input_norm := -cutoff]
    ss_data[score_norm > cutoff, score_norm := cutoff]
    ss_data[score_norm < -cutoff, score_norm := -cutoff]

    # raw (normalized) data in the upper triangle, smoothed scores mirrored into the lower triangle
    P2 = ggplot(ss_data[ham_perp < 8 & ham_perp != 0]) +
      geom_raster(aes(Pos1,Pos2,fill=input_norm)) +
      geom_raster(aes(Pos2,Pos1,fill=score_norm)) +
      scale_fill_gradient2(midpoint=0,low="tomato3",high="steelblue3",na.value = "white") +
      scale_y_continuous(breaks=seq(1,8,1),expand = c(0,0)) +
      scale_x_continuous(breaks=seq(5,pos_range[2],5),expand = c(0,0)) +
      coord_cartesian(xlim = c(pos_range[1]-0.5,pos_range[2]+0.5),ylim = c(pos_range[1]-0.5,pos_range[2]+0.5)) +
      labs(x = "position",y = "diagonal position",fill = "score") +
      geom_segment(data=unique(ss_strands[!is.na(alpha_strand),.(x=min(Pos1)-0.5,xend=max(Pos1)+0.5,y=min(Pos1)-0.5,yend=max(Pos1)+0.5),alpha_strand]),
                   inherit.aes = F,
                   aes(x=x,xend=xend,y=y,yend=yend,group=alpha_strand),color="darkgreen",size=1.5) +
      geom_segment(data=unique(ss_strands[!is.na(beta_strand),.(x=min(Pos1)-0.5,xend=max(Pos1)+0.5,y=min(Pos1)-0.5,yend=max(Pos1)+0.5),beta_strand]),
                   inherit.aes = F,
                   aes(x=x,xend=xend,y=y,yend=yend,group=beta_strand),color="red",size=1.5)

    plot_grid(P1,P2,nrow=1)

    ggsave(file=paste0(dataset_dir,"results/secondary_structure/",prefix,eval_cols[eval_cols_idx],"_SSelements.pdf"),width=8.5,height=4)

    if(return_list){
      saved_objects[["plot_objects"]][[eval_cols[eval_cols_idx]]] <- list("P1" = P1, "P2" = P2)
    }
  }

  # NOTE(review): the output path is passed positionally; since x is matched by name it lands on
  # write.table()'s `file` parameter -- works, but unconventional argument order
  write.table(paste0(dataset_dir,"processed_data/",prefix,"secondary_structure_prediction.txt"),
              x = secondary_structure,quote = F,row.names = F,col.names = T)

  # Return objects
  if(return_list){
    saved_objects[["secondary_structure"]] <- secondary_structure
    return(saved_objects)
  }else{
    return(secondary_structure)
  }
}
-------------------------------------------------------------------------------- /scripts/quantile_fitness_surface_adaptive.R: --------------------------------------------------------------------------------
### calculate quantile fitness surfaces
# subfunction for the call_epistasis class of functions
quantile_fitness_surface_adaptive = function(DT,Nq,Nv,span,Q) {

  # calculate quantile surface approximation on regular grid (defined by quantiles) given a sampled number of variants
  # DT: data.table with columns fitness1, fitness2, fitness_norm
  # Nq: number of grid points per axis (grid has Nq^2 points)
  # Nv: maximum number of variants used for the quantile estimation (downsampled if more)
  # span: fraction of nearest variants around each grid point used by surface_at_gridpoint
  # Q: lower quantile level; the surface reports the Q, 0.5 and 1-Q quantiles

  # define grid vector spanning the full fitness range of both axes
  # (+0.015 so the top variant falls strictly inside the grid)
  q = seq(min(c(quantile(DT[,fitness1],probs=0,na.rm=T),quantile(DT[,fitness2],probs=0,na.rm=T))),
          max(c(quantile(DT[,fitness1],probs = 1,na.rm=T),quantile(DT[,fitness2],probs = 1,na.rm=T)))+0.015,
          length.out=Nq)

  # Fq: data.table for surface values at each gridpoint
  Fq = data.table(fitness1=rep(q,length(q)),fitness2=rep(q,each=length(q)))

  # initialize different surface columns (median, upper, lower)
  Fq[,F_median := as.numeric(NA)]
  Fq[,F_lower := as.numeric(NA)]
  Fq[,F_upper := as.numeric(NA)]

  # downsample variants to value given by Nv (or use all variants if number of variants is smaller Nv)
  subDT = DT[sample(nrow(DT),min(c(Nv,nrow(DT)))),.(fitness1,fitness2,fitness_norm)]

  # run this in parallel for all grid points
  require(parallel)
  # Use the detectCores() function to find the number of cores in system
  # NOTE(review): on a single-core machine this gives 0 and makeCluster(0) fails -- TODO guard with max(1, ...)
  no_cores <- detectCores()-1
  clust <- makeCluster(no_cores)
  # make variables available to each core's workspace
  # (surface_at_gridpoint reads subDT, span, Fq, Q as globals on the workers)
  clusterExport(clust, list("subDT","span","Fq","Q"),envir = environment())
  helper = parSapply(clust,X = 1:Nq^2, surface_at_gridpoint)
  stopCluster(clust)
  # transfer results from helper (3 x Nq^2 matrix: median, lower, upper per grid point) to Fq
  Fq[,F_median := helper[1,]]
  Fq[,F_lower := helper[2,]]
  Fq[,F_upper := helper[3,]]

  # > loess fit regular grid surface
  # NOTE(review): loess span is hard-coded to 0.2 here, independent of the `span` argument -- confirm intended
  List = list()
  List$F_median_fit = loess(F_median ~ fitness1 + fitness2,data = Fq,span=0.2)
  List$F_lower_fit = loess(F_lower ~ fitness1 + fitness2,data = Fq,span=0.2)
  List$F_upper_fit = loess(F_upper ~ fitness1 + fitness2,data = Fq,span=0.2)

  List$Fq = Fq
  return(List)
}
-------------------------------------------------------------------------------- /scripts/surface_at_gridpoint.R: --------------------------------------------------------------------------------
### calculate fitness quantiles for nearest neighbours of gridpoint
# subfunction for call_epistasis class of functions
# i: linear index into the grid table Fq
# relies on subDT, Fq, span, Q being present in the calling/worker environment
# (exported via clusterExport in quantile_fitness_surface_adaptive)
# returns c(median, lower-Q, upper-(1-Q)) fitness_norm quantiles of the `span`
# fraction of variants closest (euclidean distance in fitness space) to grid point i
surface_at_gridpoint = function(i) {
  require(data.table)
  A = unlist(subDT[,.(D=sqrt((fitness1-Fq[i,fitness1])^2 +(fitness2-Fq[i,fitness2])^2),fitness_norm)][D < quantile(D,probs=span,na.rm=T),
                   .(quantile(fitness_norm,p=c(0.5,Q,1-Q),na.rm=T))])
  return(A)
}
-------------------------------------------------------------------------------- /scripts/switch_double_DT.R: --------------------------------------------------------------------------------
# double data.table while inter-switching specific columns; e.g.
# when complementing DMS datatable such that Pos1 and Pos2 are symmetric
switch_double_DT = function(DT,cols_switchdouble,cols_double) {
  # Double a data.table by appending a copy in which each column pair in
  # cols_switchdouble is swapped, while cols_double columns are repeated unchanged.
  #
  # DT: input data.table
  # cols_switchdouble: list of column-name pairs, e.g. list(c("Pos1","Pos2"),c("WT_AA1","WT_AA2"))
  # cols_double: character vector of plain column names to replicate as-is
  #              (assumes these are existing column names, as used by the pipeline callers -- TODO confirm)
  #
  # Returns a new data.table with 2*nrow(DT) rows; column order is the switched
  # pairs (in list order) followed by cols_double, matching the original implementation.
  #
  # Rewritten without eval(parse()) on pasted strings: the string-building approach
  # broke for non-syntactic column names and crashed on an empty cols_switchdouble
  # list (1:length(...) iterating over c(1,0)); direct column construction handles both.
  out = list()
  for (pair in cols_switchdouble) {
    # appended half gets the partner column's values -> symmetric table
    out[[pair[1]]] = c(DT[[pair[1]]], DT[[pair[2]]])
    out[[pair[2]]] = c(DT[[pair[2]]], DT[[pair[1]]])
  }
  for (col in cols_double) {
    # same values in both halves (equivalent to data.table recycling in the original j-expression)
    out[[col]] = c(DT[[col]], DT[[col]])
  }
  return(as.data.table(out))
}
--------------------------------------------------------------------------------