├── FOSJUN └── dataset │ ├── PDB │ └── 1fos_1.pdb │ └── elife-32472-supp1-v2.txt ├── FOSJUN_pipeline.R ├── GB1 └── dataset │ ├── GB1_sequence.fasta │ ├── Olson2014_TableS2_doubles.txt │ ├── Olson2014_TableS2_singles.txt │ ├── Olson2014_TableS2_wildtype.txt │ ├── PDB │ ├── 1pga.pdb │ └── g_xray.pdb │ └── PSIPRED │ ├── gb1.psipass2 │ ├── gb1.psipred.pdf │ ├── gb1.psipred.ps │ └── gb1.ss2 ├── GB1_downsampling └── dataset │ ├── GB1_CDS_nt │ └── amino_acid_codon_conversion.txt ├── GB1_downsampling_pipeline.R ├── GB1_pipeline.R ├── LICENSE ├── README.md ├── RRM └── dataset │ ├── PDB │ └── 1cvj.pdb │ ├── PSIPRED │ ├── PAB1.psipass2 │ ├── PAB1.psipred.pdf │ ├── PAB1.psipred.ps │ └── PAB1.ss2 │ ├── RRM_domain_sequence.fasta │ └── Supplementary_Table_5_doubles.txt ├── RRM_pipeline.R ├── WW ├── WW_pipeline.R └── dataset │ ├── PDB │ ├── 1k9q.pdb │ ├── 1k9q_model1_6to29.pdb │ └── 1k9q_model1_mod_manual.pdb │ ├── PSIPRED │ ├── WW1.psipass2 │ ├── WW1.psipred.pdf │ ├── WW1.psipred.ps │ └── WW1.ss2 │ ├── WW_sequence.fasta │ └── bash_scripts │ ├── 001_bash_download_SRA_data.sh │ ├── 002_bash_fastq_dump.sh │ ├── 003_bash_fastqc.sh │ ├── 004_bash_usearch_pairedreadmerging_Q20_ee0p1.sh │ └── 005_bash_usearch_fastx_unique_Q20.sh ├── WW_pipeline.R └── scripts ├── SS_from_PSIPRED.R ├── XPLOR ├── XPLOR_modeling_functions_v2.R ├── XPLOR_simulations.R ├── anneal_template.py └── refine_template.py ├── XPLOR_wrapper.R ├── analyse_XPLOR_results.R ├── calculate_pairwise_interaction_scores.R ├── call_epistasis_binary.R ├── convert_AAabr_one_three.R ├── create_directory_structure.R ├── deepcontact_transform_basic2d.R ├── epistasis_analytics.R ├── evaluate_contacts_vs_PDB.R ├── hbonds_from_betasheetpairing.R ├── identify_expand_seeds.R ├── misc ├── call_epistasis_symdata_v1.R ├── contact_matrix_from_pairdistances.R ├── kernel_structure_propensity.R ├── pairdistances_from_PDB_crystal.R ├── tau_specificity_score.R └── tile_heatmap_wrapper.R ├── pairdistances_from_PDB.R ├── plot_fitness_surface.R ├── 
predict_beta_sheets.R ├── predict_secondary_structure_elements.R ├── quantile_fitness_surface_adaptive.R ├── surface_at_gridpoint.R └── switch_double_DT.R /GB1/dataset/GB1_sequence.fasta: -------------------------------------------------------------------------------- 1 | >GB1 2 | MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE -------------------------------------------------------------------------------- /GB1/dataset/Olson2014_TableS2_singles.txt: -------------------------------------------------------------------------------- 1 | WT amino acid Position Mutation Input Count Selection Count Q 2 A 14663 38476 Q 2 C 13001 23023 Q 2 D 11488 18085 Q 2 E 9501 15629 Q 2 F 4770 13332 Q 2 G 12460 27778 Q 2 H 33615 71252 Q 2 I 13180 28931 Q 2 K 10166 23382 Q 2 L 40106 95276 Q 2 M 8287 20694 Q 2 N 20166 39445 Q 2 P 68124 167686 Q 2 R 37351 99959 Q 2 S 25071 55253 Q 2 T 24951 57168 Q 2 V 13922 28304 Q 2 W 4687 11796 Q 2 Y 11961 28793 Y 3 A 24572 24579 Y 3 C 4869 5187 Y 3 D 11538 190 Y 3 E 7916 121 Y 3 F 12855 23428 Y 3 G 9583 564 Y 3 H 34286 68210 Y 3 I 11006 5473 Y 3 K 12120 1089 Y 3 L 41550 56805 Y 3 M 9407 9818 Y 3 N 21962 4221 Y 3 P 115397 13682 Y 3 Q 19928 6234 Y 3 R 30444 4311 Y 3 S 38003 18224 Y 3 T 45311 11803 Y 3 V 13326 7033 Y 3 W 5089 9798 K 4 A 22568 27370 K 4 C 4357 5731 K 4 D 10377 10437 K 4 E 8752 9196 K 4 F 13134 17305 K 4 G 14831 14501 K 4 H 11157 14510 K 4 I 23783 39679 K 4 L 31238 71838 K 4 M 16821 27867 K 4 N 23440 28059 K 4 P 72825 4085 K 4 Q 8296 12864 K 4 R 22469 39647 K 4 S 31978 41200 K 4 T 47044 68114 K 4 V 18075 30181 K 4 W 4507 7256 K 4 Y 13209 17632 L 5 A 102479 100175 L 5 C 27859 37553 L 5 D 32890 222 L 5 E 26930 338 L 5 F 16197 14567 L 5 G 47338 18272 L 5 H 92720 29969 L 5 I 15033 5354 L 5 K 23710 568 L 5 M 34946 22256 L 5 N 29983 20241 L 5 P 133938 1649 L 5 Q 61046 21300 L 5 R 96469 1706 L 5 S 96274 71799 L 5 T 94325 95229 L 5 V 43625 54635 L 5 W 13800 1148 L 5 Y 47702 6813 I 6 A 3681 4673 I 6 C 15717 23476 I 6 D 5259 6253 I 6 E 4087 
8062 I 6 F 8610 18820 I 6 G 9897 19596 I 6 H 26635 50187 I 6 K 3704 6958 I 6 L 51935 127409 I 6 M 21205 30447 I 6 N 5283 7607 I 6 P 49132 349 I 6 Q 36062 68637 I 6 R 61986 117930 I 6 S 29356 39046 I 6 T 30894 57423 I 6 V 3536 6449 I 6 W 23812 50273 I 6 Y 22409 54649 L 7 A 34404 55458 L 7 C 12449 22293 L 7 D 11778 477 L 7 E 18267 4238 L 7 F 6309 4059 L 7 G 35071 16207 L 7 H 15664 1471 L 7 I 12610 37038 L 7 K 13270 5978 L 7 M 17642 12594 L 7 N 12614 4505 L 7 P 50684 45596 L 7 Q 20595 6443 L 7 R 49573 10097 L 7 S 40306 44544 L 7 T 44897 66023 L 7 V 24444 53108 L 7 W 12875 1310 L 7 Y 10612 2241 N 8 A 41498 73325 N 8 C 10419 19998 N 8 D 11498 18688 N 8 E 20466 33074 N 8 F 7730 21302 N 8 G 27388 43321 N 8 H 11762 26546 N 8 I 7929 12477 N 8 K 14407 32239 N 8 L 39548 65846 N 8 M 11905 15333 N 8 P 38933 1496 N 8 Q 19093 36218 N 8 R 44725 109754 N 8 S 36002 58381 N 8 T 36479 41620 N 8 V 24209 37603 N 8 W 13454 29498 N 8 Y 8536 23340 G 9 A 65857 90820 G 9 C 14763 16172 G 9 D 23794 5463 G 9 E 16766 3278 G 9 F 13660 6108 G 9 H 14290 5115 G 9 I 29687 7088 G 9 K 21971 4655 G 9 L 35873 12146 G 9 M 28714 7328 G 9 N 34727 7288 G 9 P 46102 18963 G 9 Q 10714 2366 G 9 R 38107 16124 G 9 S 59986 42587 G 9 T 83378 44105 G 9 V 39908 10779 G 9 W 10074 4696 G 9 Y 15887 6886 K 10 A 21539 35206 K 10 C 8456 14389 K 10 D 23660 39063 K 10 E 15907 24117 K 10 F 9023 15345 K 10 G 38207 44504 K 10 H 19516 37164 K 10 I 11121 20274 K 10 L 32545 59716 K 10 M 17008 20959 K 10 N 26632 52790 K 10 P 18073 12463 K 10 Q 10356 17632 K 10 R 39021 106293 K 10 S 28845 45028 K 10 T 20743 38733 K 10 V 30739 58818 K 10 W 9949 19051 K 10 Y 16988 30337 T 11 A 21240 12806 T 11 C 16220 11209 T 11 D 18836 10728 T 11 E 21981 11689 T 11 F 17814 11918 T 11 G 51023 24644 T 11 H 29467 19328 T 11 I 13690 8447 T 11 K 7671 5108 T 11 L 52385 32984 T 11 M 13196 5420 T 11 N 11666 8132 T 11 P 62555 19555 T 11 Q 59534 38097 T 11 R 84936 60451 T 11 S 53756 68761 T 11 V 17090 11551 T 11 W 23128 16088 T 11 Y 20849 14953 L 12 A 80686 
25231 L 12 C 23994 12420 L 12 D 37546 8641 L 12 E 44908 12601 L 12 F 21145 13767 L 12 G 126077 39005 L 12 H 48068 15096 L 12 I 38122 29613 L 12 K 36433 12378 L 12 M 41835 20147 L 12 N 51115 12577 L 12 P 92976 16826 L 12 Q 40010 14668 L 12 R 133367 45200 L 12 S 88507 27502 L 12 T 77923 30648 L 12 V 61316 35207 L 12 W 33165 9147 L 12 Y 41640 9491 K 13 A 51090 87056 K 13 C 11935 24232 K 13 D 16942 16219 K 13 E 23256 33571 K 13 F 15918 22650 K 13 G 16015 20792 K 13 H 17858 24023 K 13 I 18456 28325 K 13 L 53619 58806 K 13 M 17213 26339 K 13 N 24914 48289 K 13 P 92147 28369 K 13 Q 29766 48077 K 13 R 41546 101318 K 13 S 55067 151624 K 13 T 73085 212720 K 13 V 30084 56440 K 13 W 13803 44516 K 13 Y 16363 17424 G 14 A 59036 22637 G 14 C 15405 5946 G 14 D 19272 4508 G 14 E 24981 10605 G 14 F 18216 3836 G 14 H 28633 10268 G 14 I 15815 5793 G 14 K 15236 3390 G 14 L 73241 17722 G 14 M 19303 2655 G 14 N 18163 6746 G 14 P 107029 15918 G 14 Q 27506 9577 G 14 R 60734 17476 G 14 S 53429 19145 G 14 T 55458 23123 G 14 V 35098 14502 G 14 W 13041 6159 G 14 Y 17488 5065 E 15 A 35374 54806 E 15 C 11999 25648 E 15 D 25843 27352 E 15 F 13347 28524 E 15 G 43958 38387 E 15 H 26373 76960 E 15 I 21230 51279 E 15 K 19141 35533 E 15 L 66676 111516 E 15 M 23308 33648 E 15 N 28746 40639 E 15 P 45923 16911 E 15 Q 21882 51881 E 15 R 51673 127737 E 15 S 45073 119282 E 15 T 37041 94156 E 15 V 41378 106208 E 15 W 13024 32676 E 15 Y 17783 51121 T 16 A 38128 29083 T 16 C 18196 16997 T 16 D 12383 4727 T 16 E 27018 21493 T 16 F 20611 13948 T 16 G 113760 91632 T 16 H 20554 19280 T 16 I 22641 32740 T 16 K 14421 20458 T 16 L 86808 114278 T 16 M 42047 39266 T 16 N 10693 8357 T 16 P 90794 66331 T 16 Q 40611 48881 T 16 R 106367 184675 T 16 S 71248 69455 T 16 V 57504 73376 T 16 W 39584 20763 T 16 Y 15737 10608 T 17 A 49502 105171 T 17 C 12726 24562 T 17 D 23175 17511 T 17 E 26208 32446 T 17 F 19124 36535 T 17 G 77342 250890 T 17 H 29294 67973 T 17 I 21379 26724 T 17 K 17966 31951 T 17 L 45193 66035 T 17 M 16394 
26591 T 17 N 24237 32505 T 17 P 46699 28322 T 17 Q 16481 31164 T 17 R 57501 142948 T 17 S 60126 120151 T 17 V 42924 57540 T 17 W 16674 41918 T 17 Y 17209 33102 T 18 A 24941 47753 T 18 C 12780 26146 T 18 D 14900 7125 T 18 E 18261 8260 T 18 F 10364 16480 T 18 G 31188 34862 T 18 H 15556 27435 T 18 I 12371 18372 T 18 K 13299 26175 T 18 L 40817 41499 T 18 M 12954 8022 T 18 N 14761 20680 T 18 P 32959 2273 T 18 Q 17438 20418 T 18 R 40452 84029 T 18 S 43099 70606 T 18 V 28486 56671 T 18 W 11084 18827 T 18 Y 11917 19875 E 19 A 18824 47540 E 19 C 4896 11333 E 19 D 11420 23477 E 19 F 7277 19654 E 19 G 25998 63362 E 19 H 5505 14168 E 19 I 8639 23671 E 19 K 10812 29121 E 19 L 20503 54766 E 19 M 11552 23452 E 19 N 13257 34536 E 19 P 18245 48044 E 19 Q 9046 23201 E 19 R 17346 48488 E 19 S 23583 57158 E 19 T 20505 49629 E 19 V 11313 30847 E 19 W 2977 8035 E 19 Y 5374 14191 A 20 C 24537 37806 A 20 D 20563 9947 A 20 E 15652 10857 A 20 F 16676 7008 A 20 G 44945 71573 A 20 H 42209 43445 A 20 I 19267 11063 A 20 K 15365 43568 A 20 L 77574 53578 A 20 M 24103 22390 A 20 N 26540 36162 A 20 P 51963 27499 A 20 Q 28547 43826 A 20 R 64542 264673 A 20 S 51408 99798 A 20 T 41910 78941 A 20 V 44986 82564 A 20 W 11207 7651 A 20 Y 24835 13413 V 21 A 13650 23326 V 21 C 14099 26854 V 21 D 10031 12945 V 21 E 19264 21930 V 21 F 13024 30576 V 21 G 61067 118204 V 21 H 15286 28460 V 21 I 24563 48919 V 21 K 20400 35012 V 21 L 79943 141666 V 21 M 47124 62252 V 21 N 8333 15994 V 21 P 55694 88295 V 21 Q 33194 55950 V 21 R 65544 175477 V 21 S 46233 79777 V 21 T 56564 93099 V 21 W 30867 77314 V 21 Y 11617 27353 D 22 A 9047 25911 D 22 C 10972 27342 D 22 E 16317 44052 D 22 F 6210 15240 D 22 G 23209 59545 D 22 H 11342 27861 D 22 I 19813 49271 D 22 K 13479 24719 D 22 L 34282 88054 D 22 M 20338 40916 D 22 N 16267 35157 D 22 P 16807 29528 D 22 Q 13803 37575 D 22 R 28638 61579 D 22 S 32697 70745 D 22 T 20286 48870 D 22 V 24190 64599 D 22 W 13688 38811 D 22 Y 9194 22559 A 23 C 18057 10764 A 23 D 11942 8380 A 23 E 17648 
8102 A 23 F 21097 302 A 23 G 34128 30318 A 23 H 12892 1590 A 23 I 16994 2500 A 23 K 16460 1807 A 23 L 60733 8751 A 23 M 17224 1063 A 23 N 13924 13401 A 23 P 77434 143942 A 23 Q 15496 2201 A 23 R 47171 4880 A 23 S 56880 84047 A 23 T 36646 21766 A 23 V 39290 17221 A 23 W 19887 201 A 23 Y 13372 157 A 24 C 10754 36229 A 24 D 13144 24369 A 24 E 16991 103524 A 24 F 16392 103489 A 24 G 39536 168270 A 24 H 5846 37594 A 24 I 18686 47237 A 24 K 11269 3056 A 24 L 32451 73148 A 24 M 16651 61844 A 24 N 11510 48609 A 24 P 18147 108794 A 24 Q 7560 36593 A 24 R 24555 20139 A 24 S 34534 124522 A 24 T 26572 111754 A 24 V 53445 150842 A 24 W 9271 68725 A 24 Y 8272 71811 T 25 A 46590 93295 T 25 C 37737 91653 T 25 D 49759 32504 T 25 E 32850 20013 T 25 F 54190 167843 T 25 G 55253 88914 T 25 H 36621 89699 T 25 I 40464 105691 T 25 K 33520 71394 T 25 L 89027 172970 T 25 M 36011 127671 T 25 N 64696 106237 T 25 P 28742 59440 T 25 Q 23603 67274 T 25 R 62651 255882 T 25 S 85249 217570 T 25 V 64589 106108 T 25 W 20589 77262 T 25 Y 64615 129609 A 26 C 40939 52103 A 26 D 16721 171 A 26 E 21980 281 A 26 F 36840 476 A 26 G 55786 111192 A 26 H 36742 414 A 26 I 47218 12166 A 26 K 30044 275 A 26 L 153776 2683 A 26 M 77582 16353 A 26 N 17271 424 A 26 P 110496 44600 A 26 Q 59433 6474 A 26 R 112216 1777 A 26 S 97456 175161 A 26 T 89974 68569 A 26 V 47681 26746 A 26 W 65263 2807 A 26 Y 33011 463 E 27 A 33597 134 E 27 C 25543 227 E 27 D 27258 164 E 27 F 23278 87 E 27 G 62168 332 E 27 H 31349 152 E 27 I 41336 218 E 27 K 17150 73 E 27 L 88198 379 E 27 M 36178 152 E 27 N 28919 142 E 27 P 49182 300 E 27 Q 21151 90 E 27 R 83788 339 E 27 S 70701 295 E 27 T 41217 210 E 27 V 70819 345 E 27 W 17196 71 E 27 Y 25091 126 K 28 A 26191 2268 K 28 C 23792 5594 K 28 D 16711 145 K 28 E 16630 150 K 28 F 14273 3425 K 28 G 31664 2623 K 28 H 17790 3067 K 28 I 8669 4232 K 28 L 42503 14362 K 28 M 15445 3448 K 28 N 20952 1555 K 28 P 35132 4458 K 28 Q 21106 1918 K 28 R 49893 36868 K 28 S 52024 3631 K 28 T 33309 5330 K 28 V 20079 
5930 K 28 W 17099 1177 K 28 Y 20806 1597 V 29 A 39812 92273 V 29 C 19851 45232 V 29 D 23564 10442 V 29 E 25354 12525 V 29 F 15440 26765 V 29 G 68382 114448 V 29 H 15935 32754 V 29 I 17523 39268 V 29 K 13073 51292 V 29 L 52829 116555 V 29 M 26006 53730 V 29 N 14142 35376 V 29 P 38418 7455 V 29 Q 10223 23746 V 29 R 33965 158139 V 29 S 54027 118947 V 29 T 46829 102169 V 29 W 11810 15508 V 29 Y 20082 36706 F 30 A 18125 6748 F 30 C 10727 10717 F 30 D 9088 65 F 30 E 12864 105 F 30 G 28882 2388 F 30 H 12045 17363 F 30 I 14663 5071 F 30 K 13030 1544 F 30 L 61921 84159 F 30 M 21611 24474 F 30 N 13478 3499 F 30 P 14188 144 F 30 Q 9211 1261 F 30 R 27385 236 F 30 S 35649 21685 F 30 T 20023 3170 F 30 V 28681 20083 F 30 W 15150 11664 F 30 Y 15979 11560 K 31 A 55355 306 K 31 C 19770 223 K 31 D 22264 107 K 31 E 34179 166 K 31 F 25061 132 K 31 G 89526 447 K 31 H 28017 103 K 31 I 37026 239 K 31 L 59926 627 K 31 M 29810 558 K 31 N 19575 120 K 31 P 37140 233 K 31 Q 19850 122 K 31 R 96271 38897 K 31 S 58473 277 K 31 T 70080 283 K 31 V 58660 287 K 31 W 37743 306 K 31 Y 25248 95 Q 32 A 45737 62296 Q 32 C 25211 35086 Q 32 D 26730 2704 Q 32 E 16067 2097 Q 32 F 13104 5269 Q 32 G 41499 107395 Q 32 H 34805 69113 Q 32 I 23939 16270 Q 32 K 19405 43027 Q 32 L 53796 89371 Q 32 M 17386 24572 Q 32 N 27848 78606 Q 32 P 84178 2721 Q 32 R 68222 161777 Q 32 S 75023 117147 Q 32 T 59318 68119 Q 32 V 37714 27235 Q 32 W 15661 6587 Q 32 Y 23896 8581 Y 33 A 42937 14211 Y 33 C 13618 7227 Y 33 D 12938 450 Y 33 E 27209 2110 Y 33 F 14944 40764 Y 33 G 32478 17473 Y 33 H 27158 17449 Y 33 I 9098 3549 Y 33 K 15792 8926 Y 33 L 52029 59654 Y 33 M 12336 10217 Y 33 N 12552 4067 Y 33 P 121812 1734 Y 33 Q 27382 11808 Y 33 R 55133 28853 Y 33 S 49671 15665 Y 33 T 36843 10498 Y 33 V 30407 9053 Y 33 W 20801 23701 A 34 C 10374 18607 A 34 D 15003 109 A 34 E 22624 158 A 34 F 15606 339 A 34 G 45889 20105 A 34 H 12544 110 A 34 I 18933 1713 A 34 K 14995 92 A 34 L 49709 3575 A 34 M 21124 28072 A 34 N 15857 173 A 34 P 65070 440 A 34 
Q 14331 127 A 34 R 38446 770 A 34 S 66756 91257 A 34 T 63560 23065 A 34 V 41506 12859 A 34 W 12172 187 A 34 Y 13350 426 N 35 A 63720 1396 N 35 C 19456 1709 N 35 D 19194 226 N 35 E 23562 279 N 35 F 20382 650 N 35 G 53332 4777 N 35 H 10896 295 N 35 I 17748 977 N 35 K 20037 245 N 35 L 57502 1005 N 35 M 24975 530 N 35 P 25199 132 N 35 Q 17172 241 N 35 R 51659 1222 N 35 S 59069 5017 N 35 T 42623 7111 N 35 V 56124 1001 N 35 W 16332 844 N 35 Y 14304 375 D 36 A 37862 108638 D 36 C 13800 35933 D 36 E 24683 48688 D 36 F 14014 21786 D 36 G 40780 130232 D 36 H 20486 39282 D 36 I 17312 40222 D 36 K 16394 37048 D 36 L 54438 136708 D 36 M 22041 54362 D 36 N 14209 45759 D 36 P 61131 1491 D 36 Q 23100 53835 D 36 R 41259 121420 D 36 S 36216 104810 D 36 T 41883 105626 D 36 V 37504 107749 D 36 W 19650 29800 D 36 Y 17074 30458 N 37 A 10769 4178 N 37 C 6461 6480 N 37 D 5993 659 N 37 E 8814 2283 N 37 F 4969 492 N 37 G 13000 8525 N 37 H 14291 8514 N 37 I 13969 8967 N 37 K 27145 9793 N 37 L 30911 16367 N 37 M 16191 5886 N 37 P 18721 181 N 37 Q 12411 6013 N 37 R 37937 12594 N 37 S 32491 33449 N 37 T 9824 7775 N 37 V 10572 5024 N 37 W 17030 4636 N 37 Y 8329 1424 G 38 A 39224 67579 G 38 C 18699 17573 G 38 D 19563 19070 G 38 E 19515 15234 G 38 F 15474 15153 G 38 H 13944 11076 G 38 I 11316 4050 G 38 K 15497 11369 G 38 L 50251 53855 G 38 M 17289 13778 G 38 N 14985 12910 G 38 P 21785 259 G 38 Q 15375 12082 G 38 R 53827 39352 G 38 S 49899 51247 G 38 T 22571 18387 G 38 V 44184 31440 G 38 W 22393 23946 G 38 Y 11640 12772 V 39 A 50993 4710 V 39 C 29687 10729 V 39 D 27237 263 V 39 E 24644 1203 V 39 F 45582 25645 V 39 G 54685 677 V 39 H 10859 429 V 39 I 41684 89921 V 39 K 13498 406 V 39 L 59997 151290 V 39 M 22038 38487 V 39 N 21962 381 V 39 P 26222 350 V 39 Q 9603 1471 V 39 R 45394 1165 V 39 S 58447 2992 V 39 T 31568 11612 V 39 W 20952 325 V 39 Y 20930 2249 D 40 A 72913 261212 D 40 C 22805 62542 D 40 E 34701 44572 D 40 F 29759 135618 D 40 G 36838 47487 D 40 H 24747 113760 D 40 I 25622 68491 D 40 K 
24297 41248 D 40 L 71102 96247 D 40 M 30818 50279 D 40 N 13713 36886 D 40 P 47981 28789 D 40 Q 26309 43722 D 40 R 92738 227667 D 40 S 84297 272107 D 40 T 64079 111421 D 40 V 76641 222704 D 40 W 34963 203511 D 40 Y 23207 134736 G 41 A 40035 7202 G 41 C 27048 2450 G 41 D 28032 249 G 41 E 25625 246 G 41 F 28902 2890 G 41 H 22666 147 G 41 I 19771 101 G 41 K 22532 137 G 41 L 61881 902 G 41 M 34714 679 G 41 N 21696 186 G 41 P 40724 298 G 41 Q 24377 152 G 41 R 85007 552 G 41 S 49820 1032 G 41 T 45043 276 G 41 V 53408 1274 G 41 W 35914 403 G 41 Y 27320 559 E 42 A 41720 110713 E 42 C 34433 91108 E 42 D 28410 58190 E 42 F 11284 31141 E 42 G 48896 132453 E 42 H 79258 179793 E 42 I 215952 707845 E 42 K 31298 88839 E 42 L 107262 396377 E 42 M 118559 355384 E 42 N 43914 117273 E 42 P 78746 93537 E 42 Q 21762 80916 E 42 R 101401 333896 E 42 S 205734 590830 E 42 T 75886 240399 E 42 V 58682 185266 E 42 W 15388 55497 E 42 Y 39163 106077 W 43 A 34051 222 W 43 C 31817 256 W 43 D 16025 93 W 43 E 19799 111 W 43 F 31836 722 W 43 G 28923 234 W 43 H 33953 247 W 43 I 13577 75 W 43 K 23290 106 W 43 L 90390 568 W 43 M 19622 103 W 43 N 20699 101 W 43 P 102078 542 W 43 Q 24945 142 W 43 R 60402 512 W 43 S 71157 404 W 43 T 45303 279 W 43 V 26354 151 W 43 Y 38099 531 T 44 A 54538 137791 T 44 C 23389 38879 T 44 D 43206 51360 T 44 E 36333 61956 T 44 F 46475 82200 T 44 G 61901 149762 T 44 H 27996 54886 T 44 I 98732 98379 T 44 K 31904 68023 T 44 L 90253 157222 T 44 M 47645 96567 T 44 N 68354 97684 T 44 P 45159 733 T 44 Q 18368 45672 T 44 R 52296 110049 T 44 S 104291 210464 T 44 V 95580 80822 T 44 W 18350 40417 T 44 Y 20506 30793 Y 45 A 34745 3261 Y 45 C 19082 6808 Y 45 D 15906 261 Y 45 E 16013 349 Y 45 F 18795 29616 Y 45 G 38347 2409 Y 45 H 12601 5219 Y 45 I 14215 5785 Y 45 K 14572 1205 Y 45 L 62081 47234 Y 45 M 23118 11079 Y 45 N 11795 911 Y 45 P 26630 338 Y 45 Q 9616 423 Y 45 R 47758 3954 Y 45 S 39780 2840 Y 45 T 23175 1611 Y 45 V 51587 12146 Y 45 W 26206 81940 D 46 A 48346 102811 D 46 C 20557 45983 
D 46 E 37556 65024 D 46 F 18628 34090 D 46 G 23940 48100 D 46 H 27683 60059 D 46 I 21946 45693 D 46 K 32575 75213 D 46 L 64854 138921 D 46 M 26037 54534 D 46 N 20019 47544 D 46 P 39434 2273 D 46 Q 35722 76767 D 46 R 90713 214058 D 46 S 53436 128674 D 46 T 59465 160767 D 46 V 49576 109810 D 46 W 40239 79467 D 46 Y 21535 36622 D 47 A 7204 21366 D 47 C 13478 32851 D 47 E 12678 24553 D 47 F 6624 20154 D 47 G 51719 141201 D 47 H 26045 98148 D 47 I 77112 191499 D 47 K 36526 135127 D 47 L 67112 176415 D 47 M 39127 94955 D 47 N 32525 93407 D 47 P 19598 64620 D 47 Q 14347 42281 D 47 R 95155 459818 D 47 S 81857 253299 D 47 T 10792 28271 D 47 V 46400 122163 D 47 W 30408 112440 D 47 Y 12387 37249 A 48 C 25328 52729 A 48 D 10830 20547 A 48 E 19124 38164 A 48 F 14034 33455 A 48 G 45860 98142 A 48 H 9313 21925 A 48 I 20022 47453 A 48 K 37423 86560 A 48 L 105890 242246 A 48 M 42758 96305 A 48 N 19408 42082 A 48 P 18594 40783 A 48 Q 12222 27746 A 48 R 70326 162935 A 48 S 56835 124948 A 48 T 33682 75483 A 48 V 41059 95216 A 48 W 41855 94752 A 48 Y 23998 54383 T 49 A 23592 47971 T 49 C 22589 48249 T 49 D 16890 31419 T 49 E 10154 20579 T 49 F 64416 134447 T 49 G 44684 90121 T 49 H 7397 16284 T 49 I 43837 108528 T 49 K 9120 21107 T 49 L 60630 132230 T 49 M 22054 45431 T 49 N 20824 41777 T 49 P 11871 20258 T 49 Q 4755 10031 T 49 R 24029 54393 T 49 S 41183 92486 T 49 V 67111 133195 T 49 W 10081 22640 T 49 Y 20669 45721 K 50 A 20795 41734 K 50 C 16071 26863 K 50 D 7948 13072 K 50 E 10822 18816 K 50 F 21470 28018 K 50 G 44464 120012 K 50 H 7675 15678 K 50 I 13587 19713 K 50 L 45151 69429 K 50 M 17327 23878 K 50 N 11244 18577 K 50 P 7370 4866 K 50 Q 7663 14302 K 50 R 38243 88134 K 50 S 26578 50576 K 50 T 12098 21044 K 50 V 30878 56855 K 50 W 21825 35792 K 50 Y 12759 18361 T 51 A 26978 38976 T 51 C 18668 37878 T 51 D 13315 28365 T 51 E 18309 48531 T 51 F 14899 40786 T 51 G 46770 86314 T 51 H 11831 34308 T 51 I 15310 46622 T 51 K 11778 34328 T 51 L 37478 134802 T 51 M 18098 57337 T 51 N 11582 
27958 T 51 P 14245 493 T 51 Q 14867 43071 T 51 R 50483 147093 T 51 S 35977 54557 T 51 V 37307 115951 T 51 W 25156 74824 T 51 Y 15632 44003 F 52 A 23631 718 F 52 C 39559 1525 F 52 D 43358 275 F 52 E 85794 592 F 52 G 130332 1109 F 52 H 42029 1615 F 52 I 37954 3926 F 52 K 89853 605 F 52 L 102269 37343 F 52 M 44034 13994 F 52 N 52808 379 F 52 P 47858 336 F 52 Q 63866 494 F 52 R 179406 1251 F 52 S 98399 1048 F 52 T 62918 784 F 52 V 37470 2101 F 52 W 76426 948 F 52 Y 53880 9805 T 53 A 35249 61175 T 53 C 11727 18709 T 53 D 16704 18240 T 53 E 22498 37446 T 53 F 7020 6562 T 53 G 39615 121803 T 53 H 13525 15852 T 53 I 8221 10792 T 53 K 20005 32942 T 53 L 42622 40972 T 53 M 9002 12462 T 53 N 14090 18607 T 53 P 30936 462 T 53 Q 13143 23217 T 53 R 46367 72498 T 53 S 40744 82571 T 53 V 30573 56318 T 53 W 12247 21316 T 53 Y 11914 12078 V 54 A 25806 52860 V 54 C 8191 17279 V 54 D 8840 416 V 54 E 8258 435 V 54 F 13497 16528 V 54 G 23276 9231 V 54 H 4506 1088 V 54 I 7362 4142 V 54 K 4728 54 V 54 L 42347 24038 V 54 M 6622 5786 V 54 N 6557 2202 V 54 P 10201 115 V 54 Q 4880 905 V 54 R 19127 176 V 54 S 19097 14125 V 54 T 11354 10421 V 54 W 7260 7582 V 54 Y 6720 286 T 55 A 20582 24299 T 55 C 18726 22674 T 55 D 17959 15830 T 55 E 18307 19558 T 55 F 16398 17146 T 55 G 44442 34757 T 55 H 10879 11224 T 55 I 18174 17308 T 55 K 16615 24381 T 55 L 35437 35932 T 55 M 18530 19909 T 55 N 17278 18366 T 55 P 7685 436 T 55 Q 12523 14755 T 55 R 48640 79126 T 55 S 31279 45403 T 55 V 29999 34713 T 55 W 18644 25725 T 55 Y 17203 19707 E 56 A 32602 17262 E 56 C 13643 4884 E 56 D 16629 4912 E 56 F 12085 4330 E 56 G 23114 9238 E 56 H 17551 7403 E 56 I 10718 2529 E 56 K 18397 8191 E 56 L 36684 8700 E 56 M 16210 4810 E 56 N 16311 5976 E 56 P 27785 6802 E 56 Q 19970 14058 E 56 R 45467 20955 E 56 S 42231 26020 E 56 T 29883 21199 E 56 V 29251 12541 E 56 W 17351 9023 E 56 Y 16560 5434 -------------------------------------------------------------------------------- /GB1/dataset/Olson2014_TableS2_wildtype.txt: 
-------------------------------------------------------------------------------- 1 | Input Count Selection Count 1759616 3041819 -------------------------------------------------------------------------------- /GB1/dataset/PSIPRED/gb1.psipass2: -------------------------------------------------------------------------------- 1 | # PSIPRED HFORMAT (PSIPRED V3.3) 2 | 3 | Conf: 93799980512412445775088889999997554258740465407823688529 4 | Pred: CEEEEEEECCCCCEEEEEEEECHHHHHHHHHHHHHHCCCCEEEEEECCCEEEEEEC 5 | AA: MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE 6 | 10 20 30 40 50 7 | 8 | -------------------------------------------------------------------------------- /GB1/dataset/PSIPRED/gb1.psipred.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lehner-lab/DMS2structure/3c1976b78a743d4d4c73921f047648500e671c8e/GB1/dataset/PSIPRED/gb1.psipred.pdf -------------------------------------------------------------------------------- /GB1/dataset/PSIPRED/gb1.ss2: -------------------------------------------------------------------------------- 1 | # PSIPRED VFORMAT (PSIPRED V3.3) 2 | 3 | 1 M C 1.000 0.000 0.000 4 | 2 Q E 0.293 0.002 0.652 5 | 3 Y E 0.106 0.017 0.846 6 | 4 K E 0.028 0.003 0.946 7 | 5 L E 0.012 0.004 0.972 8 | 6 I E 0.016 0.006 0.976 9 | 7 L E 0.061 0.007 0.923 10 | 8 N E 0.418 0.006 0.515 11 | 9 G C 0.742 0.004 0.202 12 | 10 K C 0.575 0.007 0.425 13 | 11 T C 0.608 0.008 0.390 14 | 12 L C 0.680 0.008 0.266 15 | 13 K C 0.524 0.011 0.385 16 | 14 G E 0.320 0.009 0.612 17 | 15 E E 0.249 0.005 0.726 18 | 16 T E 0.192 0.046 0.656 19 | 17 T E 0.131 0.030 0.700 20 | 18 T E 0.064 0.069 0.847 21 | 19 E E 0.056 0.110 0.875 22 | 20 A E 0.192 0.086 0.711 23 | 21 V E 0.468 0.123 0.480 24 | 22 D C 0.899 0.063 0.068 25 | 23 A H 0.070 0.905 0.002 26 | 24 A H 0.076 0.885 0.004 27 | 25 T H 0.048 0.926 0.003 28 | 26 A H 0.017 0.975 0.001 29 | 27 E H 0.013 0.983 0.000 30 | 28 K H 0.010 0.982 0.000 31 | 29 V H 
0.008 0.985 0.000 32 | 30 F H 0.016 0.975 0.001 33 | 31 K H 0.032 0.947 0.005 34 | 32 Q H 0.095 0.851 0.032 35 | 33 Y H 0.218 0.750 0.023 36 | 34 A H 0.207 0.780 0.010 37 | 35 N H 0.247 0.736 0.008 38 | 36 D H 0.367 0.584 0.009 39 | 37 N C 0.741 0.196 0.021 40 | 38 G C 0.930 0.019 0.035 41 | 39 V C 0.858 0.008 0.122 42 | 40 D C 0.704 0.005 0.276 43 | 41 G E 0.453 0.002 0.523 44 | 42 E E 0.268 0.001 0.767 45 | 43 W E 0.173 0.000 0.847 46 | 44 T E 0.204 0.001 0.801 47 | 45 Y E 0.299 0.005 0.710 48 | 46 D E 0.493 0.012 0.501 49 | 47 D C 0.854 0.052 0.060 50 | 48 A C 0.907 0.029 0.042 51 | 49 T C 0.616 0.034 0.331 52 | 50 K E 0.344 0.008 0.661 53 | 51 T E 0.180 0.006 0.858 54 | 52 F E 0.097 0.001 0.920 55 | 53 T E 0.064 0.001 0.944 56 | 54 V E 0.215 0.001 0.748 57 | 55 T E 0.358 0.002 0.574 58 | 56 E C 0.977 0.000 0.001 59 | -------------------------------------------------------------------------------- /GB1_downsampling/dataset/GB1_CDS_nt: -------------------------------------------------------------------------------- 1 | >M13825.1:578-1924 Streptococcus sp. 
(Lancefield group G) spg gene encoding an immunoglobulin G binding protein 2 | ATGGAAAAAGAAAAAAAGGTAAAATACTTTTTACGTAAATCAGCTTTTGGGTTAGCATCCGTATCAGCTG 3 | CATTTTTAGTGGGATCAACGGTATTCGCTGTTGATTCACCAATCGAAGATACCCCAATTATTCGTAATGG 4 | TGGTGAATTAACTAATCTTCTGGGGAATTCAGAGACAACACTGGCTTTGCGTAATGAAGAGAGTGCTACA 5 | GCTGATTTGACAGCAGCAGCGGTAGCCGATACTGTGGCAGCAGCGGCAGCTGAAAATGCTGGGGCAGCAG 6 | CTTGGGAAGCAGCGGCAGCAGCAGATGCTCTAGCAAAAGCCAAAGCAGATGCCCTTAAAGAATTCAACAA 7 | ATATGGAGTAAGTGACTATTACAAGAATCTAATCAACAATGCCAAAACTGTTGAAGGCATAAAAGACCTT 8 | CAAGCACAAGTTGTTGAATCAGCGAAGAAAGCGCGTATTTCAGAAGCAACAGATGGCTTATCTGATTTCT 9 | TGAAATCGCAAACACCTGCTGAAGATACTGTTAAATCAATTGAATTAGCTGAAGCTAAAGTCTTAGCTAA 10 | CAGAGAACTTGACAAATATGGAGTAAGTGACTATCACAAGAACCTAATCAACAATGCCAAAACTGTTGAA 11 | GGTGTAAAAGAACTGATAGATGAAATTTTAGCTGCATTACCTAAGACTGACACTTACAAATTAATCCTTA 12 | ATGGTAAAACATTGAAAGGCGAAACAACTACTGAAGCTGTTGATGCTGCTACTGCAGAAAAAGTCTTCAA 13 | ACAATACGCTAACGACAACGGTGTTGACGGTGAATGGACTTACGACGATGCGACTAAGACCTTTACAGTT 14 | ACTGAAAAACCAGAAGTGATCGATGCGTCTGAATTAACACCAGCCGTGACAACTTACAAACTTGTTATTA 15 | ATGGTAAAACATTGAAAGGCGAAACAACTACTAAAGCAGTAGACGCAGAAACTGCAGAAAAAGCCTTCAA 16 | ACAATACGCTAACGACAACGGTGTTGATGGTGTTTGGACTTATGATGATGCGACTAAGACCTTTACGGTA 17 | ACTGAAATGGTTACAGAGGTTCCTGGTGATGCACCAACTGAACCAGAAAAACCAGAAGCAAGTATCCCTC 18 | TTGTTCCGTTAACTCCTGCAACTCCAATTGCTAAAGATGACGCTAAGAAAGACGATACTAAGAAAGAAGA 19 | TGCTAAAAAACCAGAAGCTAAGAAAGATGACGCTAAGAAAGCTGAAACTCTTCCTACAACTGGTGAAGGA 20 | AGCAACCCATTCTTCACAGCAGCTGCGCTTGCAGTAATGGCTGGTGCGGGTGCTTTGGCGGTCGCTTCAA 21 | AACGTAAAGAAGACTAA 22 | -------------------------------------------------------------------------------- /GB1_downsampling/dataset/amino_acid_codon_conversion.txt: -------------------------------------------------------------------------------- 1 | Amino Acid SLC DNA codons Isoleucine I ATT Isoleucine I ATC Isoleucine I ATA Leucine L CTT Leucine L CTC Leucine L CTA Leucine L CTG Leucine L TTA Leucine L TTG Valine V GTC Valine V GTA Valine V GTG Valine V GTT Phenylalanine F TTC 
Phenylalanine F TTT Methionine M ATG Cysteine C TGC Cysteine C TGT Alanine A GCC Alanine A GCA Alanine A GCG Alanine A GCT Glycine G GGG Glycine G GGA Glycine G GGC Glycine G GGT Proline P CCG Proline P CCA Proline P CCC Proline P CCT Threonine T ACT Threonine T ACC Threonine T ACA Threonine T ACG Serine S TCT Serine S TCC Serine S TCA Serine S TCG Serine S AGT Serine S AGC Tyrosine Y TAT Tyrosine Y TAC Tryptophan W TGG Glutamine Q CAG Glutamine Q CAA Asparagine N AAT Asparagine N AAC Histidine H CAT Histidine H CAC Glutamicacid E GAA Glutamicacid E GAG Asparticacid D GAT Asparticacid D GAC Lysine K AAA Lysine K AAG Arginine R CGT Arginine R CGC Arginine R CGA Arginine R CGG Arginine R AGA Arginine R AGG Stopcodons Stop TAG Stopcodons Stop TGA Stopcodons Stop TAA -------------------------------------------------------------------------------- /GB1_downsampling_pipeline.R: -------------------------------------------------------------------------------- 1 | ############################ 2 | ##### GB1 downsampling ##### 3 | ############################ 4 | 5 | #this is the pipeline used to analyse the downsampled versions of the GB1 data from Olson et al. 
2014 6 | 7 | #first, set the working directory to the DMS2structure folder 8 | setwd("/where/is/DMS2structure/") 9 | 10 | #source scripts 11 | filelist = list.files('scripts/') 12 | sapply(paste0('scripts/',filelist),source,.GlobalEnv) 13 | 14 | #create the necessary subfolder structure for all results and processed data 15 | dataset_dir = "GB1_downsampling/" 16 | create_directory_structure(dataset_dir) 17 | #then save this script in the dataset_dir 18 | #and paste all necessary source data into dataset_dir/dataset 19 | 20 | #load required packages 21 | require(data.table) 22 | require(ggplot2) 23 | require(cowplot) 24 | require(GGally) 25 | theme_set(theme_minimal()) 26 | require(seqinr) 27 | 28 | 29 | ############################### 30 | ##### READ downsampling ####### 31 | ############################### 32 | 33 | read_downsampling = c(0.25,0.1,0.025) 34 | 35 | for (RD in seq_along(read_downsampling)) { 36 | 37 | #load original data 38 | wildtype = fread("GB1/dataset/Olson2014_TableS2_wildtype.txt", sep = "\t", header = TRUE) 39 | singles = fread("GB1/dataset/Olson2014_TableS2_singles.txt", sep = "\t",header = TRUE) 40 | doubles = fread("GB1/dataset/Olson2014_TableS2_doubles.txt", sep = "\t",header = TRUE) 41 | doubles[,c("V11","V12","V13","V14","V15","V16","V17","V18") := NULL] 42 | 43 | # rename columns 44 | colnames(wildtype) = c("count_r1_t0","count_r1_t1") 45 | colnames(singles) = c("WT_AA","Pos","Mut","count_r1_t0","count_r1_t1") 46 | colnames(doubles) = c("WT_AA1","Pos1","Mut1","WT_AA2","Pos2","Mut2","count_r1_t0","count_r1_t1","fitness1","fitness2") 47 | 48 | 49 | ## rearrange doubles$GB1 such that always Pos1 < Pos2 50 | doubles[Pos1 > Pos2,':=' (Pos1=Pos2,WT_AA1 = WT_AA2,Mut1 = Mut2,fitness1 = fitness2, 51 | Pos2=Pos1,WT_AA2 = WT_AA1,Mut2 = Mut1,fitness2 = fitness1)] 52 | 53 | 54 | ######################## 55 | ##### downsample ####### 56 | ######################## 57 | set.seed(1603) 58 | wildtype[,count_r1_t0 := rbinom(1,size = count_r1_t0,prob = 
read_downsampling[RD])] 59 | wildtype[,count_r1_t1 := rbinom(1,size = count_r1_t1,prob = read_downsampling[RD])] 60 | 61 | singles[,count_r1_t0 := rbinom(1,size = count_r1_t0,prob = read_downsampling[RD]),.(Pos,Mut)] 62 | singles[,count_r1_t1 := rbinom(1,size = count_r1_t1,prob = read_downsampling[RD]),.(Pos,Mut)] 63 | 64 | doubles[,count_r1_t0 := rbinom(1,size = count_r1_t0,prob = read_downsampling[RD]),.(Pos1,Pos2,Mut1,Mut2)] 65 | doubles[,count_r1_t1 := rbinom(1,size = count_r1_t1,prob = read_downsampling[RD]),.(Pos1,Pos2,Mut1,Mut2)] 66 | 67 | 68 | ## calculate fitness 69 | wildtype[,fitness:=0] 70 | singles[,fitness := log(count_r1_t1/count_r1_t0 * (wildtype$count_r1_t0 / wildtype$count_r1_t1))] 71 | 72 | #wild-type correction factor 73 | xd=density(singles$fitness,bw=.15) 74 | # plot(xd) 75 | #fitness peak ~ wildtype peak 76 | ## both fits give similar result for upper mode, exp(0.193) and exp(0.169) 77 | correction_factor_wildtype =xd$x[xd$y==max(xd$y)] 78 | 79 | #correct fitness value for this factor 80 | singles[,fitness := NULL] 81 | singles[,fitness := log(count_r1_t1/count_r1_t0 * (wildtype$count_r1_t0 / wildtype$count_r1_t1)) - correction_factor_wildtype] 82 | doubles[,fitness := log(count_r1_t1/count_r1_t0 / (wildtype$count_r1_t1 / wildtype$count_r1_t0)) - correction_factor_wildtype] 83 | 84 | # calculate standard-error of fitness values given read counts 85 | wildtype[,sigma := sqrt(1/count_r1_t1 + 1/count_r1_t0)] 86 | singles[,sigma := sqrt(1/count_r1_t1 + 1/count_r1_t0 + 1/wildtype$count_r1_t1 + 1/wildtype$count_r1_t0)] 87 | doubles[,sigma := sqrt(1/count_r1_t1 + 1/count_r1_t0 + 1/wildtype$count_r1_t1 + 1/wildtype$count_r1_t0)] 88 | 89 | # transfer single fitness/error values to doubles data.table 90 | doubles[,fitness1 := singles[Pos == Pos1 & Mut == Mut1,fitness],.(Pos1,Mut1)] 91 | doubles[,fitness2 := singles[Pos == Pos2 & Mut == Mut2,fitness],.(Pos2,Mut2)] 92 | doubles[,sigma1 := singles$sigma[singles$Pos %in% Pos1 & singles$Mut %in% 
Mut1],by=.(Pos1,Mut1)] 93 | doubles[,sigma2 := singles$sigma[singles$Pos %in% Pos2 & singles$Mut %in% Mut2],by=.(Pos2,Mut2)] 94 | 95 | #mark variants with nonsensical fitness values 96 | wildtype[,is.fitness := fitness > -Inf & !is.na(fitness)] 97 | singles[,is.fitness := fitness > -Inf & !is.na(fitness)] 98 | doubles[,is.fitness := fitness > -Inf & !is.na(fitness) & fitness1 > -Inf & fitness2 > -Inf] 99 | 100 | # define which variants have enough reads 101 | wildtype[,is.reads0 := TRUE] 102 | singles[,is.reads0 := TRUE] 103 | # only throw away variants with zero output counts if unclear where above lower_fitness_bound their fitness would be 104 | # only applies to doubles 105 | lower_read_cut = 200 106 | doubles[,is.reads0 := count_r1_t0 > 10 & (count_r1_t1 >= 1 | count_r1_t0 >= lower_read_cut)] 107 | 108 | # rearrange doubles data.table 109 | doubles = doubles[,.SD,,.SDcols = c("Pos1","Pos2","Mut1","Mut2","WT_AA1","WT_AA2", 110 | "count_r1_t0","count_r1_t1","is.fitness","is.reads0", 111 | "fitness1","fitness2","sigma1","sigma2", 112 | "fitness","sigma")] 113 | 114 | # save doubles data.table 115 | write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE_RD",read_downsampling[RD],".txt"), 116 | quote = F,row.names = F, col.names = T) 117 | } 118 | 119 | 120 | 121 | ############################ 122 | ##### doped dataset ####### 123 | ############################ 124 | # only allow AA mutations 1nt hamming distance away from coding sequence 125 | # load a codon<>AA conversion table 126 | AA_codon_conversion = read.table(paste0(dataset_dir,"dataset/amino_acid_codon_conversion.txt"),sep = "\t",header = T) 127 | # load the coding sequence of G protein B1 domain 128 | Gprot_nuc_seq = read.fasta(paste0(dataset_dir,"dataset/GB1_CDS_nt")) 129 | ## 229:282 corresponds to 3:56 in GB1_Olson AA sequence 130 | # seqinr::translate(Gprot_nuc_seq$`M13825.1:578-1924`)[227:282] 131 | # seqinr::translate(Gprot_nuc_seq$`M13825.1:578-1924`[(227*3-2):(282*3)]) 
acgt = c("a","c","g","t")
# codons 227..282 of the CDS correspond to GB1 residues 1..56 of the Olson dataset
GB1_nuc_seq = Gprot_nuc_seq$`M13825.1:578-1924`[(227*3-2):(282*3)]

#fix position 2 to Q/Glutamine, Codon CAA
GB1_nuc_seq[4:6] = c("c","a","a")
# enumerate all single-nucleotide substitutions: 55 mutated residues x 3 codon positions x 4 nucleotides
GB1_doped_mut = data.table(Pos = rep(0,55*12), WTaa = rep("",55*12), Mut = rep("",55*12))
for (aa_pos in 2:56) {
  for (nt_pos in 1:3) {
    for (nt in 1:4) {
      GB1_doped_mut[(aa_pos-2)*12 + (nt_pos-1)*4 + nt,Pos:= aa_pos]
      # wild-type codon for this residue
      wt = GB1_nuc_seq[((aa_pos-1)*3+1) : ((aa_pos-1)*3+3)]
      GB1_doped_mut[(aa_pos-2)*12 + (nt_pos-1)*4 + nt,WTaa := seqinr::translate(wt)]
      # substitute one nucleotide within the codon and translate
      mutated = wt
      mutated[nt_pos] = acgt[nt]
      GB1_doped_mut[(aa_pos-2)*12 + (nt_pos-1)*4 + nt,Mut := seqinr::translate(mutated)]
    }
  }
}
print(paste("possible NT1 mutations:",nrow(GB1_doped_mut)))
#get rid of duplicate mutations
GB1_doped_mut = unique(GB1_doped_mut)
#get rid of mutations that don't change codons and those that give stop codons
GB1_doped_mut = GB1_doped_mut[WTaa != Mut & Mut != "*"]
print(paste("unique possible NT1 mutations, w/o PTC:",nrow(GB1_doped_mut)))
print(paste0("fraction unique possible NNS mutations (",55*19,"): ",round(nrow(GB1_doped_mut)/55/19*100,digits=1),"%"))
GB1_doped_mut[,PosMut := paste0(Pos,Mut)]

#create doped double data.tables by keeping only variant pairs where both
#mutations are reachable by a single nucleotide change
#full dataset
doubles = fread("GB1/processed_data/DMS_doubles_preE.txt")
doubles[,':=' (PosMut1 = paste0(Pos1,Mut1),PosMut2 = paste0(Pos2,Mut2))]
doubles = doubles[PosMut1 %in% GB1_doped_mut$PosMut & PosMut2 %in% GB1_doped_mut$PosMut]
write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE_doped.txt"),
            quote = F,row.names = F, col.names = T)

#from read-downsampled versions
for (RD in read_downsampling) {
  doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_preE_RD",RD,".txt"))
  doubles[,':=' (PosMut1 = paste0(Pos1,Mut1),PosMut2 = paste0(Pos2,Mut2))]
  doubles = doubles[PosMut1 %in% GB1_doped_mut$PosMut & PosMut2 %in% GB1_doped_mut$PosMut]
  write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE_doped_RD",RD,".txt"),
              quote = F,row.names = F, col.names = T)
}

##################################################################
### calculate epistasis null model and call pos./neg.epistasis ###
##################################################################

#read all downsampled data files
double_files = list.files(path = paste0(dataset_dir,"processed_data/"))[grep("DMS_doubles_preE_",list.files(path = paste0(dataset_dir,"processed_data/")))]
ID = sapply(double_files,FUN=function(X){gsub(".txt","",gsub("DMS_doubles_preE_","",X))})

for (idx in seq_along(ID)) {
  doubles = fread(paste0(dataset_dir,"processed_data/",double_files[idx]))
  doubles2 = copy(doubles[,.SD,,.SDcols = c(1:16)])
  #lower bound of fitness: median fitness of doubles whose expected additive
  #fitness (fitness1 + fitness2) is far below the detection limit
  lower_bound_F = doubles2[is.fitness == T & is.reads0 == T & fitness1 + fitness2 < -8,median(fitness,na.rm=T)]
  #call epistatic interactions
  doubles = call_epistasis_binary(doubles,
                                  lower_bound_F,
                                  dataset_dir = dataset_dir,
                                  output_filename = paste0("DMS_doubles_",ID[idx],".txt"),
                                  prefix = paste0("GB1_",ID[idx]))
}


#############################################
### calculate pairwise interaction scores ###
#############################################

double_files = list.files(path = paste0(dataset_dir,"processed_data/"))[grep("DMS_doubles_preE_",list.files(path = paste0(dataset_dir,"processed_data/")))]
ID = sapply(double_files,FUN=function(X){gsub(".txt","",gsub("DMS_doubles_preE_","",X))})
for (idx in seq_along(ID)) {
  doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_",ID[idx],".txt"))
  PWI = calculate_pairwise_interaction_scores(doubles,
                                              dataset_dir = dataset_dir,
                                              output_filename = paste0("DMS_PWI_",ID[idx],".txt"),
                                              detailed = F)
}

#### assemble all the combined scores from all PWI data.tables into one
PWI_complete = fread("GB1/processed_data/DMS_PWI.txt")[,.(Pos1,Pos2,WT_AA1,WT_AA2,combined_score)]
for (idx in seq_along(ID)) {
  PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_",ID[idx],".txt"))
  PWI_complete = merge(PWI_complete,
                       PWI[,.(Pos1,Pos2,V1=combined_score)],
                       by=c("Pos1","Pos2"),all=T)
  # the merged column comes in as V1; rename it after its source dataset
  names(PWI_complete)[ncol(PWI_complete)] = paste0("combined_score_",ID[idx])
}
write.table(PWI_complete,file=paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"),
            row.names=F,quote = F)


#########################################
### deep contact transform PWI scores ###
#########################################

prefix = "GB1_"

PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"))
PWI_transformed = deepcontact_transform_basic2d(PWI[,-grep("WT_AA",names(PWI)),with=F],
                                                dataset_dir = dataset_dir,
                                                output_filename = "DMS_PWI_complete_deepcontact.txt",
                                                deepcontact_dir = "where/is/deepcontact/",
                                                prefix = prefix)




###########################################################
######## predict secondary structure from PWI data ########
###########################################################

PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"))
#### predict secondary structure
predict_secondary_structure_elements(PWI,
                                     dataset_dir = dataset_dir,
                                     prefix = prefix,
                                     known_SS = "GB1/processed_data/PDB_secondary_structure_1pga_A.txt")

#### predict beta sheets
predict_beta_sheets(PWI,
                    input_ss0 = fread(paste0(dataset_dir,"processed_data/",prefix,"_secondary_structure_prediction.txt")),
                    dataset_dir = dataset_dir,
                    prefix = prefix,
                    known_ss_file = "GB1/processed_data/PDB_secondary_structure_1pga_A.txt",
                    known_bsi_file = "GB1/processed_data/PDB_beta_sheet_hbonds_1pga.txt")




##########################################################################################
##### evaluate predicted contacts (top scoring pairs) against reference structure ########
##########################################################################################

contactmap = fread("GB1/processed_data/PDB_contactmap_1pga_A.txt")

#### true positive rate of top contacts + contactmaps + eCDFs
evaluate_contacts_vs_PDB(contacts = PWI,
                         contactmap = contactmap[,.(Pos1,Pos2,scHAmin)],
                         secondary_structure=NA,
                         dataset_dir = dataset_dir,
                         lindist=5,
                         prefix = prefix)

#### minimal number of edges connecting top contacts versus all position pairs
# (function name typo "contatcs" matches the helper script's name — do not "fix" the call alone)
evaluate_contatcs_minimaledges(PWI,
                               contactmap = contactmap[,.(Pos1,Pos2,scHAmin)],
                               dataset_dir = dataset_dir,
                               prefix = prefix,
                               lindist=5,
                               N_contacts = 1,
                               dist_cutoff = 8)

#### interaction scores versus distance in reference structure
score_vs_distance_scatter(contacts = PWI,
                          contactmap = contactmap[,.(Pos1,Pos2,scHAmin)],
                          dataset_dir = dataset_dir,
                          prefix = prefix)


#########################################
##### XPLOR structure prediction ########
#########################################

#use predicted tertiary contacts and secondary structure elements to generate structural models

#### use DMS data to derive restraints for tertiary contacts & use secondary structure restraints derived from PSIPRED predictions (no beta sheet pairing restraints)
# +compare to deepcontact transformed scores

PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete.txt"))
PWI_DC = fread(paste0(dataset_dir,"processed_data/DMS_PWI_complete_deepcontact.txt"))
# suffix all deepcontact score columns with "_DC" so they stay distinct after the merge
names(PWI_DC)[names(PWI_DC) %in% setdiff(names(PWI_DC),c("Pos1","Pos2","WT_AA1","WT_AA2"))] = paste0(setdiff(names(PWI_DC),c("Pos1","Pos2","WT_AA1","WT_AA2")),"_DC")
PWI2 = merge(PWI,PWI_DC,by=c("Pos1","Pos2"),all=T)

XPLOR_wrapper(input_PWI = PWI2,
              SS_mode = "SSonly",
              input_SS_file = "PSIPRED_secondary_structure.txt", ### copy this from GB1 folder into the dataset folder
              prefix = "GB1_ds_PSIPRED_SSonly_",
              dataset_dir = dataset_dir,
              cores = 15,
              queue = "short-sl7,long-sl7",
              protein_sequence = scan("GB1/dataset/GB1_sequence.fasta",what = "character")[2],
              pdb_file = "GB1/dataset/PDB/g_xray.pdb",
              home_dir = "/where/to/temporarily/build/folderstructure/",
              cluster_dir = "cluster/directory/for/XPLOR/",
              login_serveraddress = "mylogin@serveraddress.com",
              debug_this = F)

#####################################
##### evaluate XPLOR results ########
#####################################

#this will analyse the "..._variables_results.RData" files generated by the XPLOR simulations
# (copy these back to the dataset directory, e.g. to results/XPLOR/my_prefix_used/, which is the "XPLOR_dir" variable)
# the script will give basic outputs into a "results" sub-directory, such as how models performed across stages, the RMSD/TMscore of the best models etc

analyse_XPLOR_results(XPLOR_dir = paste0(dataset_dir,"results/XPLOR/GB1_ds_PSIPRED_SSonly_/"),
                      contactmap = fread("GB1/processed_data/contactmap_1pga_A.txt"),
                      draw_contactmaps = F)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Jörn Schmiedel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DMS2structure 2 | 3 | These are scripts used for the analysis of deep mutational scanning data in Schmiedel & Lehner, "Determining protein structures using deep mutagenesis", Nature Genetics (2019) (https://www.nature.com/articles/s41588-019-0431-x) 4 | 5 | The DATASET_pipeline.R scripts do the complete analysis for one dataset, the necessary data are already deposited in the [dataset_folder]/dataset/ folder, except for the WW domain, for which the sequencing data has to be downloaded and processed separately (as described in the pipeline script). 6 | 7 | 8 | ## required software 9 | To run these scripts, you will need 10 | R (version 3.4) 11 | DeepContact (can be cloned from https://github.com/largelymfs/deepcontact ) 12 | XPLOR-NIH (can be downloaded from https://nmr.cit.nih.gov/xplor-nih/ ) 13 | TMscore (can be downloaded from https://zhanglab.ccmb.med.umich.edu/TM-score/ ) 14 | 15 | ## required R packages 16 | data.table 17 | ggplot2 18 | cowplot 19 | GGally 20 | mgcv 21 | caTools 22 | parallel 23 | stringr 24 | gdata 25 | corpcor 26 | Rpdb 27 | pdist 28 | metap 29 | RColorBrewer 30 | ssh.utils 31 | seqinr 32 | optparse 33 | pheatmap 34 | -------------------------------------------------------------------------------- /RRM/dataset/PSIPRED/PAB1.psipass2: -------------------------------------------------------------------------------- 1 | # PSIPRED HFORMAT (PSIPRED V3.3) 2 | 3 | Conf: 965521289453511233322102761444557689999500246660566999999998 4 | Pred: CCEEEECCCCCCCCHHHHHHHHCCCCEEEEEEEECCCCCCEEEEEEEECCHHHHHHHHHH 5 | AA: GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDA 6 | 10 20 30 40 50 60 7 | 8 | 9 | Conf: 505200693667489 10 | Pred: HHCCCCCCCEEEECC 11 | AA: LNGMLLNGQEIYVAP 12 | 70 13 | 14 | 
-------------------------------------------------------------------------------- /RRM/dataset/PSIPRED/PAB1.psipred.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lehner-lab/DMS2structure/3c1976b78a743d4d4c73921f047648500e671c8e/RRM/dataset/PSIPRED/PAB1.psipred.pdf -------------------------------------------------------------------------------- /RRM/dataset/PSIPRED/PAB1.ss2: -------------------------------------------------------------------------------- 1 | # PSIPRED VFORMAT (PSIPRED V3.3) 2 | 3 | 1 G C 1.000 0.000 0.000 4 | 2 N C 0.801 0.001 0.178 5 | 3 I E 0.230 0.004 0.773 6 | 4 F E 0.211 0.005 0.727 7 | 5 I E 0.312 0.024 0.585 8 | 6 K E 0.382 0.076 0.537 9 | 7 N C 0.647 0.037 0.361 10 | 8 L C 0.891 0.071 0.065 11 | 9 H C 0.975 0.038 0.013 12 | 10 P C 0.778 0.325 0.021 13 | 11 D C 0.784 0.236 0.031 14 | 12 I C 0.688 0.317 0.033 15 | 13 D C 0.742 0.235 0.041 16 | 14 N C 0.554 0.398 0.058 17 | 15 K H 0.388 0.512 0.084 18 | 16 A H 0.307 0.581 0.065 19 | 17 L H 0.302 0.611 0.057 20 | 18 Y H 0.262 0.632 0.103 21 | 19 D H 0.287 0.629 0.092 22 | 20 T H 0.322 0.578 0.052 23 | 21 F H 0.341 0.565 0.031 24 | 22 S H 0.371 0.544 0.016 25 | 23 V C 0.475 0.450 0.021 26 | 24 F C 0.603 0.357 0.037 27 | 25 G C 0.854 0.128 0.035 28 | 26 D C 0.764 0.077 0.129 29 | 27 I E 0.409 0.041 0.560 30 | 28 L E 0.250 0.055 0.692 31 | 29 S E 0.261 0.045 0.669 32 | 30 S E 0.230 0.014 0.699 33 | 31 K E 0.205 0.009 0.720 34 | 32 I E 0.194 0.005 0.776 35 | 33 A E 0.152 0.003 0.853 36 | 34 T E 0.205 0.002 0.827 37 | 35 D C 0.929 0.002 0.101 38 | 36 E C 0.966 0.011 0.029 39 | 37 N C 0.982 0.008 0.016 40 | 38 G C 0.980 0.008 0.015 41 | 39 K C 0.959 0.012 0.035 42 | 40 S C 0.764 0.008 0.217 43 | 41 K E 0.433 0.010 0.524 44 | 42 G E 0.448 0.006 0.507 45 | 43 F E 0.336 0.009 0.581 46 | 44 G E 0.219 0.009 0.694 47 | 45 F E 0.134 0.010 0.823 48 | 46 V E 0.177 0.007 0.780 49 | 47 H E 0.174 0.007 0.780 50 | 48 F E 0.450 0.006 0.545 51 
| 49 E C 0.767 0.033 0.174 52 | 50 E C 0.807 0.172 0.049 53 | 51 E H 0.172 0.867 0.003 54 | 52 G H 0.061 0.971 0.001 55 | 53 A H 0.037 0.980 0.001 56 | 54 A H 0.017 0.987 0.000 57 | 55 K H 0.014 0.982 0.000 58 | 56 E H 0.017 0.971 0.000 59 | 57 A H 0.029 0.957 0.000 60 | 58 I H 0.035 0.957 0.000 61 | 59 D H 0.037 0.952 0.001 62 | 60 A H 0.089 0.895 0.003 63 | 61 L H 0.198 0.792 0.003 64 | 62 N H 0.463 0.509 0.010 65 | 63 G C 0.754 0.195 0.079 66 | 64 M C 0.544 0.137 0.314 67 | 65 L C 0.310 0.307 0.357 68 | 66 L C 0.418 0.187 0.346 69 | 67 N C 0.804 0.027 0.113 70 | 68 G C 0.966 0.005 0.023 71 | 69 Q C 0.672 0.002 0.319 72 | 70 E E 0.203 0.002 0.851 73 | 71 I E 0.166 0.001 0.850 74 | 72 Y E 0.141 0.002 0.856 75 | 73 V E 0.275 0.002 0.684 76 | 74 A C 0.908 0.003 0.085 77 | 75 P C 0.985 0.000 0.000 78 | -------------------------------------------------------------------------------- /RRM/dataset/RRM_domain_sequence.fasta: -------------------------------------------------------------------------------- 1 | >RRM domain 2 | GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVAP -------------------------------------------------------------------------------- /RRM_pipeline.R: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ##### RRM domain from Melamed2013 ##### of PAB1 yeast gene 3 | ####################################### 4 | 5 | #this is the pipeline used to analyse the RRM domain data from Melamed et al. 
# 2013

#first, set the working directory to the DMS2structure folder
setwd("/where/is/DMS2structure/")


#source scripts
filelist = list.files('scripts/')
sapply(paste0('scripts/',filelist),source,.GlobalEnv)

#create the necessary subfolder structure for all results and processed data
dataset_dir = "RRM/"
create_directory_structure(dataset_dir)
#then save this script in the dataset_dir
#and paste all necessary source data into dataset_dir/dataset

#load required packages
require(data.table)
require(ggplot2)
require(cowplot)

#############################################################################################
##### preprocess data (calculate fitness scores and errors, set quality thresholds etc) #####
#############################################################################################

################# read data from Supplementary Table 5 from Melamed2013
dataset = fread(paste0(dataset_dir,"dataset/Supplementary_Table_5_doubles.txt"), sep = "\t",header = TRUE)

# extract position and amino acids from "position-AA" identifiers like "130-A"
dataset[,Pos1:=sapply(strsplit(dataset[,as.character(seqID_X)],"-"),FUN = function(X){as.integer(X[1])})]
dataset[,Mut1:=sapply(strsplit(dataset[,as.character(seqID_X)],"-"),FUN = function(X){X[2]})]

dataset[,Pos2:=sapply(strsplit(dataset[,as.character(seqID_Y)],"-"),FUN = function(X){as.integer(X[1])})]
dataset[,Mut2:=sapply(strsplit(dataset[,as.character(seqID_Y)],"-"),FUN = function(X){X[2]})]

# indicate library (the domain was mutagenized in three separate sections/libraries)
dataset[Pos1<=150,section := 1]
dataset[between(Pos1,151,175),section := 2]
dataset[Pos1>175,section := 3]


#investigate dependency on input counts
ggplot(dataset,aes(Input_reads,log(XY_Enrichment_score))) +
  geom_hex() +
  scale_x_log10() +
  scale_fill_gradient(trans="log",breaks=c(1,10,100)) +
  facet_wrap(~section)
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_Inputreads_fitness.pdf"),width=8,height=3)

### should treat libraries separately, develop independent error estimates
### output count should be Input_reads * XY_enrichment_score
#define abbrev. variables
dataset = dataset[,.(Pos1,Pos2,Mut1,Mut2,section,dist = Physical_distance,
                     count_e1_s0 = Input_reads,count_e1_s1 = Input_reads * X_Enrichment_score,
                     fitness1=log(X_Enrichment_score),fitness2=log(Y_Enrichment_score),fitness=log(XY_Enrichment_score))]

# calculate Poissonian error from input and (reconstructed) output counts
dataset[,sigma := sqrt(1/count_e1_s0 + 1/count_e1_s1)]

ggplot(dataset,aes(fitness,sigma)) +
  geom_hex() +
  scale_y_log10() +
  scale_fill_continuous(trans = "log10") +
  facet_grid( ~ section)
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_fitness_sigma.pdf"),width=8,height=3)


# define STOP variants (premature termination in either position)
dataset[,STOP := Mut1 == "*" | Mut2 == "*"]

# plot fitness distribution + STOPs
ggplot(dataset,aes(fitness,color=factor(section),linetype=STOP)) +
  geom_density()
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_fitness_distribution.pdf"),width=5,height=3)
# >> all libraries are different

#estimate lower fitness bounds from STOP variants
#(inverse-variance weighted mean of STOP-variant fitness, per section)
dataset[,lower_bound_F := weighted.mean(x = dataset[section == unique(unlist(.SD)) & STOP == TRUE,fitness],
                                        w = dataset[section == unique(unlist(.SD)) & STOP == TRUE,1/sigma^2]),
        section,.SDcols = "section"]
unique(dataset[,.(lower_bound_F,section)][order(section)])

## mark STOP variants and variants with enough reads (all)
dataset[,is.fitness := !STOP]
dataset[,is.reads0 := T]

#give dummy sigmas for single mutants (no read counts for singles available)
dataset[,sigma1 := 0.01]
dataset[,sigma2 := 0.01]

#correct Positions (shift from PAB1 full-protein numbering to domain numbering)
dataset[,Pos1 := Pos1 - 125]
dataset[,Pos2 := Pos2 - 125]

#add WT_AA from the domain sequence fasta
WT_aaseq = scan(paste0(dataset_dir,"dataset/RRM_domain_sequence.fasta"),what="character",sep="\n")[2]
WT_aaseq_split = data.table(Pos = 1:75,WT_AA = strsplit(WT_aaseq,"")[[1]])

dataset[,WT_AA1 := WT_aaseq_split[Pos == Pos1,WT_AA],Pos1]
dataset[,WT_AA2 := WT_aaseq_split[Pos == Pos2,WT_AA],Pos2]


# reorder double table
doubles = dataset[,.(Pos1,Pos2,WT_AA1,WT_AA2,Mut1,Mut2,
                     section,count_e1_s0,count_e1_s1,STOP,is.fitness,is.reads0,
                     fitness,sigma,fitness1,sigma1,fitness2,sigma2,lower_bound_F)]

#### save data.tables
write.table(x = doubles, file = paste0(dataset_dir,"processed_data/DMS_doubles_preE.txt"),
            quote = F,row.names = F, col.names = T)


### also save extracted pair-distances from dataset table
ggplot(unique(dataset[,.(Pos1,Pos2,dist,section)]),aes(Pos1,Pos2,fill=dist<8)) +
  geom_raster() +
  scale_fill_manual(values=c("grey95","grey25"),na.value = "red")
ggsave(paste0(dataset_dir,"/results/preprocessing/RRM_contactmap_Melamed2013.pdf"),width=6,height=5)

#only section 2 has a decent number of off-diagonal contacts
# !!! this is missing position 14 of the second section (position 39 in absolute terms)

write.table(x = unique(dataset[section == 1,.(Pos1,Pos2,WT_AA1,WT_AA2,dist,section)]), file = paste0(dataset_dir,"processed_data/contactmap_RRM_sec1.txt"),
            quote = F,row.names = F, col.names = T)

write.table(x = unique(dataset[section == 2 & !is.na(dist),.(Pos1 = Pos1-25,Pos2 = Pos2-25,WT_AA1,WT_AA2,dist,section)]), file = paste0(dataset_dir,"processed_data/contactmap_RRM_sec2.txt"),
            quote = F,row.names = F, col.names = T)

write.table(x = unique(dataset[section == 3,.(Pos1 = Pos1-50,Pos2 = Pos2-50,WT_AA1,WT_AA2,dist,section)]), file = paste0(dataset_dir,"processed_data/contactmap_RRM_sec3.txt"),
            quote = F,row.names = F, col.names = T)



##################################################################
### calculate epistasis null model and call pos./neg.epistasis ###
##################################################################

## given that only section 2 has off-diagonal contacts, focus on this
doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_preE.txt"))
doubles2 = copy(doubles[section==2])
# re-index section 2 positions to start at 1
doubles2[,':=' (Pos1 = Pos1-25,Pos2 = Pos2-25)]
doubles = call_epistasis_binary(doubles2,
                                unique(doubles2$lower_bound_F),
                                dataset_dir = dataset_dir,
                                prefix = "RRM_sec2_",
                                output_filename = "DMS_doubles_sec2.txt",
                                epistasis_error_from_slopes = F)

#############################################
### calculate pairwise interaction scores ###
#############################################

doubles = fread(paste0(dataset_dir,"processed_data/DMS_doubles_sec2.txt"))
PWI = calculate_pairwise_interaction_scores(doubles,
                                            dataset_dir = dataset_dir,
                                            output_filename = "DMS_PWI_sec2.txt",
                                            detailed = F)


#########################################
### deep contact
transform PWI scores ### 165 | ######################################### 166 | 167 | PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_sec2.txt")) 168 | PWI_transformed = deepcontact_transform_basic2d(PWI, 169 | dataset_dir = dataset_dir, 170 | deepcontact_dir = "where/is/deepcontact/", 171 | output_filename = "DMS_PWI_sec2_deepcontact.txt", 172 | prefix = "RRM_sec2_") 173 | 174 | ### negative controls for DeepContact 175 | # 3x permutate combined_scores, while keeping matrix symmetry 176 | set.seed(1603) 177 | PWI = fread(paste0(dataset_dir,"processed_data/DMS_PWI_sec2.txt"))[Pos11: 199 | return False 200 | if potList['CDIH'].violations()>0: 201 | return False 202 | if potList['BOND'].violations()>0: 203 | return False 204 | if potList['ANGL'].violations()>0: 205 | return False 206 | if potList['IMPR'].violations()>1: 207 | return False 208 | 209 | return True 210 | 211 | def calcOneStructure(loopInfo): 212 | """ this function calculates a single structure, performs analysis on the 213 | structure, and then writes out a pdb file, with remarks. 214 | """ 215 | 216 | # initialize parameters for high temp dynamics. 217 | InitialParams( rampedParams ) 218 | # high-temp dynamics setup - only need to specify parameters which 219 | # differfrom initial values in rampedParams 220 | InitialParams( highTempParams ) 221 | 222 | # high temp dynamics 223 | # 224 | protocol.initDynamics(dyn, 225 | potList=potList, # potential terms to use 226 | bathTemp=init_t, 227 | initVelocities=1, 228 | finalTime=10, # stops at 10ps or 5000 steps 229 | numSteps=5000, # whichever comes first 230 | printInterval=100) 231 | 232 | dyn.setETolerance( init_t/100 ) #used to det. stepsize. 
default: t/1000 233 | dyn.run() 234 | 235 | # initialize parameters for cooling loop 236 | InitialParams( rampedParams ) 237 | 238 | 239 | # initialize integrator for simulated annealing 240 | # 241 | protocol.initDynamics(dyn, 242 | potList=potList, 243 | numSteps=100, #at each temp: 100 steps or 244 | finalTime=.2 , # .2ps, whichever is less 245 | printInterval=100) 246 | 247 | # perform simulated annealing 248 | # 249 | cool.run() 250 | 251 | 252 | # final torsion angle minimization 253 | # 254 | protocol.initMinimize(dyn, 255 | printInterval=50) 256 | dyn.run() 257 | 258 | # final all- atom minimization 259 | # 260 | protocol.initMinimize(minc, 261 | potList=potList, 262 | dEPred=10) 263 | minc.run() 264 | 265 | #do analysis and write structure when this function returns 266 | pass 267 | 268 | 269 | 270 | from simulationTools import StructureLoop, FinalParams 271 | StructureLoop(numStructures=numberOfStructures, 272 | structLoopAction=calcOneStructure, 273 | calcMissingStructs=True, #calculate only missing structures 274 | doWriteStructures=True, #analyze and write coords after calc 275 | pdbTemplate=outFilename, 276 | genViolationStats=True, 277 | averagePotList=potList, 278 | #averageCrossTerms=refRMSD, 279 | averageTopFraction=0.1, #report only on best 50% of structs 280 | averageAccept=accept, #only use structures which pass accept() 281 | averageContext=FinalParams(rampedParams), 282 | averageFilename="contacts55perfect_diheSSpsipred/SCRIPT_ave.pdb", #generate regularized ave structure 283 | averageFitSel="name CA", 284 | averageCompSel="not resname ANI and not name H*" ).run() 285 | 286 | -------------------------------------------------------------------------------- /scripts/XPLOR_wrapper.R: -------------------------------------------------------------------------------- 1 | 2 | XPLOR_wrapper = function(input_PWI, 3 | SS_mode = "SSonly", 4 | input_SS_file, 5 | L = c(0.5,1,1.5,2), 6 | predictorXlength = c(), 7 | prefix, 8 | dataset_dir, 9 | 
protein_sequence, 10 | pdb_file, 11 | cores = 12, 12 | queue = "long-sl7,short-sl7", 13 | numberOfStructures = c(500,500,500), 14 | top_avg_fraction = c(0.1,0.1,0.1), 15 | NOE_pot_soft = c(TRUE,TRUE,FALSE), 16 | home_dir, 17 | cluster_dir, 18 | reporting_email = NA, 19 | login_serveraddress, 20 | debug_this = F, 21 | linear_dist = 5, 22 | dist_restraint = 8) { 23 | 24 | #these scripts are multi-level wrapper functions to run XPLOR-NIH on a compute cluster 25 | #they locally create/modify the necessary scripts and folder structure, and scp this to the cluster, then remotely execute parallel jobs 26 | #this might take quite some work to adapt to other working environments 27 | 28 | ### variables 29 | # input_PWI: pairwise interaction score data.table; this should have Pos1, Pos2, WT_AA1 and WT_AA2 columns + all interscores that restraints should be derived from 30 | # SS_mode: - either "SSonly", in which case it only uses predicted secondary structure elements (from DMS data or e.g. PSIPRED) for restraints 31 | # - or "SSsheets", in which it also derives restraints for beta sheet pairing hbonding 32 | # input_SS_file: secondary structure element input file, 33 | # - if SS_mode == "SSonly" this is a table with position columns and either ONE ss index column to be used for all given interaction scores, or a SS index column per interaction score (with the interaction score as column name) 34 | # - if SS_mode == "SSsheets" this needs to be a secondary structure element RData file that also contains the beta sheet pairing table 35 | # L: number of top contacts * protein length used for tertiary contact restraints, can be a vector, a simulation per score x L combination will be started [additionally, if there is a "control" score, the script will create a negative control, i.e. 
L = 0] 36 | # predictorXlength: this can be a data.table with first column indicating interaction score, second column indicate L; this will rerun only the specific score:L combinations indicated instead of all scores versus all L + negative control 37 | # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/ 38 | # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc) 39 | # protein_sequence: amino acid sequence of the protein to be modeled, used by XPLOR 40 | # pdb_file (optional): reference structure pdb file to compare structural models to in terms of RMSD and template modeling score 41 | # cores: number of cores to request on the computing cluster, max is 16 42 | # queue: which queues to submit jobs to 43 | # numberOfStructures: how many structures to create in each of the three modeling stages 44 | # top_avg_fraction: fraction of top evaluated models (by XPLOR total energy) to use to decide restraint violations (stages 1+2) and calculate an average structural model (stage 2 as starting point for stage 3, and stage 3 as a final output) 45 | # NOE_pot_soft: for the three different stages; TRUE: use soft well potentials for distance restraints (if potentially many restraitns from false positive contacts) 46 | # home_dir: a directory on local machine in which a "tmp" folder is create to build the script/folder structure locally then copy to cluster; this is removed at the end of the script; could be "/Users/me/" 47 | # cluster_dir: (main) directory on the compute cluster into which the folder structure should be copied and where the simulations are executed 48 | # reporting_email: if not NA: email to report to about job status for qsub system 49 | # login_serveraddress: mylogin@serveraddress.com 50 | # debug_this: if TRUE, script will stop at certain points for debugging 51 | # linear_dist: minimum linear (residue) distance for defining top predicted contacts 52 | # 
dist_restraint: distance restraint (in Angstrom) for top predicted contacts 53 | 54 | require(data.table) 55 | require(ssh.utils) 56 | #load utility scripts 57 | filelist = list.files('scripts/') 58 | sapply(paste0('scripts/',filelist),source,.GlobalEnv) 59 | 60 | #create tmp directory for data structure 61 | system(command = paste0("mkdir ",home_dir,"tmp"),wait=T) 62 | system(command = paste0("mkdir ",home_dir,"tmp/",prefix),wait=T) 63 | 64 | #which interaction score to use as predictors? 65 | predictor = setdiff(names(input_PWI),c("Pos1","Pos2","WT_AA1","WT_AA2")) 66 | #create data.table with predictor x L combinations 67 | if (length(predictorXlength) == 0) { 68 | predictorXlength = data.table(expand.grid(predictor,L)) 69 | names(predictorXlength) = c("predictor","L") 70 | if (length(grep("control",predictor)) > 0) { #if preditors include controls, also create a negative control without tertiary distance restraints 71 | predictorXlength = rbind(predictorXlength,data.table(predictor = "control",L = 0)) 72 | } 73 | } else { #the table was supplied with specific combinations 74 | names(predictorXlength) = c("predictor","L") 75 | } 76 | predictorXlength[,predictor := as.character(predictor)] 77 | 78 | if (debug_this) {browser()} 79 | 80 | #set up list structure for variables and results 81 | varlist = list() 82 | varlist$cores = cores 83 | varlist$protein = prefix 84 | varlist$cluster_dir = cluster_dir 85 | varlist$numberOfStructures = numberOfStructures 86 | varlist$top_avg_fraction = top_avg_fraction 87 | varlist$NOE_pot_soft = NOE_pot_soft 88 | varlist$filename = c("anneal_stage1","anneal_stage2","refine") 89 | varlist$contacts_noe_file = "NOE_restraints" 90 | varlist$ss_dihe_file = "DIHE_restraints" 91 | 92 | 93 | varlist$protein_length = nchar(protein_sequence) 94 | ### define .seq file for XPLOR, 3letter AA names 95 | write(x = paste(toupper(sapply(strsplit(protein_sequence,"")[[1]],convert_AAabr_one_three)),collapse=" "), 96 | 
file=paste0(home_dir,"tmp/",prefix,"/protein.seq")) 97 | varlist$protein_seq = paste0(cluster_dir,prefix,"/protein.seq") #replace by fasta seq, then create .seq file and copy 98 | 99 | #copy pdb template file 100 | system(command = paste0("cp ",pdb_file," ",home_dir,"tmp/",prefix,"/template.pdb"),wait=T) 101 | varlist$pdb_file = paste0(cluster_dir,prefix,"/template.pdb") 102 | 103 | 104 | ### per combination of predictor and L, create restraint files etc 105 | for (idx in 1:nrow(predictorXlength)) { 106 | varlist$predictor = predictorXlength$predictor[idx] 107 | varlist$L = predictorXlength$L[idx] 108 | varlist$folder = paste0(cluster_dir,prefix,"/",varlist$predictor,"/L",varlist$L,"/") 109 | 110 | 111 | ### initiate lists 112 | #for restraints 113 | varlist$NOE_DT = list() 114 | varlist$NOE_DT[[1]] = data.table(weight = as.numeric(),Pos1 = as.numeric(), Pos2 = as.numeric(), 115 | atom1 = as.character(), atom2 = as.character(), 116 | dist = as.numeric(), lower_dist = as.numeric(), upper_dist = as.numeric(), 117 | Pos1_opt2 = as.numeric(), Pos2_opt2 = as.numeric(), 118 | atom1_opt2 = as.character(),atom2_opt2 = as.character(),type = as.character()) 119 | varlist$DIHE_DT = list() 120 | varlist$DIHE_DT[[1]] = data.table(weight = as.numeric(),position = as.numeric(),angle=as.numeric(), 121 | delta_angle=as.numeric(),type = as.character(),ss = as.character()) 122 | #for evaluation 123 | varlist$energy_XPLOR = list() 124 | varlist$violations = list() 125 | 126 | ##### secondary structure elements 127 | if (SS_mode == "SSonly") { 128 | ### only secondary structure 129 | if (varlist$predictor == "control") { 130 | files = list.files(path = paste0(dataset_dir,"processed_data/")) 131 | load(paste0(dataset_dir,"processed_data/",files[grep("secondary_structure_elements_control.*RData",files)])) 132 | input_SS = ss_elements 133 | } else { 134 | input_SS = fread(paste0(dataset_dir,"processed_data/",input_SS_file)) 135 | } 136 | ### define secondary structure restraints 137 | if 
(ncol(input_SS) > 2){ 138 | varlist$SS_pred = input_SS[,.(position = Pos,ss = .SD),,.SDcols = varlist$predictor] 139 | } else if (ncol(input_SS) == 2) { 140 | varlist$SS_pred = input_SS[,.(position = Pos,ss = .SD),,.SDcols = 2] 141 | } else { 142 | print("number secondary structure inputs doesn't match the number of input features") 143 | secondary_structure = error 144 | } 145 | varlist$SS_pred[,rleidx := rleid(ss)] 146 | } else if (SS_mode == "SSsheets") { ## also include beta sheet hbonding 147 | 148 | ### secondary structure and beta sheet hbonding 149 | if (varlist$predictor == "control") { 150 | files = list.files(path = paste0(dataset_dir,"processed_data/")) 151 | load(paste0(dataset_dir,"processed_data/",files[grep("secondary_structure_elements_control.*RData",files)])) 152 | beta_hbonds = hbonds_from_betasheetpairing(beta_sheet_pairing) 153 | } else { 154 | load(paste0(dataset_dir,"processed_data/",input_SS_file,"secondary_structure_elements_",varlist$predictor,".RData")) 155 | beta_hbonds = hbonds_from_betasheetpairing(beta_sheet_pairing,ss_data) 156 | } 157 | # secondary structure 158 | varlist$SS_pred = ss_elements 159 | names(varlist$SS_pred) = c("position","ss") 160 | varlist$SS_pred[,rleidx := rleid(ss)] 161 | # write beta sheet hbonding to restraints list 162 | if (nrow(beta_hbonds)>0) { 163 | varlist$NOE_DT[[1]] = rbind(varlist$NOE_DT[[1]], data.table(weight=1, Pos1=beta_hbonds$hn_opt1,Pos2=beta_hbonds$o_opt1, atom1="hn", atom2="o", 164 | dist=2,lower_dist=0.2, upper_dist = 0.1, 165 | Pos1_opt2=beta_hbonds$hn_opt2,Pos2_opt2=beta_hbonds$o_opt2, 166 | atom1_opt2="hn", atom2_opt2="o",type = "beta sheet hbond")) 167 | #XPLOR doesn't create an O for last position when using seq2PSF 168 | varlist$NOE_DT[[1]] = varlist$NOE_DT[[1]][(is.na(Pos2_opt2) & !(Pos2 == varlist$protein_length)) | 169 | (!is.na(Pos2_opt2) & !(Pos2 == varlist$protein_length | Pos2_opt2 == varlist$protein_length))] 170 | } 171 | } 172 | 173 | ##### write phi/psi dihedral angle 
restraints for secondary structure 174 | # angle values +- delta for PSI and PHI angles in secondary structure elements 175 | # these values are taken from Table1 of "CONFOLD: Residue-residue contact-guided ab initio protein folding", Adhikari et al. 2015 176 | #alpha 177 | phiH=-63.5 178 | dphiH=4.5 179 | psiH=-41.5 180 | dpsiH=5 181 | #beta 182 | phiE=-118 183 | dphiE=10.7 184 | psiE=134 185 | dpsiE=8.6 186 | 187 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="E" & position > 1, 188 | .(weight=1,position,angle = phiE,delta_angle = dphiE,type="phi",ss="beta")]) 189 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="E" & position < varlist$protein_length, 190 | .(weight=1,position,angle = psiE,delta_angle = dpsiE,type="psi",ss="beta")]) 191 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="H" & position > 1, 192 | .(weight=1,position,angle = phiH,delta_angle = dphiH,type="phi",ss="alpha")]) 193 | varlist$DIHE_DT[[1]] = rbind(varlist$DIHE_DT[[1]],varlist$SS_pred[ss=="H" & position < varlist$protein_length, 194 | .(weight=1,position,angle = psiH,delta_angle = dpsiH,type="psi",ss="alpha")]) 195 | 196 | # if (debug_this) {browser()} 197 | #write O-O restraints in beta strands 198 | pos = varlist$SS_pred[,.(ss,next_E = varlist$SS_pred[position == (unlist(.SD)+1),ss=="E"]), 199 | position,.SDcols = "position"][ss=="E" & next_E & position < varlist$protein_length-1,position] 200 | if (length(pos) > 0) { 201 | varlist$NOE_DT[[1]] = rbind(varlist$NOE_DT[[1]], data.table(weight=1,Pos1=pos,Pos2=pos+1,atom1 = "o", atom2 = "o", 202 | dist = 4.5,lower_dist=0.1, upper_dist=0.1, 203 | Pos1_opt2 = NA, Pos2_opt2 = NA, atom1_opt2 = NA,atom2_opt2 = NA, type = "beta strand")) 204 | } 205 | 206 | ###### define restraints from top predicted contacts (Cbeta distances) ###### 207 | # get scores for predictor (only those > linear_dist apart in linear sequence) 208 | PWI = input_PWI[Pos1 < 
##########################################################################
####### plot marginal distribution of the number of variants suitable ####
####### for epistasis classification over all position pairs #############
##########################################################################
epistasis_analytics_NumEvars_marginal = function(doubles,
                                                 dataset_dir,
                                                 prefix = "",
                                                 modus = "cis") {
  ### variables
  # doubles: doubles data.table
  # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/
  # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc)
  # modus: "cis" (single protein) or "trans" (protein-protein interaction)

  theme_set(theme_classic())

  # in cis, symmetrize the doubles table so every position pair is represented in both orientations;
  # in trans, the orientation is meaningful, so just copy the filtered table
  if (modus == "cis") {
    doubles_sym = switch_double_DT(doubles[is.reads0==T & is.fitness == T],
                                   list(c("Pos1","Pos2"),c("fitness1","fitness2")),
                                   c("pos_epistasis","neg_epistasis"))
  } else {
    doubles_sym = copy(doubles[is.reads0==T & is.fitness == T])
  }

  # per position pair: total number of variants and the counts usable
  # for positive / negative epistasis classification
  DT_numbervars = doubles_sym[,.(num_all = .N,
                                 num_posE = sum(pos_epistasis==T),
                                 num_negE = sum(neg_epistasis==T)),
                              .(Pos1,Pos2)]

  # long format for overlaying the three count distributions in one density plot
  DT_numbervars_melt = melt(DT_numbervars,
                            id.vars = "Pos1",
                            measure.vars = c("num_all","num_posE","num_negE"))

  ggplot(DT_numbervars_melt,aes(value,color=variable,..count..)) +
    geom_density(adjust=0.5) +
    scale_color_manual(breaks=c("num_all","num_posE","num_negE"),values = c("black","red","gold"),
                       labels = c(paste0("all, = ",DT_numbervars[,round(mean(num_all))]),
                                  paste0("pos.E, = ",DT_numbervars[,round(mean(num_posE))]),
                                  paste0("neg.E, = ",DT_numbervars[,round(mean(num_negE))]))) +
    scale_x_continuous(limits = c(0,361),breaks = seq(0,350,50),expand = c(0,0)) +
    scale_y_continuous(breaks = seq(0,350,50),expand = c(0,0)) +
    labs(x="number of double mutants per position pair",y="density [a.u.]",
         color = "data subset")
  ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"number_epistatic_variants.pdf"),width=5,height=3)
}

####################################################################
####### number of variants suitable for epistasis classification ###
####### versus single mutant fitness ##############################
epistasis_analytics_NumEvars_fitness = function(doubles,
                                                dataset_dir,
                                                prefix = "",
                                                modus = "cis") {
  ### variables
  # doubles: doubles data.table
  # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/
  # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc)
  # modus: "cis" (single protein) or "trans" (protein-protein interaction)

  theme_set(theme_classic(base_size = 9))

  if (modus == "cis") {
    # symmetrize so each position appears in Pos1; median single-mutant fitness per pair comes from fitness1
    doubles_sym = switch_double_DT(doubles[is.reads0==T & is.fitness == T],
                                   list(c("Pos1","Pos2"),c("fitness1","fitness2")),
                                   c("pos_epistasis","neg_epistasis"))
    DT_numbervars = doubles_sym[,.(num_all = .N,
                                   num_posE = sum(pos_epistasis==T),
                                   num_negE = sum(neg_epistasis==T),
                                   median_fitness = median(fitness1,na.rm=T),
                                   variants = "all"),
                                .(Pos1,Pos2)]

    ##### fitness versus number of variants, aggregated per position
    DT_mN_mF = DT_numbervars[,.(median_N = median(num_all),median_fitness = mean(median_fitness),variants = "all"),Pos1]
    DT_mN_mF = rbind(DT_mN_mF,DT_numbervars[,.(median_N = median(num_posE),median_fitness = mean(median_fitness),variants = "posE"),Pos1])
    DT_mN_mF = rbind(DT_mN_mF,DT_numbervars[,.(median_N = median(num_negE),median_fitness = mean(median_fitness),variants = "negE"),Pos1])
    DT_mN_mF[,variants := factor(variants,levels = c("all","posE","negE"))]

    ggplot(DT_mN_mF,aes(median_fitness,median_N,color = variants)) +
      geom_point() +
      scale_color_manual(values = c("black","red","gold")) +
      geom_smooth(se=F) +
      labs(x = "median single mutant fitness at position",
           y = "median # of double mutants in pairs involving position")
    ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"fitness_vs_numberdoublemutants.pdf"),width=5,height=4)

  } else {
    # trans: the two proteins are distinct, so aggregate (and plot) each side separately
    DT_numbervars = doubles[is.reads0==T & is.fitness == T,
                            .(num_all = .N,
                              num_posE = sum(pos_epistasis==T),
                              num_negE = sum(neg_epistasis==T),
                              median_fitness1 = median(fitness1,na.rm=T),
                              median_fitness2 = median(fitness2,na.rm=T),
                              variants = "all"),
                            .(Pos1,Pos2)]

    ##### fitness versus number of variants, protein 1 side
    DT_mN_mF1 = DT_numbervars[,.(median_N = median(num_all),median_fitness = mean(median_fitness1),variants = "all"),Pos1]
    DT_mN_mF1 = rbind(DT_mN_mF1,DT_numbervars[,.(median_N = median(num_posE),median_fitness = mean(median_fitness1),variants = "posE"),Pos1])
    DT_mN_mF1 = rbind(DT_mN_mF1,DT_numbervars[,.(median_N = median(num_negE),median_fitness = mean(median_fitness1),variants = "negE"),Pos1])
    DT_mN_mF1[,variants := factor(variants,levels = c("all","posE","negE"))]

    ggplot(DT_mN_mF1,aes(median_fitness,median_N,color = variants)) +
      geom_point() +
      scale_color_manual(values = c("black","red","gold")) +
      geom_smooth(se=F) +
      labs(x = "median single mutant fitness at position",
           y = "median number of double mutants in position pairs involving position")
    ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"fitness_vs_numberdoublemutants_protein1.pdf"),width=5,height=4)

    ##### fitness versus number of variants, protein 2 side
    DT_mN_mF2 = DT_numbervars[,.(median_N = median(num_all),median_fitness = mean(median_fitness2),variants = "all"),Pos2]
    DT_mN_mF2 = rbind(DT_mN_mF2,DT_numbervars[,.(median_N = median(num_posE),median_fitness = mean(median_fitness2),variants = "posE"),Pos2])
    DT_mN_mF2 = rbind(DT_mN_mF2,DT_numbervars[,.(median_N = median(num_negE),median_fitness = mean(median_fitness2),variants = "negE"),Pos2])
    DT_mN_mF2[,variants := factor(variants,levels = c("all","posE","negE"))]

    ggplot(DT_mN_mF2,aes(median_fitness,median_N,color = variants)) +
      geom_point() +
      scale_color_manual(values = c("black","red","gold")) +
      geom_smooth(se=F) +
      labs(x = "median single mutant fitness at position",
           y = "median number of double mutants in position pairs involving position")
    ggsave(paste0(dataset_dir,"results/epistasis/",prefix,"fitness_vs_numberdoublemutants_protein2.pdf"),width=5,height=4)
  }
}
(beta_sheet_pairing[i,type=="anti-par"]) { #anti-parallel sheet 13 | 14 | if (L %% 2 == 1) { #if uneven, decide which side to keep; 15 | 16 | if (length(ss_data) > 0) { #use the one wiht lower p_value 17 | p_start = ss_data[Pos1 == beta_sheet_pairing[i,pos1_min] & Pos2 == beta_sheet_pairing[i,pos2_max],beta_antipar_p] 18 | p_end = ss_data[Pos1 == beta_sheet_pairing[i,pos1_max] & Pos2 == beta_sheet_pairing[i,pos2_min],beta_antipar_p] 19 | if (p_start < p_end) { 20 | beta_sheet_pairing[i,':=' (pos1_max = pos1_max - 1, pos2_min = pos2_min + 1)] 21 | } else { 22 | beta_sheet_pairing[i,':=' (pos2_max = pos2_max - 1, pos1_min = pos1_min + 1)] 23 | } 24 | } else { #without additional info, keep lower position 25 | beta_sheet_pairing[i,':=' (pos2_max = pos2_max - 1, pos1_min = pos1_min + 1)] 26 | } 27 | L=L-1 28 | } 29 | beta_hbonds = rbind(beta_hbonds,beta_sheet_pairing[i,.(hn_opt1 = c(seq(pos1_min,pos1_max,2),seq(pos2_max,pos2_min,-2)), 30 | o_opt1 = c(seq(pos2_max,pos2_min,-2),seq(pos1_min,pos1_max,2)), 31 | hn_opt2 = c(seq(pos1_min+1,pos1_max,2),seq(pos2_max-1,pos2_min,-2)), 32 | o_opt2 = c(seq(pos2_max-1,pos2_min,-2),seq(pos1_min+1,pos1_max,2)), 33 | sheet=i,hbond=nrow(beta_hbonds)+1:L)]) 34 | 35 | } else { #parallel sheet 36 | beta_hbonds = rbind(beta_hbonds,beta_sheet_pairing[i,.(hn_opt1 = c(seq(pos2_min+1,pos2_max,2),seq(pos1_min+2,pos1_max,2)), 37 | o_opt1 = c(seq(pos1_min,pos1_max-1,2),seq(pos2_min+1,pos2_max-1,2)), 38 | hn_opt2 = c(seq(pos1_min+1,pos1_max,2),seq(pos2_min+2,pos2_max,2)), 39 | o_opt2 = c(seq(pos2_min,pos2_max-1,2),seq(pos1_min+1,pos1_max-1,2)), 40 | sheet=i,hbond=nrow(beta_hbonds)+1:(L-1))]) 41 | } 42 | } 43 | if (nrow(beta_sheet_pairing) > 1) { 44 | #create all possible combinations of sheets 45 | require(combinat) 46 | sheet_comb = list() 47 | for (i in 1:(nrow(beta_sheet_pairing)-1)) { 48 | x=combn(1:nrow(beta_sheet_pairing),i) 49 | sheet_comb = c(sheet_comb,split(x, rep(1:ncol(x), each = nrow(x)))) 50 | } 51 | #swap options until hbonding is 
################################################################################################
############### function for expanding seeds and finding most significant stretches ############
################################################################################################
## subfunctions to predict_secondary_structure / sheets functions
identify_expand_seeds = function(any_strand,seed_size = 3,p_threshold = 0.05,bridge_dist = 1,max_extension = 1) {

  # greedy seed-and-extend: repeatedly take the most significant seed window,
  # grow it while the combined p-value improves, then record/merge the strand
  idx=1
  any_strand_temp = copy(any_strand)
  any_strand[,':=' (strand = as.integer(NA), p_strand = as.numeric(NA))]
  while (min(any_strand_temp$p_seed,na.rm=T) < p_threshold) {

    # best remaining seed window and the positions it covers
    psum = any_strand_temp[,min(p_seed,na.rm=T)]
    positions = any_strand_temp[which.min(p_seed),seq(pos-(seed_size-1)/2,pos+(seed_size-1)/2,1)]

    # candidate extensions by one position at either end
    positions_down = c(positions,min(positions)-1)
    positions_up = c(positions,max(positions)+1)
    extension = 0
    while (extension <= max_extension) {

      # combined p-value (Fisher's method via sumlog) of each candidate extension
      psum_down = any_strand_temp[pos %in% positions_down & !is.na(p_ind),if(.N>1){sumlog(p_ind)$p}else{p_ind}]
      psum_up = any_strand_temp[pos %in% positions_up & !is.na(p_ind),if(.N>1){sumlog(p_ind)$p}else{p_ind}]
      if (psum_down < min(psum,psum_up)) {
        positions = positions_down
        psum = psum_down
        extension = 0
      } else if (psum_up < min(psum,psum_down)) {
        positions = positions_up
        psum = psum_up
        extension = 0
      } else {
        # neither direct extension helps; look one step further before giving up
        extension = extension+1
      }
      positions_down = c(positions_down,min(positions)-1-extension)
      positions_up = c(positions_up,max(positions)+1+extension)
    }

    # record the strand
    if (idx == 1) {
      any_strand[pos %in% positions,':=' (strand = idx, p_strand = psum)]
      idx=idx+1
    } else {
      # merge with adjacent strands if closer than bridge_dist
      closeby_strands = unique(any_strand[!is.na(strand)][,.(min_dist=min(abs(pos-positions)),strand),pos][min_dist<= bridge_dist,strand])
      if (length(closeby_strands) > 0) {
        if (length(closeby_strands) > 1) {
          any_strand[strand %in% closeby_strands,strand := min(closeby_strands)]
        }
        strand_idx = min(closeby_strands)
        any_strand[pos %in% positions,':=' (strand = strand_idx)]
        any_strand[strand == strand_idx, p_strand := sumlog(p_ind[!is.na(p_ind)])$p, strand]
      } else {
        # otherwise open a new strand
        strand_idx = idx
        any_strand[pos %in% positions,':=' (strand = strand_idx)]
        any_strand[strand == strand_idx, p_strand := sumlog(p_ind[!is.na(p_ind)])$p, strand]
        idx=idx+1
      }
    }

    # mark consumed positions so they cannot seed or extend again
    any_strand_temp[pos %in% positions,p_ind := NA]
    any_strand_temp[pos %in% positions,p_seed:=NA]
  }

  # renumber strands consecutively from 1
  strand_vec = unique(any_strand[!is.na(strand),strand])
  if (length(strand_vec) > 1) {
    for (i in seq_along(strand_vec)) {any_strand[strand==strand_vec[i],strand_new:=i]}
    any_strand[,strand:=strand_new]
    any_strand[,strand_new:=NULL]
  }
  return(any_strand)
}
#################################################################
### a modified version of epistasis classification for "symmetrical" data with detrimental and beneficial variants ###
#################################################################

call_epistasis_symdata_v1 = function(double_data,lower_bound_F,upper_bound_F,output_dir,prefix = "",xsig=2,sym = T,Q = 0.05, epistasis_error_from_slopes = T) {

  #### version for "symmetrical" data with detrimental and beneficial variants
  # this has a modified version of epistasis classification and needs to know the lower and upper (!) bound of the fitness data

  # double_data: the doubles data.table
  # lower_bound_F: an estimate of the lower bound of the fitness assay
  # upper_bound_F: an estimate of the upper bound of the fitness assay
  # output_dir: base-output directory, like "GB1/", it will put results output_dir/results/epistasis/
  # prefix: to be added to results files (in case of running diff. versions of data from same dataset etc)
  # xsig: significance threshold for calling epistasis, 2 is fine
  # sym: logical for whether the position-position fitness map is symmetrical (only FALSE for protein-protein interactions!!)
  # Q: the percentile used for upper (1-Q) and lower (Q) fitness surface calculation
  # epistasis_error_from_slopes: logical, should sigmaE (error for epistasis values) be calculated by taking slopes of median fitness surface into account

  set.seed(1603)

  require(data.table)
  require(mgcv)
  require(caTools)

  # symmetrize the double mutant table unless the map is asymmetric (trans / PPI data)
  if (sym == T) {
    DT = switch_double_DT(double_data[is.fitness == TRUE & is.reads0 == TRUE],
                          cols_switchdouble = list(c("fitness1","fitness2"),c("sigma1","sigma2")),
                          cols_double = c("fitness","sigma"))
  } else {
    DT = copy(double_data[is.fitness == TRUE & is.reads0 == TRUE])
  }

  ### use loess instead, 2d-gam is very inaccurate
  # fit on a subsample for speed, predict on the full table
  subDT = DT[sample(nrow(DT),min(c(100000,nrow(DT)))),.(fitness1,fitness2,fitness)]
  F_fit_loess_model = loess(fitness ~ fitness1 + fitness2,data=subDT,span=0.2)
  double_data[is.fitness == TRUE & is.reads0 == TRUE,
              F_fit_loess := predict(F_fit_loess_model,newdata = .SD),,.SDcols = c("fitness1","fitness2")]

  ### >> calculate A-B-AB surface as median surface of gam-corrected surface
  Nq = 100                               # grid points along one axis
  Nv = 500000                            # variants used in estimation of surface
  span = max(c(0.01,500/nrow(DT)))       # fraction of nearest neighbours to use for median calculation
  # Q: quantiles used are Q and 1-Q (e.g. 0.05 and 0.95)

  # predict loess surface fitness and correct fitness for it
  DT[,F_fit_loess := predict(F_fit_loess_model,newdata = .SD),,.SDcols = c("fitness1","fitness2")]
  DT = DT[,fitness_norm := fitness-F_fit_loess]

  # calculate quantile fitness surfaces on the loess-corrected data
  List = quantile_fitness_surface_adaptive(DT,Nq,Nv,span,Q)

  # add back the loess trend to express the surfaces on the original fitness scale
  double_data[is.fitness == TRUE & is.reads0==TRUE,
              F_fit_median := predict(List$F_median_fit,newdata = .SD) + F_fit_loess,,.SDcols = c("fitness1","fitness2")]
  double_data[is.fitness == TRUE & is.reads0==TRUE,
              F_fit_lower := predict(List$F_lower_fit,newdata = .SD) + F_fit_loess,,.SDcols = c("fitness1","fitness2")]
  double_data[is.fitness == TRUE & is.reads0==TRUE,
              F_fit_upper := predict(List$F_upper_fit,newdata = .SD) + F_fit_loess,,.SDcols = c("fitness1","fitness2")]

  if (epistasis_error_from_slopes) {
    # numerical slope of the median surface (finite difference, step 0.01)
    # along fitness1, for error propagation from the single mutant errors
    f1 = predict(List$F_median_fit,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1 = fitness1 + 0.01,fitness2)]) +
      predict(F_fit_loess_model,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1 = fitness1 + 0.01,fitness2)])
    double_data[is.fitness == TRUE & is.reads0==TRUE,slope1 := abs(F_fit_median - f1)/0.01]

    # same along fitness2
    f2 = predict(List$F_median_fit,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1,fitness2 = fitness2+ 0.01)]) +
      predict(F_fit_loess_model,newdata = double_data[is.fitness == TRUE & is.reads0==TRUE,.(fitness1,fitness2 = fitness2 + 0.01)])
    double_data[is.fitness == TRUE & is.reads0==TRUE,slope2 := abs(F_fit_median - f2)/0.01]

    # propagate single mutant errors through the surface slopes
    double_data[,sigmaE := sqrt(sigma^2 + slope1^2 * sigma1^2 + slope2^2 * sigma2^2)]
  } else {
    double_data[,sigmaE := sqrt(sigma^2 + sigma1^2 + sigma2^2)]
  }

  #####################################################################
  ######## define data subsets for positive/negative epistasis ########
  #####################################################################

  # estimate the width (95 percent quantile) of the lower limit "background"
  lowerlimit_background_cutoff=double_data[is.fitness==TRUE & is.reads0 == TRUE &
                                             (fitness1 + xsig*sigma1 + fitness2 + xsig*sigma2) < lower_bound_F,quantile(fitness,probs=0.95,na.rm=T)]

  # same for the upper limit background
  upperlimit_background_cutoff=double_data[is.fitness==TRUE & is.reads0 == TRUE &
                                             (fitness1 - xsig*sigma1 + fitness2 - xsig*sigma2) > upper_bound_F,quantile(fitness,probs=0.95,na.rm=T)]

  # mark variants for positive epistasis analysis:
  # only requirement is that the upper surface is not too high into the upper fitness background
  double_data[is.fitness==TRUE & is.reads0 == TRUE,pos_epistasis := FALSE]
  double_data[is.fitness==TRUE & is.reads0 == TRUE &
                F_fit_upper < upperlimit_background_cutoff,
              pos_epistasis := TRUE]

  # mark variants for negative epistasis analysis:
  # only requirement is that the lower surface is not too low into the lower fitness background
  double_data[is.fitness==TRUE & is.reads0 == TRUE,neg_epistasis := FALSE]
  double_data[is.fitness==TRUE & is.reads0 == TRUE &
                F_fit_lower > lowerlimit_background_cutoff,
              neg_epistasis := TRUE]

  ########################################################
  ################# plot fitness surfaces ################
  ########################################################
  plot_fitness_surface(double_data,F_fit_loess_model,List,output_dir,prefix)

  # epistasis score and significant positive/negative classifications
  double_data[is.fitness==TRUE & is.reads0 == TRUE,epistasis := fitness - F_fit_median]
  double_data[pos_epistasis == TRUE,pos_epistasis_sig := fitness-F_fit_upper > 0]
  double_data[neg_epistasis == TRUE,neg_epistasis_sig := fitness-F_fit_lower < 0]

  return(double_data)
}
#################################################################
### convert pair distances into contact and distance matrices ###
#################################################################

contact_matrix_from_pairdistances <- function(input_file,
                                              dataset_dir,
                                              idx_start = 1,
                                              dist_type = c("scHAmin", "HAmin", "CB"),
                                              dist_cutoff = 4.5,
                                              plot = T){

  ### variables
  # input_file: pairdistances table file (read with fread), needs Pos1, Pos2 and the dist_type column
  # dataset_dir: dataset directory, like "GB1/"
  # idx_start: index of first position in contact matrix (used only for axis labels)
  # dist_type: one of the following distance types: "scHAmin", "HAmin", "CB"
  # dist_cutoff: in Angstrom, used for determining the contact map
  # plot: whether to plot the heatmaps (TRUE, FALSE)
  #### > will deposit contact maps (one for distance matrix and one for contact matrix) to dataset_dir/

  require(data.table)
  require(ggplot2)

  # resolve dist_type to a single choice (first element when the default vector is passed)
  dist_type <- match.arg(dist_type)

  # read pair distances file
  contactmap <- fread(input_file)

  # build distance matrix; index by rank of position value so positions
  # need not start at 1 or be contiguous
  all_positions <- sort(unique(unlist(contactmap[,.(Pos1, Pos2)])))
  num_positions <- length(all_positions)
  all_positions_names <- idx_start:(idx_start + num_positions - 1)
  dist_mat <- matrix(NA_real_, nrow = num_positions, ncol = num_positions)
  for (i in seq_len(num_positions)) {
    for (j in seq_len(num_positions)) {
      d <- contactmap[Pos1 == all_positions[i] & Pos2 == all_positions[j], .SD, .SDcols = dist_type]
      # guard against missing pairs: assigning numeric(0) to a cell is an error
      if (nrow(d) > 0) {
        dist_mat[i, j] <- as.numeric(d[1])
      }
    }
  }
  rownames(dist_mat) <- all_positions_names
  colnames(dist_mat) <- rownames(dist_mat)

  # binarize into a contact matrix at dist_cutoff; zero distances (self pairs) become NA
  bin_mat <- dist_mat
  bin_mat[dist_mat < dist_cutoff] <- 1
  bin_mat[dist_mat >= dist_cutoff] <- 0
  bin_mat[dist_mat == 0] <- NA

  # plot heatmap matrices (rows reversed so the diagonal runs top-left to bottom-right)
  if(plot){
    tile_heatmap_wrapper(dist_mat[rev(1:dim(dist_mat)[1]),], file.path(dataset_dir, gsub(".txt$", "_distance_matrix.pdf", basename(input_file))), width=5, height=5, xlab = "Residue position", ylab = "Residue position", colour_clip=F, cluster='none', xaxis_size=10, yaxis_size=10, xaxis_angle=90, x_breaks = all_positions_names, y_breaks = all_positions_names)
    tile_heatmap_wrapper(bin_mat[rev(1:dim(bin_mat)[1]),], file.path(dataset_dir, gsub(".txt$", "_contact_matrix.pdf", basename(input_file))), width=5, height=5, xlab = "Residue position", ylab = "Residue position", colour_clip=F, cluster='none', xaxis_angle=90, xaxis_size=10, yaxis_size=10, x_breaks = all_positions_names, y_breaks = all_positions_names)
  }

  # return both matrices
  return(list(distance_matrix = dist_mat, contact_matrix = bin_mat))
}
# NOTE(review): this chunk is the interior of kernel_structure_propensity() (header on the
# preceding dump line, tail on the following one). The statement numbered 50 below,
#   ss_data[,within_kernel := (Pos1-i)=0 & (Pos2-i)=0]
# is not valid R and appears HTML-mangled (comparison operators stripped from the dump) --
# reconstruct it from the upstream repository before running. Note also that rand_strategy
# is compared against the (misspelled) literal "kernal_width", matching the function's
# argument default -- do not "fix" the spelling in only one of the two places.
29 | eval_cols = setdiff(names(PWI),c("Pos1","Pos2","WT_AA1","WT_AA2","NposE","NnegE")) 30 | 31 | for (eval_cols_idx in seq_along(eval_cols)) { 32 | print(eval_cols[eval_cols_idx]) 33 | ss_data = copy(PWI[Pos1<=Pos2,.(Pos1,Pos2,input = .SD),,.SDcols = eval_cols[eval_cols_idx]]) 34 | setkey(ss_data,Pos1,Pos2) 35 | #Perpendicular distance from diagonal 36 | ss_data[,pos_perp := abs(Pos1-Pos2)] 37 | #position range for prediction 38 | kernel_length <- dim(kernel)[1] 39 | data_range = c(min(c(ss_data$Pos1,ss_data$Pos2)),max(c(ss_data$Pos1,ss_data$Pos2))) 40 | data_length = length(data_range[1]:data_range[2]) 41 | pos_range = c(min(c(ss_data$Pos1,ss_data$Pos2))-(kernel_length-2),max(c(ss_data$Pos1,ss_data$Pos2))-1) 42 | 43 | if (debug_this) {browser()} 44 | 45 | #Kernel structure propensities 46 | set.seed(1603) 47 | for (i in pos_range[1]:pos_range[2]) { 48 | 49 | #Square distances from center position 50 | ss_data[,within_kernel := (Pos1-i)=0 & (Pos2-i)=0] 51 | 52 | #Construct kernel matrix 53 | kernel_mat <- matrix(NA, nrow=data_length, ncol=data_length) 54 | colnames(kernel_mat) <- data_range[1]:data_range[2] 55 | rownames(kernel_mat) <- colnames(kernel_mat) 56 | within_kernel_mat <- kernel_mat 57 | within_kernel_mat[is.na(within_kernel_mat)] <- FALSE 58 | for(j in 1:kernel_length){ 59 | for(k in 1:kernel_length){ 60 | j_shift <- j + i - 1 61 | k_shift <- k + i - 1 62 | if(j_shift>=data_range[1] & j_shift<=data_range[2] & k_shift>=data_range[1] & k_shift<=data_range[2]){ 63 | kernel_mat[j_shift, k_shift] <- kernel[j, k] 64 | if(j_shift!=k_shift){ 65 | within_kernel_mat[j_shift, k_shift] <- TRUE 66 | } 67 | } 68 | } 69 | } 70 | 71 | #Determine kernel weights 72 | ss_data[Pos1 %in% data_range[1]:data_range[2] & Pos2 %in% data_range[1]:data_range[2],within_data := T] 73 | ss_data[!is.na(ss_data$within_data), kernel_weight := kernel_mat[cbind(Pos1,Pos2)]] 74 | ss_data[!is.na(ss_data$within_data), within_kernel := within_kernel_mat[cbind(Pos1,Pos2)]] 75 | 76 | #Calculate
kernel smoothed value for true data 77 | if(dim(ss_data[Pos1==i & Pos2==i])[1]==0){ 78 | ss_data <- rbind(ss_data, list("Pos1"=i, "Pos2"=i, "within_kernel"=F, "kernel_score" = ss_data[within_kernel==T,sum(input*kernel_weight,na.rm=T)]), fill = T) 79 | }else{ 80 | ss_data[Pos1==i & Pos2==i,kernel_score := ss_data[within_kernel==T,sum(input*kernel_weight,na.rm=T)]] 81 | } 82 | 83 | #Calculate kernel smoothed value for random distributions 84 | B = copy(ss_data[within_kernel==T,.(kernel_weight,input)]) 85 | if(rand_strategy=="all_data"){ 86 | sample_matrix = matrix(sample(ss_data[Pos1!=Pos2,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples) 87 | } 88 | if(rand_strategy=="kernal_width"){ 89 | sample_matrix = matrix(sample(ss_data[Pos1!=Pos2 & pos_perp<=max(ss_data[within_kernel==T,pos_perp]),c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples) 90 | } 91 | if(rand_strategy=="within_kernel"){ 92 | sample_matrix = matrix(sample(ss_data[Pos1!=Pos2 & within_kernel==T,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples) 93 | } 94 | kernel_sampled = colSums(sample_matrix * matrix(rep(t(B[,kernel_weight]),Nsamples),nrow=nrow(B),ncol=Nsamples),na.rm=T) 95 | 96 | #P-value for true value 97 | ss_data[Pos1==i & Pos2==i,kernel_p := sum(kernel_sampled >= kernel_score)/Nsamples] 98 | } 99 | #Avoid -Inf if logging p values by setting those positions smaller than all random samples to smallest non-zero pvalue 100 | ss_data[kernel_p == 0 ,kernel_p := 1/Nsamples] 101 | #Save 102 | ss_data[,(paste0(eval_cols[eval_cols_idx], "_kernel_score")) := kernel_score] 103 | ss_data[,(paste0(eval_cols[eval_cols_idx], "_kernel_p")) := kernel_p] 104 | ss_data[,Pos := Pos1] 105 | #Restrict to desired rows and columns, sort, save 106 | ss_data <- ss_data[Pos1==Pos2,.SD,.SDcols=names(ss_data)[grep("^Pos$|_kernel_score$|_kernel_p$", names(ss_data))]] 107 | setkey(ss_data, Pos) 108 | ss_data_list[[eval_cols[eval_cols_idx]]] <- ss_data 109 | } 110 |
111 | #Merge DT lists 112 | ss_data_merge <- Reduce(function(...) merge(..., all = T), ss_data_list) 113 | #Write to file 114 | write.table(paste0(dataset_dir,"processed_data/",prefix,"kernel_structure_propensity.txt"), 115 | x = ss_data_merge,quote = F,row.names = F,col.names = T) 116 | return(ss_data_merge) 117 | } -------------------------------------------------------------------------------- /scripts/misc/pairdistances_from_PDB_crystal.R: -------------------------------------------------------------------------------- 1 | 2 | ################################################################# 3 | ### extract position-pair distances for polymer from PDB file ### 4 | #################################################################

pairdistances_from_PDB_crystal = function(input_file,
                                          dataset_dir,
                                          aa_seq,
                                          idx_pdb_start = 1,
                                          idx_DMS_start = 1,
                                          idx_DMS_end = NA,
                                          dist_cutoff = 8,
                                          debug_this=F,
                                          suffix = ""){

  ### variables
  # input_file: PDB file of a crystal structure containing one or more copies of the monomer
  # dataset_dir: dataset directory, like "GB1/",
  #### > will deposit three .txt files (one for pair distances and one for secondary structure of pseudomonomer, one for pair distances of polymer) to dataset_dir/processed_data/ and plot contactmaps to /dataset_dir/results/preprocessing/
  # aa_seq: amino acid sequence of reference (monomer) structure (in DMS data) -- this is compared to inferred monomer sequence from PDB file and produces error if they do not agree
  # idx_pdb_start: first position in PDB to consider (WARNING: argument passed to pairdistances_from_PDB)
  # idx_DMS_start: first position in reference sequence to consider (WARNING: argument passed to pairdistances_from_PDB)
  # idx_DMS_end: last position in reference sequence to consider; if NA (default), the full reference sequence is compared to the PDB file sequence (WARNING: argument passed to pairdistances_from_PDB)
  # dist_cutoff: in Angstrom, used for plotting the contact map (WARNING: argument passed to pairdistances_from_PDB)
  # debug_this: if TRUE, the function will stop after printing comparison between PDB seq and DMS seq to adjust position indices if necessary (WARNING: argument passed to pairdistances_from_PDB)
  # suffix: to be added to output file names (WARNING: argument passed to pairdistances_from_PDB)

  require(data.table)
  #read.pdb()/write.pdb() below come from Rpdb; attach it here instead of relying on the caller
  require(Rpdb)

  #Record types treated as atoms
  atom_lns <- c("ATOM")

  ## Reformat PDB file into a single-chain "pseudomonomer"
  #Read PDB file
  pdb_tab <- read.pdb(input_file)
  #Get all atoms
  temp_atoms <- pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,]
  #Renumber residues strictly ascending and collapse everything onto chain "A",
  #so that all monomer copies form one continuous chain
  resid_rle <- rle(temp_atoms$resid)
  pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,]$resid <- rep(resid_rle$values[1]:(resid_rle$values[1]+length(resid_rle$values)-1), times = resid_rle$lengths)
  pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,]$chainid <- "A"
  #Amino acid sequence of pseudomonomer: repeat the monomer sequence once per copy in the structure
  pseudomonomer_length <- length(unique(pdb_tab$atoms[pdb_tab$atoms$recname %in% atom_lns,c("resname", "resid")])$resname)
  aa_seq_pseudomonomer <- paste0(rep(aa_seq, pseudomonomer_length/nchar(aa_seq)), collapse = "")
  #Write pseudomonomer to PDB file
  input_file_pseudomonomer <- file.path(dataset_dir, "processed_data", paste0(strsplit(basename(input_file), "\\.")[[1]][1], "_pseudomonomer", suffix, ".pdb"))
  write.pdb(pdb_tab, file = input_file_pseudomonomer)

  #Get pair distances for the pseudomonomer (writes its own output files)
  pairdistances_from_PDB(input_file_pseudomonomer, dataset_dir = dataset_dir, aa_seq = aa_seq_pseudomonomer,
                         idx_pdb_start = idx_pdb_start, idx_DMS_start = idx_DMS_start, idx_DMS_end = idx_DMS_end, dist_cutoff = dist_cutoff, debug_this = debug_this, suffix = suffix)
  contactmap <- fread(file.path(dataset_dir, "processed_data", paste0("PDB_contactmap_", strsplit(basename(input_file), "\\.")[[1]][1], "_pseudomonomer_A", suffix, ".txt")))

  #Translate positions back to monomer positions (all chain copies map onto the same monomer position)
  contactmap[, Pos1 := (Pos1-1)%%nchar(aa_seq)+1]
  contactmap[, Pos2 := (Pos2-1)%%nchar(aa_seq)+1]

  ## For each distance type keep, per position pair, the minimum over all chain copies:
  ## sorting by the distance column and dropping duplicated (Pos1,Pos2) pairs retains the smallest value
  #HAmin
  setkey(contactmap, HAmin)
  contactmap_HAmin <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, HAmin, HAmin_sd)]
  #scHAmin
  setkey(contactmap, scHAmin)
  contactmap_scHAmin <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, scHAmin, scHAmin_sd)]
  #CB
  setkey(contactmap, CB)
  contactmap_CB <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, CB, CB_sd)]

  #Merge the per-type minima back into one table
  setkey(contactmap, Pos1, Pos2)
  contactmap <- contactmap[!duplicated(contactmap[,.(Pos1, Pos2)]), .(Pos1, Pos2, WT_AA1, WT_AA2, chainids)]
  setkey(contactmap_HAmin, Pos1, Pos2)
  setkey(contactmap_scHAmin, Pos1, Pos2)
  setkey(contactmap_CB, Pos1, Pos2)
  contactmap <- contactmap[contactmap_HAmin,][contactmap_scHAmin,][contactmap_CB,][,.(Pos1, Pos2, WT_AA1, WT_AA2, chainids, HAmin, scHAmin, CB, HAmin_sd, scHAmin_sd, CB_sd)]

  #Save pairwise distance table
  pdb_filename = strsplit(strsplit(input_file,"/")[[1]][length(strsplit(input_file,"/")[[1]])],"\\.")[[1]][1]
  write.table(file = paste0(dataset_dir,'processed_data/PDB_contactmap_',pdb_filename,'_A',suffix,".txt",collapse = ""),
              x = contactmap,quote = F,row.names = F,col.names = T)
}

-------------------------------------------------------------------------------- /scripts/misc/tau_specificity_score.R: -------------------------------------------------------------------------------- 1 | 2 | ############################################################# 3 | ##### Tau pattern-specificity score (Yanai, et al 2004) ### 4 | ############################################################# 5 | 6 | tau_specificity_score <- function(x, 7 | min_length=3){ 8 | 9 | ### variables 10 | # x: vector of scores 11 | # min_length: minimum number of scores (otherwise returns NA) 12 | 13 | x <- x[!is.na(x)] 14 |
if(length(x)>=min_length){ 15 | if(max(x)==0){ 16 | return(NA) 17 | }else{ 18 | return(sum(1-x/max(x))/(length(x)-1)) 19 | } 20 | }else{ 21 | return(NA) 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/misc/tile_heatmap_wrapper.R: -------------------------------------------------------------------------------- 1 | 2 | ################################### 3 | ### ggplot tile heatmap wrapper ### 4 | ################################### 5 |

tile_heatmap_wrapper<-function(input_matrix,
                               output_file,
                               width=10,
                               height=4,
                               units="in",
                               colour_clip=4,
                               cluster='both',
                               xlab='x',
                               ylab='y',
                               xtick_labels=NULL,
                               ytick_labels=NULL,
                               colour_type='continuous',
                               colour_low='blue',
                               colour_high='red',
                               colour_mid='white',
                               colour_limits=NULL,
                               mono=F,
                               na_colour="grey50",
                               xaxis_angle=330,
                               xaxis_hjust=0,
                               xaxis_vjust=NULL,
                               xaxis_size=5,
                               omit_xtext=F,
                               omit_xticks=F,
                               yaxis_angle=NULL,
                               yaxis_hjust=NULL,
                               yaxis_vjust=NULL,
                               yaxis_size=NULL,
                               omit_ytext=F,
                               omit_yticks=F,
                               plot_title='',
                               input_matrix_text=NULL,
                               text_size=0.25,
                               highlight_regions=NULL,
                               x_breaks=waiver(),
                               y_breaks=waiver(),
                               plot = T){

  ### variables
  # input_matrix: matrix of heatmap values (required)
  # output_file: plot output file path
  # width: plot width in "units"
  # height: plot height in "units"
  # units: plot size units ("in", "cm", or "mm")
  # colour_clip: maximum absolute value of colour scale (F disables clipping)
  # cluster: hierarchically cluster ("none", "row", "column", "both")
  # xlab / ylab: axis labels
  # xtick_labels / ytick_labels: display labels for x/y ticks
  # colour_type: colour scale type ("continuous", "categorical")
  # colour_low / colour_high / colour_mid: colours for low/high/zero -- passed to scale_fill_gradient2
  # colour_limits: upper and lower value limits of colour scale -- passed to scale_fill_gradient2
  # mono: use monotype font (True, False)
  # na_colour: colour to use for NA values
  # xaxis_angle / xaxis_hjust / xaxis_vjust / xaxis_size: x tick label styling -- passed to element_text
  # omit_xtext / omit_xticks: omit x tick labels / ticks (True, False)
  # yaxis_angle / yaxis_hjust / yaxis_vjust / yaxis_size: y tick label styling -- passed to element_text
  # omit_ytext / omit_yticks: omit y tick labels / ticks (True, False)
  # plot_title: main title for plot
  # input_matrix_text: matrix of heatmap text
  # text_size: size of heatmap text
  # highlight_regions: list of highlighted regions of form: list("red" = list("region1" = c(_min_, _max_), ...), ...)
  # x_breaks / y_breaks: axis breaks (for displaying tick labels)
  # plot: whether to save the heatmap to output_file (True, False)
  #### > returns the ggplot object

  require(ggplot2)

  #Row/column orderings (default: rows reversed so the first row plots at the top)
  order_row<-rev(1:dim(input_matrix)[1])
  order_col<-1:dim(input_matrix)[2]
  if(cluster %in% c('both', 'row')){
    d <- dist(input_matrix, method = "euclidean") # distance matrix
    #"ward" is no longer accepted by hclust (renamed in R >= 3.1.0); "ward.D" reproduces the old behaviour
    order_row <- hclust(d, method="ward.D")$order
  }
  if(cluster %in% c('both', 'column')){
    d <- dist(t(input_matrix), method = "euclidean") # distance matrix
    order_col<- hclust(d, method="ward.D")$order
  }
  #melt() on a matrix comes from reshape2/data.table -- assumed attached by the calling pipeline (TODO confirm)
  plot_df<-melt(input_matrix[order_row,order_col])
  colnames(plot_df)<-c('y', 'x', 'value')
  plot_df$label<-""
  if(!is.null(input_matrix_text)){
    plot_df_text<-melt(input_matrix_text[order_row,order_col])
    colnames(plot_df_text)<-c('y', 'x', 'label')
    plot_df$label<-plot_df_text$label
  }
  #Clip extreme values so a few outliers do not dominate the colour scale
  if(colour_type=='continuous' & colour_clip){
    plot_df$value[plot_df$value>colour_clip]<-colour_clip
    plot_df$value[plot_df$value<(-colour_clip)]<-(-colour_clip)
  }
  p <- ggplot(plot_df, aes(x, y)) + geom_tile(aes(fill = value)) + geom_text(aes(label = label), size=text_size) +
    # theme_bw() +
    theme(axis.text.x=list(element_text(angle = xaxis_angle, hjust = xaxis_hjust, vjust = xaxis_vjust, size = xaxis_size, family=c('', 'mono')[as.numeric(mono)+1]), element_blank())[[as.numeric(omit_xtext)+1]],
          axis.text.y=list(element_text(angle = yaxis_angle, hjust = yaxis_hjust, vjust = yaxis_vjust, size = yaxis_size, family=c('', 'mono')[as.numeric(mono)+1]), element_blank())[[as.numeric(omit_ytext)+1]],
          axis.ticks.x=list(element_line(), element_blank())[[as.numeric(omit_xticks)+1]],
          axis.ticks.y=list(element_line(), element_blank())[[as.numeric(omit_yticks)+1]]) +
    xlab(xlab) + ylab(ylab) + labs(title = plot_title)
  #Optionally outline square regions on the diagonal
  if(!is.null(highlight_regions)){
    for(i in names(highlight_regions)){
      for(j in names(highlight_regions[[i]])){
        #NOTE(review): aes_now() is not a ggplot2 function -- presumably a project helper that forces
        #immediate evaluation of the aesthetics; confirm it is defined wherever this branch is used
        p <- p + geom_rect(data = NULL, mapping = aes_now(xmin=highlight_regions[[i]][[j]][1]-0.5, xmax=highlight_regions[[i]][[j]][2]+0.5, ymin=highlight_regions[[i]][[j]][1]-0.5, ymax=highlight_regions[[i]][[j]][2]+0.5), fill = NA, colour = i)
      }
    }
  }
  #xtick labels specified
  if(!is.null(xtick_labels)){
    if(is.numeric(plot_df$x)){
      #X is numeric
      p <- p + scale_x_continuous(breaks=x_breaks, labels=xtick_labels)
    }else{
      #X is discrete
      p <- p + scale_x_discrete(breaks=x_breaks, labels=xtick_labels)
    }
  }else{
    if(is.numeric(plot_df$x)){
      p <- p + scale_x_continuous(breaks=x_breaks)
    }
  }
  #ytick labels specified
  if(!is.null(ytick_labels)){
    if(is.numeric(plot_df$y)){
      #Y is numeric
      p <- p + scale_y_continuous(breaks=y_breaks, labels = ytick_labels)
    }else{
      #Y is discrete
      p <- p + scale_y_discrete(breaks=y_breaks, labels=ytick_labels)
    }
  }else{
    if(is.numeric(plot_df$y)){
      p <- p + scale_y_continuous(breaks=y_breaks)
    }
  }
  if(colour_type=='continuous'){
    p <- p + scale_fill_gradient2(low = colour_low, high = colour_high, mid = colour_mid, midpoint = 0, limits=colour_limits, na.value=na_colour)
  }
  if(colour_type=='categorical'){
    p <- p + scale_fill_brewer(palette='Set1')
  }
  if(plot){
    #Pass the plot explicitly: ggsave() otherwise saves last_plot(), which is not
    #necessarily p because p is never printed inside this function
    ggsave(file=output_file, plot=p, width=width, units=units, height=height)
  }
  return(p)
}

-------------------------------------------------------------------------------- /scripts/pairdistances_from_PDB.R: -------------------------------------------------------------------------------- 1 | 2 | ###################################################################################### 3 | ### extract position-pair distances and secondary structure elements from PDB file ### 4 |
###################################################################################### 5 | 6 | pairdistances_from_PDB = function(input_file, 7 | dataset_dir, 8 | given_chainids = "A", 9 | aa_seq, 10 | idx_pdb_start = 1, 11 | idx_DMS_start = 1, 12 | idx_DMS_end = NA, 13 | dist_cutoff = 8, 14 | debug_this=F, 15 | suffix = "") { 16 | 17 | ### variables 18 | # input_file: PDB file 19 | # dataset_dir: dataset directory, like "GB1/", 20 | #### > will deposit two .txt file (on for pair distances and one for secondary structure) to dataset_dir/processed_data/ and plot contactmaps to /dataset_dir/results/preprocessing/ 21 | # given_chainids: which chain in the PDB file to extract distances from; in case of protein-protein interactions needs to be a vector with both chains that trans-distances should be calculated over, e.g. c("A","C") 22 | # aa_seq: amino acid sequence of reference structure (in DMS data) 23 | # idx_pdb_start: first position in PDB to consider 24 | # idx_DMS_start: first position in reference sequence to consider 25 | # idx_DMS_end: first position in reference sequence to consider, if NA (default), it will compare the full reference sequence to the PDB file sequence 26 | # dist_cutoff: in Angstrom, used for plotting the contact map 27 | # debug_this: if TRUE, the function will stop after printing comparision between PDB seq and DMS seq to adjust position indicies if necessary 28 | # suffix: to be added to 29 | 30 | 31 | require(data.table) 32 | require(Rpdb) 33 | require(pdist) 34 | require(ggplot2) 35 | require(cowplot) 36 | 37 | #if idx_DMS_end is not given, compare across full sequence length 38 | if (is.na(idx_DMS_end)) { 39 | idx_DMS_end = c(0) 40 | for (c in seq_along(given_chainids)) { 41 | idx_DMS_end[c] = length(strsplit(aa_seq[[c]],"")[[1]]) 42 | } 43 | } 44 | 45 | #load PDB structure 46 | PDB_structure = read.pdb(input_file,MODEL=NULL) 47 | 48 | #for PDB files with NMR ensembles evaluate each model 49 | M = 
length(grep("MODEL",names(PDB_structure))) 50 | if (M==0) {M=1} 51 | for (m in 1:M) { 52 | 53 | #load model to structure data.table 54 | if (length(grep("MODEL",names(PDB_structure)))==0) { 55 | structure = data.table(eval(parse(text=paste0("PDB_structure$atoms")))) 56 | } else { 57 | structure = data.table(eval(parse(text=paste0("PDB_structure$MODEL.",m,"$atoms")))) 58 | } 59 | 60 | #restrict to ATOM entries 61 | structure = structure[recname=="ATOM" & chainid %in% given_chainids] 62 | 63 | #extract amino acid sequence from PDB file 64 | aaseq_PDB = unique(structure[,.(AA = convert_AAabr_one_three(as.character(unique(resname))),chainid),by=resid]) 65 | setkey(aaseq_PDB,resid) 66 | 67 | ## compare given DMS aaseq and aaseq from PDB file 68 | DMS_aa_seq = list() 69 | PDB_aa_seq = list() 70 | for (c in seq_along(given_chainids)) { 71 | DMS_aa_seq[[c]] = strsplit(aa_seq[[c]],"")[[1]][idx_DMS_start[c]:idx_DMS_end[c]] 72 | PDB_aa_seq[[c]] = aaseq_PDB[chainid == given_chainids[c]][.(idx_pdb_start[c]:(idx_pdb_start[c]+length(DMS_aa_seq[[c]])-1)),AA] 73 | PDB_aa_seq[[c]][is.na(PDB_aa_seq)] = "X" 74 | 75 | if (m==1) { 76 | print(paste0(input_file," chain ",given_chainids[c])) 77 | print(paste0('DMS seq [',idx_DMS_start[c],':',idx_DMS_end[c],'] ',paste0(DMS_aa_seq[[c]],collapse=""))) 78 | print(paste0(sum(DMS_aa_seq[[c]]==PDB_aa_seq[[c]]),'/',length(DMS_aa_seq[[c]]),' ',paste0(as.numeric(DMS_aa_seq[[c]] == PDB_aa_seq[[c]]),collapse = ""))) 79 | print(paste0('PDB seq [',idx_pdb_start[c],':',(idx_pdb_start[c]+length(DMS_aa_seq[[c]])-1),'] ',paste0(PDB_aa_seq[[c]],collapse=""))) 80 | } 81 | } 82 | 83 | #if debug_this ==T function will stop here to make adjustments to position indicies 84 | if (debug_this) { 85 | browser() 86 | } 87 | 88 | #initialize indicies for distance calcualtions 89 | if (length(given_chainids) == 1) { 90 | two_chainids = rep(given_chainids,2) 91 | two_starts_DMS = rep(idx_DMS_start,2) 92 | two_starts = rep(idx_pdb_start,2) 93 | two_ends = 
rep(idx_pdb_start + (idx_DMS_end-idx_DMS_start),2) 94 | } else { 95 | two_chainids = given_chainids 96 | two_starts_DMS = idx_DMS_start 97 | two_starts = idx_pdb_start 98 | two_ends = idx_pdb_start + (idx_DMS_end-idx_DMS_start) 99 | } 100 | 101 | #initialize distance table 102 | if (m==1) { 103 | distance = data.table(Pos1=rep(two_starts[1]:two_ends[1],two_ends[2]-two_starts[2]+1), 104 | Pos2=rep(two_starts[2]:two_ends[2],each=two_ends[1]-two_starts[1]+1)) 105 | setkey(distance,Pos1,Pos2) 106 | distance[,WT_AA1 := convert_AAabr_one_three(as.character(unique(structure[chainid == two_chainids[1] & resid == Pos1,resname]))),Pos1] 107 | distance[,WT_AA2 := convert_AAabr_one_three(as.character(unique(structure[chainid == two_chainids[2] & resid == Pos2,resname]))),Pos2] 108 | distance[,chainids := paste0(given_chainids)] 109 | } 110 | 111 | #calculate minimal side-chain heavy atom distance 112 | structure_HA = structure[union(intersect(grep(pattern="^[COSN][B-Z]$",elename), 113 | which(resname != "GLY")),intersect(grep(pattern="^CA$",elename),which(resname == "GLY")))] 114 | distance[,paste0("scHAmin",m):=min(as.matrix(pdist(as.matrix(structure_HA[chainid == two_chainids[1] & resid == Pos1,.(x1,x2,x3)]), 115 | as.matrix(structure_HA[chainid == two_chainids[2] & resid == Pos2,.(x1,x2,x3)])))), 116 | by=.(Pos1,Pos2)] 117 | 118 | #calculate minimal all heavy atom distance 119 | structure_HA = structure[!grepl(pattern="H",elename)] 120 | distance[,paste0("HAmin",m):=min(as.matrix(pdist(as.matrix(structure_HA[chainid == two_chainids[1] & resid == Pos1,.(x1,x2,x3)]), 121 | as.matrix(structure_HA[chainid == two_chainids[2] & resid == Pos2,.(x1,x2,x3)])))), 122 | by=.(Pos1,Pos2)] 123 | 124 | #calculate CB distances (use CA in case of Glycine) 125 | structure_CB = structure[elename == "CB" | (elename == "CA" & resname == "GLY"),.(chainid,resid,x1,x2,x3)] 126 | distance[,paste0("CB",m):=min(as.matrix(pdist(as.matrix(structure_CB[chainid == two_chainids[1] & resid == 
Pos1,.(x1,x2,x3)]), 127 | as.matrix(structure_CB[chainid == two_chainids[2] & resid == Pos2,.(x1,x2,x3)])))), 128 | by=.(Pos1,Pos2)] 129 | } 130 | 131 | ## average over distances 132 | distance[,scHAmin := rowMeans(.SD),by=.(Pos1,Pos2),.SDcols = grep("scHAmin[0-9]",names(distance))] 133 | distance[,HAmin := rowMeans(.SD),by=.(Pos1,Pos2),.SDcols = grep("HAmin[0-9]",names(distance))] 134 | distance[,CB := rowMeans(.SD),by=.(Pos1,Pos2),.SDcols = grep("CB[0-9]",names(distance))] 135 | 136 | 137 | #if there's multiple structural models, average over all and also calculate uncertainity 138 | if (length(grep("MODEL",names(PDB_structure)))==0) { 139 | distance[,scHAmin_sd := 0] 140 | distance[,HAmin_sd := 0] 141 | distance[,CB_sd := 0] 142 | } else { 143 | distance[,scHAmin_sd := stats::sd(.SD),by=.(Pos1,Pos2),.SDcols = grep("scHAmin[0-9]",names(distance))] 144 | distance[,HAmin_sd := stats::sd(.SD),by=.(Pos1,Pos2),.SDcols = grep("HAmin[0-9]",names(distance))] 145 | distance[,CB_sd := stats::sd(.SD),by=.(Pos1,Pos2),.SDcols = grep("CB[0-9]",names(distance))] 146 | } 147 | 148 | ##adjust positions to positions in alignment DMS/PDB sites 149 | contactmap = distance[between(Pos1,two_starts[1],two_ends[1]) & 150 | between(Pos2,two_starts[1],two_ends[2]), 151 | .(Pos1 = Pos1 - (two_starts[1] - two_starts_DMS[1]), 152 | Pos2 = Pos2 - (two_starts[2] - two_starts_DMS[2]), 153 | WT_AA1,WT_AA2,chainids, 154 | HAmin,scHAmin,CB,HAmin_sd,scHAmin_sd,CB_sd)] 155 | 156 | #save pairwise distance table 157 | pdb_filename = strsplit(strsplit(input_file,"/")[[1]][length(strsplit(input_file,"/")[[1]])],"\\.")[[1]][1] 158 | write.table(file = paste0(dataset_dir,'processed_data/PDB_contactmap_',pdb_filename,'_',paste0(given_chainids,collapse=""),suffix,".txt",collapse = ""), 159 | x = contactmap,quote = F,row.names = F,col.names = T) 160 | 161 | 162 | ################################################# 163 | ### extract secondary structure from PDB file ### 164 | 
################################################# 165 | if (length(given_chainids) == 1) { #if looking at a single chain 166 | secondary_structure = data.table(Pos = idx_DMS_start:idx_DMS_end,ss = "C") 167 | 168 | output = scan(file=input_file,what="character",sep="\n") 169 | 170 | helix = output[grep(output,pattern="^HELIX")] 171 | if (length(helix)>0) { 172 | helix1 = sapply(X=1:length(helix),FUN = function(X){strsplit(helix[X],split="\\s+")[[1]]}) 173 | helix2 = data.table(t(helix1[4:9,])) 174 | names(helix2) = c("aa1","chainid1","pos1","aa2","chainid2","pos2") 175 | helix3 = helix2[chainid1 == given_chainids] 176 | if (nrow(helix3)>0) { 177 | for (i in 1:nrow(helix3)) { 178 | secondary_structure[between(Pos, 179 | helix3[i,as.numeric(pos1) - (two_starts[1] - two_starts_DMS[1])], 180 | helix3[i,as.numeric(pos2) - (two_starts[1] - two_starts_DMS[1])]), 181 | ss := "H"] 182 | } 183 | } 184 | } 185 | 186 | strand = output[grep(output,pattern="^SHEET")] 187 | if (length(strand)>0) { 188 | strand1 = sapply(X=1:length(strand),FUN = function(X){strsplit(strand[X],split="\\s+")[[1]]}) 189 | if (is.list(strand1)) { 190 | strand2 = data.table(t(strand1[[1]][5:10])) 191 | for (l in 2:length(strand1)) { 192 | strand2 = rbind(strand2,data.table(t(strand1[[l]][5:10]))) 193 | } 194 | } else { 195 | strand2 = data.table(t(strand1[5:10,])) 196 | } 197 | 198 | names(strand2) = c("aa1","chainid1","pos1","aa2","chainid2","pos2") 199 | strand3 = strand2[chainid1 == given_chainids] 200 | if (nrow(strand3) > 0) { 201 | for (i in 1:nrow(strand3)) { 202 | secondary_structure[between(Pos, 203 | strand3[i,as.numeric(pos1) - (two_starts[1] - two_starts_DMS[1])], 204 | strand3[i,as.numeric(pos2) - (two_starts[1] - two_starts_DMS[1])]), 205 | ss := "E"] 206 | } 207 | } 208 | } 209 | names(secondary_structure)[2] = "PDB" 210 | 211 | write.table(paste0(dataset_dir,"processed_data/PDB_secondary_structure_",pdb_filename,"_",given_chainids,suffix,".txt"), 212 | x = secondary_structure,quote = 
F,row.names = F,col.names = T) 213 | 214 | #for plotting 215 | secondary_structure[,rleidx := rleid(PDB)] 216 | } 217 | 218 | 219 | 220 | theme_set(theme_classic(base_size=9)) 221 | #plot contact map 222 | P1=ggplot() + 223 | geom_raster(data=contactmap,aes(x=Pos1,y=Pos2,fill=HAminF_fit_upper)] 19 | 20 | #number of points to plot, 10k is sufficient, otherwise PDF becomes very large 21 | r = sample(x = nrow(xyz),size = min(c(10000,nrow(xyz)))) 22 | x=xyz$x 23 | y=xyz$y 24 | z=xyz$z 25 | 26 | #axis limits, adjust 27 | xlim_plot = ylim_plot = c(xyrange[1] - 0.1*diff(xyrange),xyrange[2] + 0.1*diff(xyrange)) 28 | zlim_plot = quantile(xyz$z,probs = c(0.005,0.995),na.rm = T) 29 | #plot angles 30 | theta_plot = c(15,55) 31 | phi_plot = 15 32 | for (idx in 1:length(theta_plot)) { 33 | #### upper and lower surface with points 34 | pdf(paste0(dataset_dir, "results/epistasis/",prefix,"epistasis_surface",idx,".pdf"), useDingbats=FALSE) 35 | a=persp(xy,xy,matrix(Fd_pred2+Fd_pred_q05,nrow=length(xy),ncol=length(xy)), 36 | xlab="single mutant fitness 1",ylab="single mutant fitness 2",zlab="double mutant fitness", 37 | xlim = xlim_plot,ylim = ylim_plot,zlim = zlim_plot, 38 | theta = theta_plot[idx], phi = phi_plot, 39 | col=NA, nticks=5,ticktype="detailed",expand=0.75, box=TRUE) 40 | b=trans3d(xyz[intersect(r,which(below_q05==T))]$x, 41 | xyz[intersect(r,which(below_q05==T))]$y, 42 | xyz[intersect(r,which(below_q05==T))]$z,a) 43 | points(b$x,b$y,col=rgb(1,0.1,0.1),pch=16,cex=0.75) 44 | par(new=TRUE) 45 | a=persp(xy,xy,matrix(Fd_pred2+Fd_pred_q05,nrow=length(xy),ncol=length(xy)), 46 | xlab="single mutant fitness 1",ylab="single mutant fitness 2",zlab="double mutant fitness", 47 | xlim = xlim_plot,ylim = ylim_plot,zlim = zlim_plot, 48 | theta = theta_plot[idx], phi = phi_plot, 49 | col=NA, nticks=5,ticktype="detailed",expand=0.75, box=TRUE) 50 | par(new=TRUE) 51 | b=trans3d(xyz[intersect(r,which(below_q05==F & above_q95==F))]$x, 52 | xyz[intersect(r,which(below_q05==F & 
above_q95==F))]$y, 53 | xyz[intersect(r,which(below_q05==F & above_q95==F))]$z,a) 54 | points(b$x,b$y,col=rgb(1,0.5,0.5),pch=16,cex=0.75) 55 | par(new=TRUE) 56 | a2=persp(xy,xy,matrix(Fd_pred2+Fd_pred_q95,nrow=length(xy),ncol=length(xy)), 57 | xlab="single mutant fitness 1",ylab="single mutant fitness 2",zlab="double mutant fitness", 58 | xlim = xlim_plot,ylim = ylim_plot,zlim = zlim_plot, 59 | theta = theta_plot[idx], phi = phi_plot, 60 | col=NA, nticks=5,ticktype="detailed",expand=0.75, box=TRUE) 61 | par(new=TRUE) 62 | b=trans3d(xyz[intersect(r,which(above_q95==T))]$x, 63 | xyz[intersect(r,which(above_q95==T))]$y, 64 | xyz[intersect(r,which(above_q95==T))]$z,a) 65 | points(b$x,b$y,col=rgb(0.5,0.9,0.1),pch=16,cex=0.75) 66 | dev.off() 67 | } 68 | } -------------------------------------------------------------------------------- /scripts/predict_secondary_structure_elements.R: -------------------------------------------------------------------------------- 1 | ###################################################################### 2 | ##### predict alpha helices and beta strands with kernel smoothing ### 3 | ###################################################################### 4 | predict_secondary_structure_elements = function(PWI, 5 | dataset_dir, 6 | prefix = "", 7 | known_SS = c(), 8 | scale_long = 1/4^2, 9 | seed_size=3, 10 | p_detection_threshold = 0.05, 11 | Nsamples = 10000, 12 | debug_this = F, 13 | return_list = F) { 14 | 15 | ### variables 16 | # PWI: pairwise interaction score data.table; except for Pos1 and Pos2 this should only contain the scores that SS elements should be predicted from 17 | # dataset_dir: dataset directory, like "GB1/", it will put results dataset_dir/results/PWI/ 18 | # prefix: to be added to results files (in case of running diff. 
  # versions of data from same dataset etc)
  # known_SS (optional): filepointer to a file with known secondary structure elements (from a PDB file) [to plot as comparison], a table with a position and a SS classifier column
  # scale_long: length scale for gaussian smoothing kernel
  # seed_size: number of positions the SSpropensities are initially aggregated over; must be an odd value !!!
  # p_detection_threshold: p-value threshold for calling a SS element
  # Nsamples: number of randomized controls to compare SS propensity against
  # debug_this: if TRUE, function will stop at certain points in scripts in order to understand bugs
  # return_list: if TRUE, ggplot2 and ss_data objects returned in addition to predicted secondary_structure (in named list)


  require(data.table)
  require(ggplot2)
  require(metap)

  # which scores should be used for prediction?
  # every column except the position/identity/count bookkeeping columns is treated as a score track
  eval_cols = setdiff(names(PWI),c("Pos1","Pos2","WT_AA1","WT_AA2","NposE","NnegE"))

  # Initialise list of returned data (only populated/returned when return_list == TRUE)
  saved_objects <- list(
    "plot_objects" = list(),
    "secondary_structure_score" = list())

  # one full prediction pass per score column
  for (eval_cols_idx in seq_along(eval_cols)) {
    print(eval_cols[eval_cols_idx])
    # upper-triangle pairs only; the current score column is renamed to the generic "input"
    ss_data = copy(PWI[Pos1<=Pos2,.(Pos1,Pos2,input = .SD),,.SDcols = eval_cols[eval_cols_idx]])
    setkey(ss_data,Pos1,Pos2)
    # position range for prediction
    pos_range = c(min(c(ss_data$Pos1,ss_data$Pos2)),max(c(ss_data$Pos1,ss_data$Pos2)))

    # compute diagonal/perpendicular coordinates (rotated 45deg coordinate system of the contact map)
    ss_data[,pos_diag := (Pos1+Pos2)/2]
    ss_data[,pos_perp := abs(Pos1-Pos2)/2] # this is half the actual distance between positions in a pair (for consistency with pos_diag)
    if (debug_this) {browser()}

    ### secondary structure propensities
    # fixed seed so the randomized null distributions are reproducible
    set.seed(1603)
    for (i in pos_range[1]:pos_range[2]) {
      j = i

      # hamming distances from center position (i,j) on the diagonal
      ss_data[,ham := abs(Pos1-i) + abs(Pos2-j)]
      ss_data[,ham_perp := abs(Pos1-i - (Pos2-j))]
      ss_data[,ham_diag := abs(Pos1-i + Pos2-j)]

      ####################################
      ####### detect alpha helices #######
      ####################################

      # compute kernel weights: cosine with 3.6-residue period (helix periodicity)
      # damped by a gaussian along the diagonal
      if (i > pos_range[1]) {ss_data[,alpha_weight := NULL]}
      ss_data[ham_perp <= 5 & ham_diag < 12,alpha_weight := (cos(ham_perp*2*pi/3.6)+1/3) * exp(-scale_long*ham_diag^2)]
      ss_data[Pos1==Pos2,alpha_weight := NA]
      ss_data[is.na(input),alpha_weight := NA]

      # calculate kernel smoothed value for true data
      ss_data[Pos1==i & Pos2==i,alpha_score := ss_data[ham_perp <= 5 & ham_diag < 12,sum(input*alpha_weight,na.rm=T)]]

      # calculate kernel smoothed value for random distributions
      # NOTE(review): the null uses ham_diag <= 12 while the true score uses ham_diag < 12 above --
      # boundary cells carry NA weight so sums should agree, but confirm this asymmetry is intended
      B = copy(ss_data[ham_perp <= 5 & ham_diag <= 12,.(ham,alpha_weight,input)])
      setkey(B,ham)
      # draw Nsamples random score vectors (with replacement) from all off-diagonal scores
      sample_matrix = matrix(sample(ss_data[Pos1!=Pos2,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples)
      alpha_sampled = colSums(sample_matrix * matrix(rep(t(B[,alpha_weight]),Nsamples),nrow=nrow(B),ncol=Nsamples),na.rm=T)

      # empirical p value for true value (one-sided; resolution limited to 1/Nsamples)
      ss_data[Pos1==i & Pos2==i,alpha_p := sum(alpha_sampled >= alpha_score)/Nsamples]

      ####################################
      ####### detect beta strands  #######
      ####################################

      # compute kernel weights: alternating +2/3,-1/3 pattern (strand periodicity of 2)
      # damped by a gaussian along the diagonal
      if (i > pos_range[1]) {ss_data[,beta_weight := NULL]}
      ss_data[ham_perp <= 2,beta_weight := ((ham_perp+1) %% 2 - 1/3)*exp(-scale_long*ham_diag^2)]
      ss_data[ham_perp == 0,beta_weight := beta_weight * 2]
      ss_data[Pos1==Pos2,beta_weight := NA]
      ss_data[is.na(input),beta_weight := NA]

      # calculate kernel smoothed value for true data
      ss_data[Pos1==i & Pos2==i,beta_score := ss_data[ham_perp <= 2 & ham_diag <= 12,sum(input*beta_weight,na.rm=T)]]

      # calculate kernel smoothed value for random distributions
      B = copy(ss_data[ham_perp <= 2 & ham_diag <= 12,.(ham,beta_weight,input)])
      setkey(B,ham)
      sample_matrix = matrix(sample(ss_data[Pos1!=Pos2,c(input)],(nrow(B))*Nsamples,replace = T),nrow = nrow(B),ncol=Nsamples)
      beta_sampled = colSums(sample_matrix * matrix(rep(t(B[,beta_weight]),Nsamples),nrow=nrow(B),ncol=Nsamples),na.rm=T)

      # empirical p value for true value
      ss_data[Pos1==i & Pos2==i,beta_p := sum(beta_sampled >= beta_score)/Nsamples]

    }

    # avoid -Inf if logging p values by setting those positions smaller than all random samples
    # to smallest non-zero pvalue (= the resolution of the permutation test)
    ss_data[alpha_p == 0 ,alpha_p := 1/Nsamples]
    ss_data[beta_p == 0 ,beta_p := 1/Nsamples]

    ###########################################################
    ### call secondary structure elements from propensities ###
    ###########################################################

    ### get alpha helices and beta strand p values (per-position table: diagonal entries only)
    setkey(ss_data,Pos1)
    ss_strands = ss_data[Pos1==Pos2,.(Pos1,alpha_p,beta_p)]

    # compute sumlog (Fisher's method, combined p-values) for seeds of alpha helix
    # each seed combines the p values of seed_size consecutive positions centered on mid_idx
    if (seed_size > 1) {
      for (mid_idx in ss_strands[!is.na(alpha_p),Pos1]) {
        ss_strands[Pos1==mid_idx,alpha_p_seed := ss_strands[between(Pos1,mid_idx-(seed_size-1)/2,mid_idx+(seed_size-1)/2),
                                                            ifelse(sum(!is.na(alpha_p)) > 1,sumlog(alpha_p[!is.na(alpha_p)])$p,alpha_p[!is.na(alpha_p)])]]
      }
    } else {
      ss_strands[,alpha_p_seed := alpha_p]
    }

    # compute sumlog for seeds of beta strands
    if (seed_size > 1) {
      for (mid_idx in ss_strands[!is.na(beta_p),Pos1]) {
        ss_strands[Pos1==mid_idx,beta_p_seed := ss_strands[between(Pos1,mid_idx-(seed_size-1)/2,mid_idx+(seed_size-1)/2),
                                                           ifelse(sum(!is.na(beta_p)) > 1,sumlog(beta_p[!is.na(beta_p)])$p,beta_p[!is.na(beta_p)])]]
      }
    } else {
      ss_strands[,beta_p_seed := beta_p]
    }

    # set p-values NA if other structure is more probable
    # *_p0 columns keep the pre-masking values for plotting (dashed lines below)
    ss_strands[,beta_p0 := beta_p]
    ss_strands[,alpha_p0 := alpha_p]
    ss_strands[,beta_p_seed0 := beta_p_seed]
    ss_strands[,alpha_p_seed0 := alpha_p_seed]
    # terminal positions have no centered seed; decide them from the nearest full seed
    if (seed_size > 1) {
      if (ss_strands[1+(seed_size-1)/2]$alpha_p_seed < ss_strands[1+(seed_size-1)/2]$beta_p_seed) {
        ss_strands[1,':=' (beta_p = NA, beta_p_seed = NA)] } else {
          ss_strands[1,':=' (alpha_p = NA, alpha_p_seed = NA)] }
      if (ss_strands[.N-(seed_size-1)/2]$alpha_p_seed < ss_strands[.N-(seed_size-1)/2]$beta_p_seed) {
        ss_strands[.N,':=' (beta_p = NA, beta_p_seed = NA)] } else {
          ss_strands[.N,':=' (alpha_p = NA, alpha_p_seed = NA)] }
    }
    ss_strands[alpha_p_seed < beta_p_seed,':=' (beta_p_seed = NA, beta_p = NA)]
    ss_strands[beta_p_seed < alpha_p_seed,':=' (alpha_p_seed = NA, alpha_p = NA)]

    # delete stretches smaller 5 for alpha helices, set NA
    # (minimum helix length; rleid groups consecutive non-NA runs)
    ss_strands[ss_strands[,.(stretch = rleid(!is.na(alpha_p_seed)),not_na = !is.na(alpha_p_seed))][,.(short = .N<5 & not_na == T),stretch][,short],
               ':=' (alpha_p =NA,alpha_p_seed = NA)]
    # delete stretches smaller 3 for beta strands, set NA
    ss_strands[ss_strands[,.(stretch = rleid(!is.na(beta_p_seed)),not_na = !is.na(beta_p_seed))][,.(short = .N<3 & not_na == T),stretch][,short],
               ':=' (beta_p = NA,beta_p_seed = NA)]



    ## alpha helices:
    # identify most significant stretches from seeds and expand them outwards
    helper = identify_expand_seeds(ss_strands[,.(pos=Pos1,p_ind=alpha_p,p_seed=alpha_p_seed)],seed_size,p_detection_threshold)
    # merge
    ss_strands = merge(ss_strands,helper[,.(Pos1=pos,alpha_strand=strand,alpha_strand_p=p_strand)],by="Pos1")

    ## beta strands:
    # identify most significant stretches from seeds
    helper = identify_expand_seeds(ss_strands[,.(pos=Pos1,p_ind=beta_p,p_seed = beta_p_seed)],seed_size,p_detection_threshold)
    # merge
    ss_strands = merge(ss_strands,helper[,.(Pos1=pos,beta_strand=strand,beta_strand_p=p_strand)],by="Pos1")
    # Save
    if(return_list){
      saved_objects[["secondary_structure_score"]][[eval_cols[eval_cols_idx]]] <- copy(ss_strands)
    }

    ### record predictions across input data
    # per-position classification: H = helix, E = strand, C = coil (neither)
    if (eval_cols_idx == 1) {
      if (debug_this) {browser()}
      secondary_structure = ss_strands[,.(Pos = Pos1,ss = ifelse(is.na(alpha_strand) & is.na(beta_strand),"C",ifelse(!is.na(alpha_strand),"H","E")))]
      names(secondary_structure)[2] = eval_cols[1]
    } else {
      secondary_structure = merge(secondary_structure,
                                  ss_strands[,.(Pos = Pos1,ss = ifelse(is.na(alpha_strand) & is.na(beta_strand),"C",ifelse(!is.na(alpha_strand),"H","E")))],
                                  by = "Pos")
      names(secondary_structure)[eval_cols_idx+1] = eval_cols[eval_cols_idx]
    }


    #############
    ## compare to known_SS
    #############
    if (!is.null(known_SS)) {
      known_ss_DT = fread(known_SS)
      names(known_ss_DT)[2] = "SS"
      known_ss_DT[,rleidx := rleid(SS)]
      # append one dummy row per SS class so all three legend levels exist in the plot
      known_ss_DT = rbind(known_ss_DT,data.table(Pos = nrow(known_ss_DT)+1, SS = "C", rleidx = max(known_ss_DT$rleidx)+1))
      known_ss_DT = rbind(known_ss_DT,data.table(Pos = nrow(known_ss_DT)+1, SS = "E", rleidx = max(known_ss_DT$rleidx)+1))
      known_ss_DT = rbind(known_ss_DT,data.table(Pos = nrow(known_ss_DT)+1, SS = "H", rleidx = max(known_ss_DT$rleidx)+1))
    }

    if (debug_this) {browser()}
    ##################################################
    ##### plot secondary structure element predictions
    require(cowplot)
    theme_set(theme_classic())
    # solid lines: masked seed p values; dashed lines: unmasked (pre-competition) seed p values
    P1 = ggplot(data=ss_strands) +
      geom_line(aes(Pos1,y=alpha_p_seed0),color="darkgreen",linetype=2) +
      geom_line(aes(Pos1,y=alpha_p_seed),color="darkgreen") +

      geom_line(aes(Pos1,y=beta_p_seed0),color="red",linetype=2) +
      geom_line(aes(Pos1,y=beta_p_seed),color="red") +

      geom_hline(yintercept=0.05,linetype=3) +
      geom_rect(data=unique(ss_strands[!is.na(alpha_strand),.(xmin=min(Pos1)-0.5,xmax=max(Pos1)+0.5,ymin=max(alpha_strand_p,10^-4),ymax=1),alpha_strand]),
                inherit.aes = F,
                aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax,group=alpha_strand),fill="green",alpha=0.2) +
      geom_rect(data=unique(ss_strands[!is.na(beta_strand),.(xmin=min(Pos1)-0.5,xmax=max(Pos1)+0.5,ymin=max(beta_strand_p,10^-4),ymax=1),beta_strand]),
                inherit.aes = F,
                aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax,group=beta_strand),fill="orange",alpha=0.2) +
      scale_x_continuous(breaks=seq(5,pos_range[2],5),expand = c(0,0)) +
      coord_cartesian(xlim = c(pos_range[1]-0.5,pos_range[2]+0.5)) +
      # scale_y_log10(breaks = c(10^-seq(-10,0))) +
      scale_y_log10(breaks = c(10^-seq(0,ss_strands[,-log10(min(c(alpha_p_seed,beta_p_seed,10^-4),na.rm=T))])),expand = c(0.01,0)) +
      labs(y="p value",title = eval_cols[eval_cols_idx])

    if (!is.null(known_SS)) { # add known secondary structure as a colored track at y = 1.5
      P1 = P1 +
        geom_segment(data = known_ss_DT[,.(start = min(Pos)-0.5,end = max(Pos)+0.5,ss=unique(SS)),rleidx],
                     aes(x=start,y=1.5,xend=end,yend=1.5,color=ss,size=ss),show.legend = F) +
        scale_size_manual(breaks = c("C","E","H"),values = c(0.5,1.5,1.5)) +
        scale_color_manual(breaks = c("C","E","H"),values = c("black","red","darkgreen"))
    }

    ##################################################
    ###### plot smoothed data around diagonal
    setkey(ss_data,Pos1,Pos2)

    # gaussian smoothing along the diagonal direction only (ham_perp == 0)
    for (i in pos_range[1]:pos_range[2]) {
      for (j in i:pos_range[2]) {
        ss_data[,ham := abs(Pos1-i) + abs(Pos2-j)]
        ss_data[,ham_perp := abs(Pos1-i - (Pos2-j))]
        ss_data[,ham_diag := abs(Pos1-i + Pos2-j)]

        ss_data[,weight := as.double(NA)]
        ss_data[ham_perp==0,weight := exp(-scale_long*ham_diag^2)]
        ss_data[Pos1==Pos2,weight := NA]
        ss_data[is.na(input),weight := NA]
        # calculate true value
        ss_data[Pos1==i & Pos2==j,score := ss_data[ham_perp==0,sum(input*weight,na.rm=T)/sum(weight,na.rm=T)]]
      }
    }
    # normalize scores per anti-diagonal band by subtracting the local band average
    avg = 1.5
    for (i in ss_data[,unique(ham_diag)]) {
      ss_data[ham_diag == i & ham_perp < 8,score_norm := score-ss_data[between(ham_diag,i-avg,i+avg) & ham_perp < 10,mean(score,na.rm=T)]]
      ss_data[ham_diag == i & ham_perp < 8,input_norm := input-ss_data[between(ham_diag,i-avg,i+avg) & ham_perp < 10 & abs(input) != Inf,mean(input,na.rm=T)]]
    }
    # limit data range for better comparability (clip symmetric outliers)
    # NOTE(review): the next statement was garbled in extraction; reconstructed as a symmetric
    # clipping threshold from the normalized values -- TODO confirm against the original source
    cutoff = quantile(abs(c(ss_data[Pos1<Pos2,input_norm],ss_data[Pos1<Pos2,score_norm])),probs = 0.95,na.rm=T)
    ss_data[input_norm > cutoff, input_norm := cutoff]
    ss_data[input_norm < -cutoff, input_norm := -cutoff]
    ss_data[score_norm > cutoff, score_norm := cutoff]
    ss_data[score_norm < -cutoff, score_norm := -cutoff]

    # raw (normalized) data in the upper triangle, smoothed scores mirrored into the lower triangle
    P2 = ggplot(ss_data[ham_perp < 8 & ham_perp != 0]) +
      geom_raster(aes(Pos1,Pos2,fill=input_norm)) +
      geom_raster(aes(Pos2,Pos1,fill=score_norm)) +
      scale_fill_gradient2(midpoint=0,low="tomato3",high="steelblue3",na.value = "white") +
      scale_y_continuous(breaks=seq(1,8,1),expand = c(0,0)) +
      scale_x_continuous(breaks=seq(5,pos_range[2],5),expand = c(0,0)) +
      coord_cartesian(xlim = c(pos_range[1]-0.5,pos_range[2]+0.5),ylim = c(pos_range[1]-0.5,pos_range[2]+0.5)) +
      labs(x = "position",y = "diagonal position",fill = "score") +
      geom_segment(data=unique(ss_strands[!is.na(alpha_strand),.(x=min(Pos1)-0.5,xend=max(Pos1)+0.5,y=min(Pos1)-0.5,yend=max(Pos1)+0.5),alpha_strand]),
                   inherit.aes = F,
                   aes(x=x,xend=xend,y=y,yend=yend,group=alpha_strand),color="darkgreen",size=1.5) +
      geom_segment(data=unique(ss_strands[!is.na(beta_strand),.(x=min(Pos1)-0.5,xend=max(Pos1)+0.5,y=min(Pos1)-0.5,yend=max(Pos1)+0.5),beta_strand]),
                   inherit.aes = F,
                   aes(x=x,xend=xend,y=y,yend=yend,group=beta_strand),color="red",size=1.5)

    plot_grid(P1,P2,nrow=1)

    ggsave(file=paste0(dataset_dir,"results/secondary_structure/",prefix,eval_cols[eval_cols_idx],"_SSelements.pdf"),width=8.5,height=4)

    if(return_list){
      saved_objects[["plot_objects"]][[eval_cols[eval_cols_idx]]] <- list("P1" = P1, "P2" = P2)
    }
  }

  # NOTE(review): the output path is passed positionally; since x is matched by name it lands on
  # write.table()'s `file` parameter -- works, but unconventional argument order
  write.table(paste0(dataset_dir,"processed_data/",prefix,"secondary_structure_prediction.txt"),
              x = secondary_structure,quote = F,row.names = F,col.names = T)

  # Return objects
  if(return_list){
    saved_objects[["secondary_structure"]] <- secondary_structure
    return(saved_objects)
  }else{
    return(secondary_structure)
  }
}
-------------------------------------------------------------------------------- /scripts/quantile_fitness_surface_adaptive.R: --------------------------------------------------------------------------------
### calculate quantile fitness surfaces
# subfunction for the call_epistasis class of functions
quantile_fitness_surface_adaptive = function(DT,Nq,Nv,span,Q) {

  # calculate quantile surface approximation on regular grid (defined by quantiles) given a sampled number of variants
  # DT: data.table with columns fitness1, fitness2, fitness_norm
  # Nq: number of grid points per axis (grid has Nq^2 points)
  # Nv: maximum number of variants used for the quantile estimation (downsampled if more)
  # span: fraction of nearest variants around each grid point used by surface_at_gridpoint
  # Q: lower quantile level; the surface reports the Q, 0.5 and 1-Q quantiles

  # define grid vector spanning the full fitness range of both axes
  # (+0.015 so the top variant falls strictly inside the grid)
  q = seq(min(c(quantile(DT[,fitness1],probs=0,na.rm=T),quantile(DT[,fitness2],probs=0,na.rm=T))),
          max(c(quantile(DT[,fitness1],probs = 1,na.rm=T),quantile(DT[,fitness2],probs = 1,na.rm=T)))+0.015,
          length.out=Nq)

  # Fq: data.table for surface values at each gridpoint
  Fq = data.table(fitness1=rep(q,length(q)),fitness2=rep(q,each=length(q)))

  # initialize different surface columns (median, upper, lower)
  Fq[,F_median := as.numeric(NA)]
  Fq[,F_lower := as.numeric(NA)]
  Fq[,F_upper := as.numeric(NA)]

  # downsample variants to value given by Nv (or use all variants if number of variants is smaller Nv)
  subDT = DT[sample(nrow(DT),min(c(Nv,nrow(DT)))),.(fitness1,fitness2,fitness_norm)]

  # run this in parallel for all grid points
  require(parallel)
  # Use the detectCores() function to find the number of cores in system
  # NOTE(review): on a single-core machine this gives 0 and makeCluster(0) fails -- TODO guard with max(1, ...)
  no_cores <- detectCores()-1
  clust <- makeCluster(no_cores)
  # make variables available to each core's workspace
  # (surface_at_gridpoint reads subDT, span, Fq, Q as globals on the workers)
  clusterExport(clust, list("subDT","span","Fq","Q"),envir = environment())
  helper = parSapply(clust,X = 1:Nq^2, surface_at_gridpoint)
  stopCluster(clust)
  # transfer results from helper (3 x Nq^2 matrix: median, lower, upper per grid point) to Fq
  Fq[,F_median := helper[1,]]
  Fq[,F_lower := helper[2,]]
  Fq[,F_upper := helper[3,]]

  # > loess fit regular grid surface
  # NOTE(review): loess span is hard-coded to 0.2 here, independent of the `span` argument -- confirm intended
  List = list()
  List$F_median_fit = loess(F_median ~ fitness1 + fitness2,data = Fq,span=0.2)
  List$F_lower_fit = loess(F_lower ~ fitness1 + fitness2,data = Fq,span=0.2)
  List$F_upper_fit = loess(F_upper ~ fitness1 + fitness2,data = Fq,span=0.2)

  List$Fq = Fq
  return(List)
}
-------------------------------------------------------------------------------- /scripts/surface_at_gridpoint.R: --------------------------------------------------------------------------------
### calculate fitness quantiles for nearest neighbours of gridpoint
# subfunction for call_epistasis class of functions
# i: linear index into the grid table Fq
# relies on subDT, Fq, span, Q being present in the calling/worker environment
# (exported via clusterExport in quantile_fitness_surface_adaptive)
# returns c(median, lower-Q, upper-(1-Q)) fitness_norm quantiles of the `span`
# fraction of variants closest (euclidean distance in fitness space) to grid point i
surface_at_gridpoint = function(i) {
  require(data.table)
  A = unlist(subDT[,.(D=sqrt((fitness1-Fq[i,fitness1])^2 +(fitness2-Fq[i,fitness2])^2),fitness_norm)][D < quantile(D,probs=span,na.rm=T),
                   .(quantile(fitness_norm,p=c(0.5,Q,1-Q),na.rm=T))])
  return(A)
}
-------------------------------------------------------------------------------- /scripts/switch_double_DT.R: --------------------------------------------------------------------------------
# double data.table while inter-switching specific columns; e.g.
# when complementing DMS datatable such that Pos1 and Pos2 are symmetric
switch_double_DT = function(DT,cols_switchdouble,cols_double) {
  # Double a data.table by appending a copy in which each column pair in
  # cols_switchdouble is swapped, while cols_double columns are repeated unchanged.
  #
  # DT: input data.table
  # cols_switchdouble: list of column-name pairs, e.g. list(c("Pos1","Pos2"),c("WT_AA1","WT_AA2"))
  # cols_double: character vector of plain column names to replicate as-is
  #              (assumes these are existing column names, as used by the pipeline callers -- TODO confirm)
  #
  # Returns a new data.table with 2*nrow(DT) rows; column order is the switched
  # pairs (in list order) followed by cols_double, matching the original implementation.
  #
  # Rewritten without eval(parse()) on pasted strings: the string-building approach
  # broke for non-syntactic column names and crashed on an empty cols_switchdouble
  # list (1:length(...) iterating over c(1,0)); direct column construction handles both.
  out = list()
  for (pair in cols_switchdouble) {
    # appended half gets the partner column's values -> symmetric table
    out[[pair[1]]] = c(DT[[pair[1]]], DT[[pair[2]]])
    out[[pair[2]]] = c(DT[[pair[2]]], DT[[pair[1]]])
  }
  for (col in cols_double) {
    # same values in both halves (equivalent to data.table recycling in the original j-expression)
    out[[col]] = c(DT[[col]], DT[[col]])
  }
  return(as.data.table(out))
}
--------------------------------------------------------------------------------