├── Inputs ├── homeodomain_PF00046_curated.txt └── pgk_IPR015824.txt ├── LICENSE ├── Outputs ├── homeodomain_PF00046_curated_consensus_output.txt ├── homeodomain_PF00046_curated_gapFrequencies.png ├── homeodomain_PF00046_curated_gapStrip.txt ├── homeodomain_PF00046_curated_residueFrequencies.csv ├── homeodomain_PF00046_curated_sequenceEntropies.png ├── pgk_IPR015824_lengthFiltered.txt ├── pgk_IPR015824_lengthFiltered_output.txt └── pgk_IPR015824_sequence_length_hist.png ├── README.md ├── consensus.py ├── length_filter.py └── modules ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc └── consensus_tools.cpython-36.pyc └── consensus_tools.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 msternke 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Outputs/homeodomain_PF00046_curated_consensus_output.txt: -------------------------------------------------------------------------------- 1 | Determined consensus sequence for: Inputs/homeodomain_PF00046_curated.txt 2 | 3 | Parameters used: 4 | Method for removing insertions: 1 5 | 6 | >Consensus_sequence 7 | KRKRTRFTPEQLEELEKEFEKNPYPSREEREELAKELGLTERQVKVWFQNRRAKWKK 8 | 9 | MSA sequence entropy per residue: 2.519 10 | 11 | Wrote gap stripped alignment to: homeodomain_PF00046_curated_gapStrip.txt 12 | 13 | Wrote CSV of residue frequencies to: homeodomain_PF00046_curated_residueFrequencies.csv 14 | 15 | -------------------------------------------------------------------------------- /Outputs/homeodomain_PF00046_curated_gapFrequencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msternke/protein-consensus-sequence/88c4632f373bc440f0afa4af28c6a250e5144dbf/Outputs/homeodomain_PF00046_curated_gapFrequencies.png -------------------------------------------------------------------------------- /Outputs/homeodomain_PF00046_curated_residueFrequencies.csv: -------------------------------------------------------------------------------- 1 | ,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y,- 2 | Position 1,0.0284401662655874,0.007438197330999781,0.014657624152264275,0.025158608619558084,0.010719754977029096,0.0284401662655874,0.017282870269087727,0.018595493327499452,0.06738131699846861,0.02450229709035222,0.007875738350470356,0.03172172391161671,0.048567053161233865,0.025814920148763947,0.03565959308685189,0.04178516735943995,0.031284182892146135,0.0284401662655874,0.003500328155764603,0.02100196893458762,0.4817326624371035 3 | Position 
2,0.007656967840735069,0.0026252461168234523,0.0002187705097352877,0.0013126230584117262,0.0,0.005469262743382192,0.004594180704441042,0.0,0.19164296652811202,0.0004375410194705754,0.0017501640778823015,0.007656967840735069,0.010063443447823233,0.017282870269087727,0.37912929337125356,0.008313279369940932,0.0030627871362940277,0.0004375410194705754,0.0,0.0006563115292058631,0.3576897834171954 4 | Position 3,0.03894115073288121,0.004812951214176329,0.005031721723911617,0.02340844454167578,0.007438197330999781,0.018376722817764167,0.03281557646029315,0.024721067600087507,0.1778604244147889,0.011376066506234959,0.01028221395755852,0.03872238022314592,0.09078976154014438,0.030846641872675565,0.06738131699846861,0.06147451323561584,0.045066725005469266,0.02450229709035222,0.0,0.021439509954058193,0.2647123167796981 5 | Position 4,0.0013126230584117262,0.0013126230584117262,0.0,0.001968934587617589,0.0,0.0004375410194705754,0.002187705097352877,0.0002187705097352877,0.06497484139138045,0.001968934587617589,0.0002187705097352877,0.0017501640778823015,0.001968934587617589,0.005250492233646905,0.6558739881863924,0.0013126230584117262,0.0013126230584117262,0.0,0.0010938525486764385,0.021877050973528767,0.23495952745569898 6 | Position 5,0.017282870269087727,0.005250492233646905,0.002187705097352877,0.0026252461168234523,0.013563771603587836,0.002187705097352877,0.03106541238241085,0.025814920148763947,0.048785823670969154,0.033690658499234305,0.006344344782323342,0.008313279369940932,0.009625902428352658,0.027127543207175673,0.07481951432946839,0.02165828046379348,0.38416101509516515,0.04769197112229272,0.030846641872675565,0.004156639684970466,0.20280026252461167 7 | Position 
6,0.0977904178516736,0.015970247210676,0.0006563115292058631,0.004812951214176329,0.009625902428352658,0.0024064756070881644,0.014001312623058412,0.0409100853204988,0.04834828265149858,0.040035003281557645,0.010063443447823233,0.043097790417851674,0.029752789323999124,0.012688689564646685,0.21570772259899365,0.07613213738788012,0.11310435353314373,0.07306935025158609,0.0004375410194705754,0.0037190986654998905,0.14767009407131917 8 | Position 7,0.01793918179829359,0.001968934587617589,0.0002187705097352877,0.0,0.4204769197112229,0.0004375410194705754,0.034128199518704876,0.06584992343032159,0.029534018814263836,0.11660468168890833,0.00940713191861737,0.0004375410194705754,0.03740975716473419,0.0002187705097352877,0.002187705097352877,0.0006563115292058631,0.033471887989499016,0.010063443447823233,0.005250492233646905,0.09910304091008532,0.11463574710129075 9 | Position 8,0.017064099759352438,0.0008750820389411508,0.01903303434697003,0.02450229709035222,0.0004375410194705754,0.009188361408882083,0.0030627871362940277,0.0002187705097352877,0.040035003281557645,0.0056880332531174796,0.0030627871362940277,0.045723036534675125,0.008313279369940932,0.004375410194705754,0.01334500109385255,0.2824327280682564,0.43863487201925183,0.0004375410194705754,0.0002187705097352877,0.0002187705097352877,0.08313279369940932 10 | Position 9,0.08925836797199738,0.002187705097352877,0.06497484139138045,0.07263180923211551,0.0028440166265587398,0.02647123167796981,0.024283526580616933,0.005031721723911617,0.11441697659155546,0.016407788230146575,0.0056880332531174796,0.036315904616057756,0.15554583242178954,0.06672500546926274,0.06497484139138045,0.0903522205206738,0.05644279151170422,0.027783854736381536,0.003937869175235178,0.004812951214176329,0.06891271056661562 11 | Position 
10,0.07153795668343907,0.001968934587617589,0.07656967840735068,0.2472106760008751,0.026908772697440384,0.01793918179829359,0.0496609057099103,0.011376066506234959,0.0568803325311748,0.034346970028440164,0.007656967840735069,0.03587836359658718,0.011376066506234959,0.08072631809232116,0.02909647779479326,0.06672500546926274,0.03500328155764603,0.02384598556114636,0.015095165171734851,0.04134762633996937,0.05884926711879239 12 | Position 11,0.03281557646029315,0.0015313935681470138,0.011813607525705535,0.042660249398381096,0.0002187705097352877,0.0008750820389411508,0.009625902428352658,0.015095165171734851,0.008094508860205645,0.005469262743382192,0.0013126230584117262,0.005031721723911617,0.006125574272588055,0.7416320280026253,0.004594180704441042,0.01903303434697003,0.011157295996499672,0.030190330343469702,0.0,0.0015313935681470138,0.05119229927805732 13 | Position 12,0.011594837015970248,0.0026252461168234523,0.0008750820389411508,0.006344344782323342,0.004156639684970466,0.0008750820389411508,0.007656967840735069,0.11485451761102604,0.05316123386567491,0.4060380660686939,0.009844672938087946,0.011813607525705535,0.0004375410194705754,0.016626558739881864,0.08028877707285058,0.02909647779479326,0.06716254648873332,0.12841828921461387,0.0006563115292058631,0.003937869175235178,0.04353533143732225 14 | Position 13,0.09910304091008532,0.007000656311529206,0.0568803325311748,0.14635747101290747,0.016626558739881864,0.019251804856705315,0.029971559833734413,0.011376066506234959,0.09385254867643841,0.07022533362502735,0.01903303434697003,0.045504266024939836,0.01881426383723474,0.11572959964996718,0.08575803981623277,0.05556770947276307,0.037190986654998905,0.027565084226646247,0.0028440166265587398,0.016407788230146575,0.024939838109822796 15 | Position 
14,0.11157295996499672,0.004375410194705754,0.0059068037628527675,0.20476919711222927,0.02909647779479326,0.013782542113323124,0.010063443447823233,0.14110697877926057,0.043097790417851674,0.06497484139138045,0.021877050973528767,0.0059068037628527675,0.0008750820389411508,0.040253773791292934,0.057974185079851234,0.026033690658499236,0.0708816451542332,0.12316779698096697,0.0008750820389411508,0.013782542113323124,0.009625902428352658 16 | Position 15,0.0,0.0,0.0,0.0,0.008313279369940932,0.0,0.0,0.0024064756070881644,0.0002187705097352877,0.9402756508422665,0.040035003281557645,0.0,0.0002187705097352877,0.0,0.0,0.0002187705097352877,0.0,0.0015313935681470138,0.0,0.0,0.006781885801793918 17 | Position 16,0.003500328155764603,0.002187705097352877,0.01225114854517611,0.6690002187705097,0.0030627871362940277,0.0015313935681470138,0.01225114854517611,0.010719754977029096,0.08728943338437979,0.014657624152264275,0.009188361408882083,0.040253773791292934,0.0002187705097352877,0.0444104134762634,0.042879019908116385,0.005031721723911617,0.01334500109385255,0.012032378035440822,0.0024064756070881644,0.008313279369940932,0.005469262743382192 18 | Position 17,0.13454386348720193,0.003281557646029315,0.05381754539488077,0.10982279588711441,0.004156639684970466,0.021877050973528767,0.01881426383723474,0.007219426821264493,0.20914460730693502,0.014438853642528986,0.01334500109385255,0.05250492233646904,0.0004375410194705754,0.06781885801793919,0.1352001750164078,0.09822795887114417,0.033690658499234305,0.00940713191861737,0.0004375410194705754,0.006344344782323342,0.005469262743382192 19 | Position 
18,0.11419820608182017,0.022095821483264055,0.0015313935681470138,0.15707722598993656,0.06803762852767448,0.0024064756070881644,0.03740975716473419,0.05097352876832203,0.02822139575585211,0.07219426821264494,0.02384598556114636,0.018595493327499452,0.0,0.019470575366440604,0.06234959527455699,0.09188361408882083,0.04419164296652811,0.09341500765696784,0.02340844454167578,0.063880988842704,0.004812951214176329 20 | Position 19,0.003500328155764603,0.0004375410194705754,0.0,0.0,0.7943557208488295,0.0,0.0008750820389411508,0.0013126230584117262,0.0,0.011376066506234959,0.0010938525486764385,0.0,0.0002187705097352877,0.0002187705097352877,0.0002187705097352877,0.0006563115292058631,0.0,0.0006563115292058631,0.0028440166265587398,0.1774228833953183,0.004812951214176329 21 | Position 20,0.09822795887114417,0.011813607525705535,0.04703565959308685,0.15292058630496608,0.006781885801793918,0.0142200831327937,0.048785823670969154,0.0142200831327937,0.12338656749070226,0.0693502515860862,0.01225114854517611,0.06256836578429227,0.0002187705097352877,0.127761977685408,0.07985123605338,0.0656311529205863,0.030846641872675565,0.017501640778823015,0.0008750820389411508,0.01225114854517611,0.003500328155764603 22 | Position 21,0.05250492233646904,0.008969590899146795,0.026908772697440384,0.1354189455261431,0.027783854736381536,0.01225114854517611,0.02734631371691096,0.04244147888864581,0.15248304528549553,0.04812951214176329,0.02100196893458762,0.028658936775322687,0.0004375410194705754,0.10982279588711441,0.12448042003937869,0.05884926711879239,0.05381754539488077,0.04769197112229272,0.0006563115292058631,0.017064099759352438,0.003281557646029315 23 | Position 
22,0.0177204112885583,0.06234959527455699,0.06650623495952745,0.051629840297527894,0.008532049879676219,0.024721067600087507,0.04397287245679282,0.008532049879676219,0.04703565959308685,0.006344344782323342,0.006344344782323342,0.2966528112010501,0.0010938525486764385,0.08619558083570335,0.008969590899146795,0.09363377816670312,0.14942025814920148,0.009188361408882083,0.0017501640778823015,0.007438197330999781,0.001968934587617589 24 | Position 23,0.03522205206738132,0.006125574272588055,0.012907460074381974,0.020345657405381756,0.0017501640778823015,0.016626558739881864,0.10282213957558521,0.006781885801793918,0.14241960183767227,0.004812951214176329,0.018157952308028878,0.05884926711879239,0.21395755852111134,0.12163640341281995,0.13038722380223147,0.06256836578429227,0.03303434697002844,0.0059068037628527675,0.001968934587617589,0.0015313935681470138,0.002187705097352877 25 | Position 24,0.005031721723911617,0.006781885801793918,0.003281557646029315,0.0037190986654998905,0.06431852986217458,0.0002187705097352877,0.07569459636840954,0.006344344782323342,0.10851017282870269,0.016407788230146575,0.0030627871362940277,0.051411069787792606,0.0013126230584117262,0.011594837015970248,0.04987967621964559,0.023189674031940496,0.0656311529205863,0.005250492233646905,0.014657624152264275,0.4817326624371035,0.001968934587617589 26 | Position 25,0.0037190986654998905,0.0006563115292058631,0.0002187705097352877,0.0,0.001968934587617589,0.0004375410194705754,0.0004375410194705754,0.05578647998249836,0.0,0.24677313498140452,0.007000656311529206,0.0004375410194705754,0.6440603806606869,0.0002187705097352877,0.0004375410194705754,0.0028440166265587398,0.0013126230584117262,0.029971559833734413,0.0013126230584117262,0.0006563115292058631,0.0017501640778823015 27 | Position 
26,0.026033690658499236,0.016626558739881864,0.24458542988405163,0.03872238022314592,0.0002187705097352877,0.0354408225771166,0.005250492233646905,0.001968934587617589,0.024283526580616933,0.004812951214176329,0.0015313935681470138,0.09210238459855612,0.015751476700940712,0.004375410194705754,0.004812951214176329,0.28221395755852113,0.17829796543425946,0.019470575366440604,0.0,0.0017501640778823015,0.0017501640778823015 28 | Position 27,0.09757164734193831,0.0026252461168234523,0.03325311747976373,0.07985123605338,0.01028221395755852,0.04747320061255743,0.016626558739881864,0.05841172609932181,0.08860205644279151,0.03740975716473419,0.013563771603587836,0.015313935681470138,0.09582148326405601,0.042660249398381096,0.1310435353314373,0.07985123605338,0.051411069787792606,0.08466418726755634,0.0024064756070881644,0.00940713191861737,0.0017501640778823015 29 | Position 28,0.1190111572959965,0.004812951214176329,0.06891271056661562,0.16670312841828921,0.040035003281557645,0.023189674031940496,0.020126886895646467,0.015095165171734851,0.09078976154014438,0.03325311747976373,0.007000656311529206,0.033909429008969594,0.0656311529205863,0.08247648217020345,0.04703565959308685,0.07503828483920368,0.05250492233646904,0.02165828046379348,0.003500328155764603,0.02734631371691096,0.001968934587617589 30 | Position 29,0.04769197112229272,0.0024064756070881644,0.07722598993655655,0.2557427258805513,0.003281557646029315,0.003281557646029315,0.00940713191861737,0.02165828046379348,0.05556770947276307,0.0461605775541457,0.03740975716473419,0.007000656311529206,0.0008750820389411508,0.12404287901990811,0.13935681470137826,0.0177204112885583,0.10107197549770292,0.0444104134762634,0.0015313935681470138,0.0026252461168234523,0.0015313935681470138 31 | Position 
30,0.007000656311529206,0.012907460074381974,0.0,0.001968934587617589,0.003937869175235178,0.0015313935681470138,0.0013126230584117262,0.1120105009844673,0.15379566834390723,0.06541238241085101,0.025158608619558084,0.0008750820389411508,0.0002187705097352877,0.01881426383723474,0.5309560271275432,0.003281557646029315,0.009625902428352658,0.03500328155764603,0.0010938525486764385,0.0142200831327937,0.0008750820389411508 32 | Position 31,0.08750820389411507,0.012469919054911398,0.05600525049223365,0.2647123167796981,0.008532049879676219,0.01028221395755852,0.014001312623058412,0.048567053161233865,0.09625902428352658,0.06125574272588055,0.024283526580616933,0.022314591992999344,0.0006563115292058631,0.08378910522861518,0.06409975935243929,0.04397287245679282,0.03500328155764603,0.04353533143732225,0.0024064756070881644,0.01903303434697003,0.0013126230584117262 33 | Position 32,0.0693502515860862,0.003937869175235178,0.05316123386567491,0.29271494202581494,0.008750820389411508,0.022314591992999344,0.03500328155764603,0.014438853642528986,0.07985123605338,0.03281557646029315,0.013563771603587836,0.031502953401881424,0.0017501640778823015,0.13169984686064318,0.10063443447823234,0.04922336469043973,0.028002625246116825,0.016407788230146575,0.003500328155764603,0.01050098446729381,0.0008750820389411508 34 | Position 33,0.001968934587617589,0.0010938525486764385,0.0,0.0,0.013563771603587836,0.0002187705097352877,0.0006563115292058631,0.2198643622839641,0.0,0.6862830890395974,0.0266900021877051,0.0,0.0,0.0002187705097352877,0.0,0.0004375410194705754,0.0024064756070881644,0.03522205206738132,0.0004375410194705754,0.009844672938087946,0.0010938525486764385 35 | Position 
34,0.6709691533581273,0.013782542113323124,0.0004375410194705754,0.003281557646029315,0.0006563115292058631,0.036753445635528334,0.0008750820389411508,0.008969590899146795,0.0008750820389411508,0.0028440166265587398,0.008313279369940932,0.0008750820389411508,0.0006563115292058631,0.004375410194705754,0.015313935681470138,0.17195362065193612,0.030627871362940276,0.027127543207175673,0.0,0.0006563115292058631,0.0006563115292058631 36 | Position 35,0.1120105009844673,0.005469262743382192,0.04397287245679282,0.12754320717567272,0.0056880332531174796,0.02340844454167578,0.0284401662655874,0.016407788230146575,0.15182673375628966,0.04244147888864581,0.020126886895646467,0.05316123386567491,0.0002187705097352877,0.08378910522861518,0.11944869831546708,0.08094508860205644,0.05075475825858674,0.02931524830452855,0.0008750820389411508,0.003500328155764603,0.0006563115292058631 37 | Position 36,0.07241303872238022,0.004812951214176329,0.033690658499234305,0.19426821264493546,0.004375410194705754,0.009844672938087946,0.015751476700940712,0.02362721505141107,0.14942025814920148,0.042222708378910526,0.023189674031940496,0.02056442791511704,0.0015313935681470138,0.13695033909429008,0.11135418945526143,0.07503828483920368,0.03456574053817545,0.030190330343469702,0.008094508860205645,0.007219426821264493,0.0008750820389411508 38 | Position 37,0.010938525486764383,0.0496609057099103,0.0,0.0006563115292058631,0.014876394661999562,0.0024064756070881644,0.0006563115292058631,0.1067600087508204,0.0002187705097352877,0.535331437322249,0.020126886895646467,0.0024064756070881644,0.0002187705097352877,0.0006563115292058631,0.0004375410194705754,0.015095165171734851,0.17807919492452418,0.05622402100196894,0.0013126230584117262,0.003281557646029315,0.0006563115292058631 39 | Position 
38,0.01793918179829359,0.015313935681470138,0.0728505797418508,0.028658936775322687,0.0006563115292058631,0.39181798293590026,0.030846641872675565,0.0002187705097352877,0.08094508860205644,0.0037190986654998905,0.004812951214176329,0.17195362065193612,0.009844672938087946,0.05381754539488077,0.03894115073288121,0.06256836578429227,0.004375410194705754,0.0028440166265587398,0.0026252461168234523,0.003937869175235178,0.0013126230584117262 40 | Position 39,0.003281557646029315,0.008750820389411508,0.0,0.0010938525486764385,0.005250492233646905,0.0002187705097352877,0.0010938525486764385,0.07919492452417413,0.0030627871362940277,0.6795012032378035,0.10413476263399694,0.0,0.0004375410194705754,0.0008750820389411508,0.0030627871362940277,0.0030627871362940277,0.003500328155764603,0.0851017282870269,0.01050098446729381,0.006344344782323342,0.0015313935681470138 41 | Position 40,0.018595493327499452,0.002187705097352877,0.06475607088164516,0.08094508860205644,0.0,0.015313935681470138,0.009625902428352658,0.0017501640778823015,0.036972216145263616,0.005250492233646905,0.0017501640778823015,0.04047254430102822,0.11966746882520236,0.042660249398381096,0.02822139575585211,0.23320936337781667,0.29052723692846205,0.003937869175235178,0.0028440166265587398,0.0,0.0013126230584117262 42 | Position 41,0.030627871362940276,0.001968934587617589,0.05009844672938088,0.3824108510172829,0.015970247210676,0.01881426383723474,0.008313279369940932,0.017282870269087727,0.06344344782323343,0.036534675125793045,0.011376066506234959,0.01312623058411726,0.14832640560052504,0.025814920148763947,0.047910741632028006,0.02559614963902866,0.02931524830452855,0.0496609057099103,0.0013126230584117262,0.020345657405381756,0.0017501640778823015 43 | Position 
42,0.08050754758258587,0.0056880332531174796,0.037190986654998905,0.06344344782323343,0.002187705097352877,0.014876394661999562,0.015095165171734851,0.009844672938087946,0.13235615838984904,0.017501640778823015,0.005031721723911617,0.03172172391161671,0.01334500109385255,0.02822139575585211,0.2931524830452855,0.09604025377379129,0.12557427258805512,0.01793918179829359,0.001968934587617589,0.0059068037628527675,0.0024064756070881644 44 | Position 43,0.02165828046379348,0.0015313935681470138,0.01050098446729381,0.0533800043754102,0.0024064756070881644,0.0037190986654998905,0.006781885801793918,0.00656311529205863,0.051411069787792606,0.005469262743382192,0.0006563115292058631,0.042879019908116385,0.0004375410194705754,0.46970028440166267,0.14329468387661343,0.03325311747976373,0.04812951214176329,0.09166484357908554,0.0013126230584117262,0.0026252461168234523,0.0026252461168234523 45 | Position 44,0.004375410194705754,0.0002187705097352877,0.0002187705097352877,0.0006563115292058631,0.0010938525486764385,0.0006563115292058631,0.0,0.3043097790417852,0.0002187705097352877,0.026908772697440384,0.0004375410194705754,0.0002187705097352877,0.0,0.0,0.0004375410194705754,0.0,0.015095165171734851,0.640341281995187,0.0,0.0002187705097352877,0.004594180704441042 46 | Position 45,0.048567053161233865,0.0013126230584117262,0.017282870269087727,0.05469262743382192,0.018595493327499452,0.006781885801793918,0.011813607525705535,0.012032378035440822,0.30540363159046163,0.008750820389411508,0.008750820389411508,0.024939838109822796,0.0006563115292058631,0.2644935462699628,0.0905709910304091,0.0461605775541457,0.031502953401881424,0.02165828046379348,0.003500328155764603,0.016845329249617153,0.0056880332531174796 47 | Position 
46,0.015532706191205425,0.0056880332531174796,0.005250492233646905,0.004812951214176329,0.034346970028440164,0.008969590899146795,0.009625902428352658,0.2535550207831984,0.04047254430102822,0.007438197330999781,0.010719754977029096,0.10697877926055567,0.0006563115292058631,0.00656311529205863,0.03959746226208707,0.018376722817764167,0.07197549770290965,0.30977904178516735,0.0010938525486764385,0.027783854736381536,0.02078319842485233 48 | Position 47,0.0,0.0008750820389411508,0.0002187705097352877,0.0,0.018376722817764167,0.0006563115292058631,0.0026252461168234523,0.0004375410194705754,0.0002187705097352877,0.0006563115292058631,0.0,0.0002187705097352877,0.0,0.0,0.0013126230584117262,0.0010938525486764385,0.0006563115292058631,0.0,0.9317436009625902,0.008094508860205645,0.03281557646029315 49 | Position 48,0.0004375410194705754,0.0004375410194705754,0.0,0.0,0.9240866331218551,0.0,0.0004375410194705754,0.0026252461168234523,0.0,0.007000656311529206,0.0002187705097352877,0.0002187705097352877,0.0,0.0004375410194705754,0.0004375410194705754,0.0002187705097352877,0.0,0.003500328155764603,0.0028440166265587398,0.021220739444322904,0.03587836359658718 50 | Position 49,0.02056442791511704,0.04462918398599869,0.003281557646029315,0.012907460074381974,0.002187705097352877,0.01618901772041129,0.012032378035440822,0.005469262743382192,0.09363377816670312,0.004156639684970466,0.012688689564646685,0.011594837015970248,0.0010938525486764385,0.5889302122073944,0.054911397943557205,0.05578647998249836,0.014001312623058412,0.0030627871362940277,0.0006563115292058631,0.004375410194705754,0.03784729818420477 51 | Position 
50,0.00656311529205863,0.0008750820389411508,0.022752133012469918,0.019251804856705315,0.0002187705097352877,0.003937869175235178,0.024939838109822796,0.0024064756070881644,0.014657624152264275,0.0030627871362940277,0.0026252461168234523,0.7956683439072413,0.0,0.00656311529205863,0.030409100853204987,0.009844672938087946,0.005031721723911617,0.0026252461168234523,0.0006563115292058631,0.0030627871362940277,0.04484795449573398 52 | Position 51,0.03609713410632247,0.005250492233646905,0.0004375410194705754,0.007219426821264493,0.0002187705097352877,0.0006563115292058631,0.036972216145263616,0.0,0.11244804200393788,0.002187705097352877,0.003937869175235178,0.008313279369940932,0.0002187705097352877,0.04812951214176329,0.6453730037190987,0.009188361408882083,0.028002625246116825,0.0002187705097352877,0.0006563115292058631,0.007219426821264493,0.04725443010282214 53 | Position 52,0.0,0.0013126230584117262,0.0002187705097352877,0.0004375410194705754,0.0,0.0002187705097352877,0.0006563115292058631,0.0,0.03172172391161671,0.008313279369940932,0.0002187705097352877,0.0026252461168234523,0.0002187705097352877,0.0024064756070881644,0.8956464668562678,0.0004375410194705754,0.0002187705097352877,0.0,0.0006563115292058631,0.0006563115292058631,0.05403631590461606 54 | Position 53,0.352876832203019,0.010063443447823233,0.0024064756070881644,0.008094508860205645,0.01028221395755852,0.0017501640778823015,0.004156639684970466,0.015095165171734851,0.017501640778823015,0.016407788230146575,0.09363377816670312,0.04637934806388099,0.0004375410194705754,0.10347845110479108,0.0409100853204988,0.04397287245679282,0.07175672719317436,0.01618901772041129,0.0015313935681470138,0.05600525049223365,0.0870706628746445 55 | Position 
54,0.022314591992999344,0.0006563115292058631,0.0015313935681470138,0.008969590899146795,0.0006563115292058631,0.0017501640778823015,0.007438197330999781,0.004594180704441042,0.4847954495733975,0.006125574272588055,0.0026252461168234523,0.012469919054911398,0.0006563115292058631,0.0656311529205863,0.2325530518486108,0.02100196893458762,0.008313279369940932,0.0028440166265587398,0.0006563115292058631,0.0008750820389411508,0.1135418945526143 56 | Position 55,0.04134762633996937,0.024721067600087507,0.09144607306935025,0.11310435353314373,0.028877707285057973,0.00940713191861737,0.029534018814263836,0.027565084226646247,0.015313935681470138,0.06103697221614526,0.033909429008969594,0.011813607525705535,0.0008750820389411508,0.0496609057099103,0.014001312623058412,0.04112885583023408,0.03281557646029315,0.02100196893458762,0.15357689783417194,0.033909429008969594,0.1649529643404069 57 | Position 56,0.0,0.0,0.0,0.0,0.0,0.0,0.0008750820389411508,0.0,0.46970028440166267,0.0,0.0,0.0002187705097352877,0.0,0.005031721723911617,0.3176547801356377,0.0,0.0,0.0,0.0,0.0,0.20651936119011158 58 | Position 57,0.0,0.0,0.0,0.0,0.0,0.0,0.007000656311529206,0.0,0.2574928899584336,0.0,0.0,0.014657624152264275,0.0,0.04484795449573398,0.24217895427696345,0.0,0.0,0.0,0.0,0.0,0.4338219208050755 59 | -------------------------------------------------------------------------------- /Outputs/homeodomain_PF00046_curated_sequenceEntropies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msternke/protein-consensus-sequence/88c4632f373bc440f0afa4af28c6a250e5144dbf/Outputs/homeodomain_PF00046_curated_sequenceEntropies.png -------------------------------------------------------------------------------- /Outputs/pgk_IPR015824_lengthFiltered_output.txt: -------------------------------------------------------------------------------- 1 | Filtered sequence set: Inputs/pgk_IPR015824.txt 2 | 3 | Parameters used: 4 | Sequence length 
filtering threshold: 0.3 5 | 6 | Number of sequences in initial alignment: 17724 7 | 8 | Median sequence length: 397 residues 9 | Sequence length lower boundary: 278 residues 10 | Sequence length upper boundary: 516 residues 11 | 12 | Number of sequences in final alignment: 16743 13 | 14 | Wrote length filtered sequence set to: pgk_IPR015824_lengthFiltered.txt -------------------------------------------------------------------------------- /Outputs/pgk_IPR015824_sequence_length_hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/msternke/protein-consensus-sequence/88c4632f373bc440f0afa4af28c6a250e5144dbf/Outputs/pgk_IPR015824_sequence_length_hist.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # protein-consensus-sequence 2 | 3 | ## Table of contents 4 | * [Overview](#Overview) 5 | * [Requirements](#Requirements) 6 | * [Installation](#Installation) 7 | * [length_filter.py](#length_filter.py) 8 | * [consensus.py](#consensus.py) 9 | * [Workflow](#Workflow) 10 | * [References](#References) 11 | 12 | 13 | ## Overview 14 | Protein consensus sequence design has been shown to be a successful strategy for engineering highly stable proteins that retain their biological activities. A protein consensus sequence is composed of the most frequent residue at all positions in a multiple sequence alignment (MSA) of homologous protein sequences. All that is needed to design a protein consensus sequence is an MSA for the target protein family and basic coding scripts to determine residue frequencies at all positions in the MSA. Applying preprocessing steps to a sequence set can improve sequence alignment and the resulting consensus sequence.
Here we have made available a script (length_filter.py) to assist in preprocessing a sequence set by filtering sequences by sequence length prior to sequence alignment, and a script (consensus.py) to determine residue frequencies at all positions in an MSA, filter residue insertions from the MSA, and determine a consensus sequence. 15 | 16 | ## Requirements 17 | Both scripts require Python 3.6 or newer. Scripts were written using Python 3.6 on a MacOSX (Unix) system. 18 | 19 | Both scripts require the following non-standard packages: 20 | numpy 21 | matplotlib 22 | 23 | These packages can be installed using pip from the command line by running: 24 | ``` 25 | pip3 install numpy matplotlib 26 | ``` 27 | For information on installing or using pip if necessary see: [pip](https://pip.pypa.io/en/stable/installing/). 28 | 29 | ## Installation 30 | To install protein-consensus-sequence, clone or download this repository to your local computer. 31 | 32 | Both length_filter.py and consensus.py are run from the command line. Customizable parameters are given as command line options to allow the user to use the scripts as desired. The following is a tutorial on how to use both the length_filter.py and consensus.py scripts. 33 | 34 | NOTE: Sequence sets and MSAs must be in the FASTA format. 35 | 36 | NOTE: Both length_filter.py and consensus.py must be made executable before first use. This can be done from the command line by: 37 | ``` 38 | chmod +x length_filter.py 39 | chmod +x consensus.py 40 | ``` 41 | 42 | ## length_filter.py 43 | The script length_filter.py will take in a sequence set (must be in FASTA format) and remove sequence truncations (very short sequences) and anomalously long sequences (often from large insertions) from the set. This filtering is done by removing sequences that deviate from the median sequence length of the set by +/- a threshold length percentage. A default threshold of 30% deviation is used, but users can change this using the '-t' flag.
This preprocessing step helps create better sequence alignments. 44 | 45 | ### Steps to run length_filter.py 46 | 1. Open command line interface and enter protein-consensus-sequence directory by: 47 | ``` 48 | cd /protein-consensus-sequence 49 | ``` 50 | 2. If not done so already, change file permissions to executable (see Installation section) 51 | 3. Place desired sequence set in Inputs folder 52 | 4. Run length_filter.py script (example is for default parameters) from command line by: 53 | ``` 54 | ./length_filter.py -i Inputs/<sequence_set_file> 55 | ``` 56 | 5. A curated sequence set file, a histogram of sequence lengths, and a summary file will be saved in the Outputs folder 57 | 58 | To use as an example, a sequence set of phosphoglycerate kinase (PGK) obtained from the Interpro database is in the Inputs folder. To run this example, run the following from the command line: 59 | ``` 60 | ./length_filter.py -i Inputs/pgk_IPR015824.txt 61 | ``` 62 | 63 | ### Command line options 64 | Customizable options can be given as flags on the command line. To see the various available flag options run 65 | ``` 66 | ./length_filter.py -h 67 | ``` 68 | The following table gives an overview of the flag options as well. 69 | 70 | | Flag | Description | 71 | |:-:|:-:| 72 | | -i | Filename for FASTA alignment.| 73 | | -o | Output FASTA filename. If not given will use name of input FASTA file as template to name output files. | 74 | | -t | Sequence length filtering threshold value (default: 0.3 for removing sequences that deviate +/- 30% from median sequence length of set). Must be a value between 0 and 1.| 75 | | -f | Include flag to prevent saving image of sequence lengths histogram | 76 | | -a | Include flag to keep gaps in sequences for output FASTA alignment (not recommended for further curation).
| 77 | 78 | For example, to run the PGK sequence set with a different length filtering threshold value, run the following from the command line: 79 | ``` 80 | ./length_filter.py -i Inputs/pgk_IPR015824.txt -t 0.5 81 | ``` 82 | 83 | ## consensus.py 84 | The script consensus.py will take in an MSA (must be in FASTA format), calculate the residue frequencies at all positions in the MSA, filter insertion positions (see note below for an explanation of the insertion filtering methods), and determine a consensus sequence for the MSA. 85 | 86 | NOTE: consensus.py can be run directly on an un-curated MSA obtained from a database such as Pfam (i.e. skipping steps 2-4 in Workflow section below). However, it is recommended that these curation steps be applied. 87 | 88 | ### Steps to run consensus.py 89 | 90 | 1. Open command line interface and enter protein-consensus-sequence directory by: 91 | ``` 92 | cd /protein-consensus-sequence 93 | ``` 94 | 2. If not done so already, change file permissions to executable (see Installation section) 95 | 3. Place desired MSA in Inputs folder 96 | 4. Run consensus.py script (example is for default parameters) from command line by: 97 | ``` 98 | ./consensus.py -i Inputs/<msa_file> 99 | ``` 100 | 5. An insertion-filtered (gap stripped) alignment of all sequences in the MSA, a CSV file of residue frequencies at all positions in the insertion-filtered alignment, a plot of the gap frequencies for all positions, a plot of sequence entropies (a measure of position conservation), and a summary file will be saved in the Outputs folder. 101 | 102 | To use as an example, a previously curated MSA of homeodomains (HD) obtained from the Pfam database is in the Inputs folder. To run this example (using default parameters), run the following from the command line: 103 | ``` 104 | ./consensus.py -i Inputs/homeodomain_PF00046_curated.txt 105 | ``` 106 | 107 | ### Command line options 108 | 109 | Customizable options can be given as flags on the command line.
To see the various available flag options run 110 | ``` 111 | ./consensus.py -h 112 | ``` 113 | The following table gives an overview of the flag options as well. 114 | 115 | | Flag | Description | 116 | |:-:|:-:| 117 | | -i | Filename for FASTA alignment.| 118 | | -c | Desired method for removing insertions. See below for explanation of methods | 119 | | -o | Output FASTA filename. If not given will use name of input FASTA file as template to name output files. | 120 | | -t | Gap frequency threshold to define consensus positions. Only valid for Option 1 for removing insertions. Must be a value between 0 and 1 (default: 0.5).| 121 | | -f | Include flag to prevent saving images of MSA data analysis. | 122 | 123 | ### Insertion filtering options 124 | Filtering an MSA for insertions is done differently by different groups. Filtering insertions can be viewed as "Which positions do I want to include in my consensus sequence?" The consensus.py script allows for users to choose which method they prefer. The available options are: 125 | 126 | 1. (Recommended) Remove positions for which the gap frequency is >= 50% (the default threshold, adjustable with the -t flag). Or put conversely, keep positions for which the gap frequency is < 50%. Note that for this strategy positions included in the consensus sequence can have a gap as the most frequent character in the alignment, however more than half of the sequences in the alignment will have a residue at each included position. 127 | 2. Remove positions for which a gap is the most frequent character. Or put conversely, keep positions for which a residue is the most frequent character. 128 | 3. Remove positions that contain gaps in a user-specified reference sequence. For this option, the user will be asked to provide the ID of the reference sequence. Note that the reference sequence and ID must be present in the MSA.
129 | 130 | ## Workflow 131 | The scripts available in protein-consensus-sequence do not encompass all steps in the process of designing a protein consensus sequence. Rather they are intended to be used in combination with a program to filter for sequence redundancy and a multiple sequence alignment program. The workflow for the entire process of determining a consensus sequence is: 132 | 133 | 1. Obtain a sequence set of target protein family from a database such as: 134 | [Pfam](http://pfam.xfam.org/) 135 | [Interpro](https://www.ebi.ac.uk/interpro/) 136 | [SMART](http://smart.embl-heidelberg.de/) 137 | 2. Filter sequence set by sequence lengths with length_filter.py script. 138 | 3. Filter sequence set for redundant sequences using a program such as: 139 | [CDHIT](http://weizhongli-lab.org/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit) 140 | [UCLUST](https://drive5.com/usearch/manual/uclust_algo.html) 141 | 4. Align the curated sequence set using: 142 | [MAFFT](https://mafft.cbrc.jp/alignment/server/) 143 | [Clustal Omega](https://www.ebi.ac.uk/Tools/msa/clustalo/) 144 | 5. Determine consensus sequence using consensus.py script. 145 | 146 | 147 | ## References 148 | For more information on protein consensus sequence design see: 149 | 150 | 1. [Consensus sequence design as a general strategy to create hyperstable, biologically active proteins. Sternke et al. PNAS, 2019.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6561275/) 151 | 2. [Consensus protein design. Porebski and Buckle.
Protein Eng Des Sel, 2016.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4917058/) 152 | -------------------------------------------------------------------------------- /consensus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import matplotlib as mpl 6 | import matplotlib.pyplot as plt 7 | import os 8 | import collections 9 | import sys 10 | import csv 11 | import modules 12 | 13 | if __name__ == '__main__': 14 | 15 | #set path to current directory 16 | path = os.getcwd() + '/' 17 | 18 | #make directory for outputs if it does not exist 19 | if not os.path.exists('Outputs/'): 20 | os.makedirs('Outputs/') 21 | 22 | parser = argparse.ArgumentParser() 23 | 24 | parser.add_argument('-i', required=True, metavar = 'Input FASTA file.', help='Filename for FASTA alignment') 25 | parser.add_argument('-o', default='out.txt', metavar = 'Output gap stripped FASTA file name', help='Output FASTA filename. If not given will use name of input FASTA file as template to name output files.') 26 | parser.add_argument('-c', default='0', metavar = 'Method for removing insertions', help='Desired method for removing insertions. 1 = Positions with gap frequencies < threshold (0.5 default, change with -t flag). 2 = Positions with residue as most frequent character. 3 = Positions with residues in a specific sequence. If not given will ask for user input upon running script. See README for further explantion of methods.') 27 | parser.add_argument('-t', type=float, default = 0.5, metavar = 'Gap frequency threshold', help='Gap frequecy threshold to define a consensus positions. Only valid for Option 1 for removing insertions. 
Must be a value between 0 and 1 (default: 0.5)') 28 | parser.add_argument('-f', action='store_true', help='Include flag to prevent saving images of MSA data analysis.') 29 | 30 | args = parser.parse_args() 31 | 32 | #Show help if no arguments passed 33 | if len(sys.argv) < 2: 34 | parser.print_help() 35 | sys.exit(1) 36 | 37 | #FIGURE SETTINGS 38 | mpl.rcParams['axes.titlesize'] = 18 39 | mpl.rcParams['axes.labelsize'] = 18 40 | mpl.rcParams['xtick.labelsize'] = 14 41 | mpl.rcParams['ytick.labelsize'] = 14 42 | mpl.rcParams['axes.facecolor'] = 'FFFFFF' 43 | mpl.rcParams['axes.edgecolor'] = '000000' 44 | mpl.rcParams['axes.linewidth'] = 1.0 45 | mpl.rcParams['axes.labelweight'] = 'regular' 46 | mpl.rcParams['xtick.major.pad'] = 3 47 | mpl.rcParams['ytick.major.pad'] = 3 48 | plt.rcParams['font.family'] = 'sans-serif' 49 | 50 | print() 51 | print(f'Reading file: {args.i}') 52 | 53 | #read FASTA and populate lists for sequences and IDs 54 | seqs = list() 55 | names = list() 56 | 57 | #flag to strip gaps when cleaning sequence 58 | strip_gaps = False 59 | 60 | try: 61 | with open(args.i, 'r') as n: 62 | seqs, ids = modules.read_fasta(n, strip_gaps) 63 | except FileNotFoundError: 64 | print(f'Could not find file: {args.i}') 65 | sys.exit(1) 66 | 67 | #If no sequences were added to list, file was not in FASTA format 68 | if len(seqs) < 2: 69 | print('Provided file is not in FASTA format.') 70 | sys.exit(1) 71 | 72 | #Ensures sequences are aligned (all sequences have same length) 73 | if not modules.is_fasta_aligned(seqs): 74 | print('Sequences in provided FASTA are not aligned') 75 | sys.exit(1) 76 | 77 | num_seqs = len(seqs) 78 | 79 | #standard 20 amino acid alphabet 80 | res_list = list('ACDEFGHIKLMNPQRSTVWY-') 81 | 82 | #Matrix for residue frequencies at each position 83 | marginals = modules.marginal_frequencies(seqs, res_list) 84 | 85 | #Calculating consensus sequences for method based on user input 86 | consensus_positions = list() 87 | consensus_sequence = list() 
88 | 89 | #Get user input for how to determine positions to include from MSA (handling gaps) 90 | #Users must eneter '1', '2', or '3' 91 | #Exits script if they do not enter valid input in five tries 92 | consensus_choice = args.c 93 | nTries = 5 94 | while consensus_choice not in ['1', '2', '3'] and nTries > 0: 95 | consensus_choice = input('\nWhich method for determining consensus positions do you want?\n***Note: Option 1 recommended.***\n1: Positions with gap frequencies < threshold (0.5 default, change with -t flag)\n2: Positions with residue as most frequent character\n3: Positions with residues in a specific sequence (you will give sequence ID)\n') 96 | nTries -= 1 97 | if nTries == 0: 98 | print('Invalid responses. Must enter 1, 2, or 3. Exiting program...') 99 | sys.exit(1) 100 | print() 101 | 102 | #Includes all positions for which gap frequency < 0.5 103 | #In other words, the sum of frequencies of 20 residues is > 0.5 104 | #This option can include positions where a gap is the most frequent occurrence 105 | if consensus_choice == '1': 106 | threshold = args.t 107 | if threshold < 0 or threshold > 1: 108 | print('Gap frequency threshold must be between 0 and 1') 109 | sys.exit(1) 110 | 111 | for i, j in enumerate(marginals): 112 | if j[-1] < threshold: 113 | max_res = np.argmax(j[:-1]) 114 | consensus_positions.append(i) 115 | consensus_sequence.append(res_list[max_res]) 116 | 117 | #Includes all positions for which a residue is the most frequent ocurrence 118 | #In other words, eliminates positions for which the most frequent occurrence is a gap 119 | elif consensus_choice == '2': 120 | for i, j in enumerate(marginals): 121 | max_res = np.argmax(j) 122 | if max_res != len(j) - 1: 123 | consensus_positions.append(i) 124 | consensus_sequence.append(res_list[max_res]) 125 | 126 | #Includes all positions occupied by residues in a user-defined reference sequence 127 | else: 128 | #Take user input for sequence ID of reference sequence 129 | ref_seq = 
input('What is the ID of your reference sequence? ') 130 | 131 | if len(ref_seq) == 0: 132 | print('\nNo sequence ID was given...') 133 | sys.exit(1) 134 | 135 | #Users may not give sequence ID with leading '>' character 136 | #IDs from FASTA all have leading '>' character, so must add to reference ID 137 | if ref_seq[0] != '>': 138 | ref_seq = ''.join(['>', ref_seq]) 139 | 140 | try: 141 | ref_index = ids.index(ref_seq) 142 | except ValueError: 143 | print('\nCould not find sequence ID in set. Check your MSA for the correct ID...') 144 | sys.exit(1) 145 | 146 | for i, (j, k) in enumerate(zip(seqs[ref_index], marginals)): 147 | if j != '-': 148 | consensus_positions.append(i) 149 | max_res = np.argmax(k[:-1]) 150 | consensus_sequence.append(res_list[max_res]) 151 | 152 | consensus_sequence = ''.join(consensus_sequence) 153 | print(f'Consensus sequence: {consensus_sequence}') 154 | print() 155 | 156 | #Removes positions with high gap frequencies from all sequences in alignment 157 | #Calculates residue frequencies for gap stripped alignment 158 | seqs_gap_stripped = list() 159 | for i in seqs: 160 | seqs_gap_stripped.append(''.join([i[j] for j in consensus_positions])) 161 | marginals_gap_stripped = modules.marginal_frequencies(seqs_gap_stripped, res_list) 162 | 163 | #Calculates sequence entropies for all poisitions in the gap stripped alignment 164 | seq_entropies = modules.seq_entropy(marginals_gap_stripped) 165 | 166 | #Creates matrix of residue frequencies at all positions in gap stripped alignment 167 | #in a format for exporting as a CSV 168 | out_marginals = [] 169 | for i, j in enumerate(marginals_gap_stripped): 170 | if i == 0: 171 | out_marginals.append([''] + res_list) 172 | out_marginals.append([f'Position {i+1}'] + list(j)) 173 | else: 174 | out_marginals.append([f'Position {i+1}'] + list(j)) 175 | 176 | #Calculating gap frequencies 177 | #Gap is last element in each row of marginal frequencies 178 | gap_frequencies = [i[-1] for i in 
marginals_gap_stripped] 179 | 180 | #Setting path for file outputs 181 | #If the user does not supply a name for the output, use input file name 182 | #If user used a file from the Inputs directory, must remove 'Inputs/' from path 183 | out_file_name = args.o 184 | if out_file_name == 'out.txt': 185 | out_file_prefix = os.path.splitext(args.i)[0] 186 | if 'Inputs' in out_file_prefix: 187 | out_file_prefix = out_file_prefix.split('/')[1] 188 | else: 189 | out_file_prefix = out_file_name.split('.')[0] 190 | 191 | #Save figures if -f flag is not given 192 | if not args.f: 193 | #Save figure of alignment sequence entropies 194 | fig, ax = plt.subplots() 195 | ax.stem(np.arange(1, len(seq_entropies) + 1), seq_entropies, use_line_collection = True) 196 | ax.set_xlabel('Sequence position') 197 | ax.set_ylabel('Sequence entropy (bits)') 198 | ax.set_ylim(0, 4.5) 199 | fig.savefig(f'{path}Outputs/{out_file_prefix}_sequenceEntropies.png', bbox_inches = 'tight', dpi = 300) 200 | plt.close() 201 | 202 | #Save figure of gap stripped alignment gap frequencies 203 | fig, ax = plt.subplots() 204 | ax.stem(np.arange(1, len(gap_frequencies) + 1), gap_frequencies, use_line_collection = True) 205 | ax.set_xlabel('Sequence position') 206 | ax.set_ylabel('Gap frequency') 207 | ax.set_ylim(0, 1) 208 | fig.savefig(f'{path}Outputs/{out_file_prefix}_gapFrequencies.png', bbox_inches = 'tight', dpi = 300) 209 | plt.close() 210 | 211 | #Create a FASTA of the gap stripped alignment 212 | out_fasta = modules.make_fasta(ids, seqs_gap_stripped) 213 | 214 | #Save files 215 | with open(f'{path}Outputs/{out_file_prefix}_residueFrequencies.csv', 'w', newline='') as f: 216 | writer = csv.writer(f) 217 | writer.writerows(out_marginals) 218 | 219 | #Write summary of analysis to file 220 | with open(f'{path}Outputs/{out_file_prefix}_consensus_output.txt', 'w', newline='') as f: 221 | f.write(f'Determined consensus sequence for: {args.i}\n\n') 222 | f.write('Parameters used:\n') 223 | if consensus_choice 
== 1: 224 | f.write(f'Method for removing insertions: {consensus_choice}\n') 225 | f.write(f'Gap frequency threshold: {threshold}\n\n') 226 | else: 227 | f.write(f'Method for removing insertions: {consensus_choice}\n\n') 228 | f.write(f'>Consensus_sequence\n{consensus_sequence}\n\n') 229 | f.write(f'MSA sequence entropy per residue: {np.mean(seq_entropies):.3f}\n\n') 230 | f.write(f'Wrote gap stripped alignment to: {out_file_prefix}_gapStrip.txt\n\n') 231 | f.write(f'Wrote CSV of residue frequencies to: {out_file_prefix}_residueFrequencies.csv\n\n') 232 | if args.f: 233 | f.write(f'Wrote plot of sequence entropies to: {out_file_prefix}_sequenceEntropies.png\n\n') 234 | f.write(f'Wrote plot of gap frequencies to: {out_file_prefix}_gapFrequencies.png\n\n') 235 | 236 | with open(f'{path}Outputs/{out_file_prefix}_gapStrip.txt', 'w') as f: 237 | f.writelines(out_fasta) 238 | 239 | print(f'Wrote summary of output to: Outputs/{out_file_prefix}_consensus_output.txt') 240 | -------------------------------------------------------------------------------- /length_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import matplotlib as mpl 6 | import matplotlib.pyplot as plt 7 | import os 8 | import sys 9 | import modules 10 | 11 | if __name__ == '__main__': 12 | 13 | #Set path to current directory 14 | path = os.getcwd() + '/' 15 | 16 | #make directory for outputs if it does not exist 17 | if not os.path.exists('Outputs/'): 18 | os.makedirs('Outputs/') 19 | 20 | parser = argparse.ArgumentParser() 21 | 22 | parser.add_argument('-i', required=True, metavar = 'Input FASTA file', help='Filename for FASTA alignment.') 23 | parser.add_argument('-o', metavar = 'Output FASTA filename. 
If not given will use name of input FASTA file as template to name output files.', default='out.txt', help='Output FASTA filename') 24 | parser.add_argument('-t', metavar = 'Length filtering threshold', type=float, default=0.3, help='Sequence length filtering threshold value (default: 0.3 for removing sequences that deviate +/- 30% from median sequence length of set, change with -t flag). Must be a value between 0 and 1.') 25 | parser.add_argument('-f', action='store_true', help='Include flag to prevent saving image of sequence lengths histogram') 26 | parser.add_argument('-a', action='store_true', help='Include flag to keep gaps in sequences for output FASTA alignment (not recommended for further curation).') 27 | 28 | args = parser.parse_args() 29 | 30 | # show help if no arguments passed 31 | if len(sys.argv) < 2: 32 | parser.print_help() 33 | sys.exit(1) 34 | 35 | #FIGURE SETTINGS 36 | mpl.rcParams['axes.titlesize'] = 18 37 | mpl.rcParams['axes.labelsize'] = 18 38 | mpl.rcParams['xtick.labelsize'] = 14 39 | mpl.rcParams['ytick.labelsize'] = 14 40 | mpl.rcParams['axes.facecolor'] = 'FFFFFF' 41 | mpl.rcParams['axes.edgecolor'] = '000000' 42 | mpl.rcParams['axes.linewidth'] = 1.0 43 | mpl.rcParams['axes.labelweight'] = 'regular' 44 | mpl.rcParams['xtick.major.pad'] = 3 45 | mpl.rcParams['ytick.major.pad'] = 3 46 | plt.rcParams['font.family'] = 'sans-serif' 47 | 48 | 49 | print() 50 | print(f'Reading file: {args.i}') 51 | print() 52 | 53 | #read FASTA and populate lists for sequences and IDs 54 | seqs = list() 55 | names = list() 56 | 57 | #flag to strip gaps when cleaning sequence 58 | strip_gaps = True 59 | 60 | try: 61 | with open(args.i, 'r') as n: 62 | seqs, ids = modules.read_fasta(n, strip_gaps) 63 | except FileNotFoundError: 64 | print(f'Could not find file: {args.i}') 65 | sys.exit(1) 66 | 67 | #If no sequences were added to list, file was not in FASTA format 68 | if len(seqs) < 1: 69 | print('Provided file is not in FASTA format.') 70 | sys.exit(1) 71 | 
72 | num_seqs = len(seqs) 73 | print(f'Number of sequences in initial sequence set: {num_seqs}') 74 | print() 75 | 76 | #Calculating lengths of all sequences in alignment 77 | #Note: gaps are removed before calculating sequence lengths 78 | lengths = modules.calc_lengths(seqs) 79 | 80 | threshold = args.t 81 | if threshold < 0 or threshold > 1: 82 | print('Sequence filtering threshold must be between 0 and 1') 83 | sys.exit(1) 84 | 85 | #Calculating statistics for length filtering 86 | med_length = int(np.median(lengths)) 87 | lower_threshold = int(round(med_length - med_length * threshold)) 88 | upper_threshold = int(round(med_length + med_length * threshold)) 89 | 90 | print(f'Median sequence length: {med_length} residues') 91 | print(f'Sequence length lower boundary: {lower_threshold} residues') 92 | print(f'Sequence length upper boundary: {upper_threshold} residues') 93 | print() 94 | 95 | #Create FASTA of sequences that fall within threshold values 96 | #Gaps are remove gaps from sequences unless user indicates otherwise with -a flag 97 | out_fasta = list() 98 | if args.a: 99 | for i, (j, k, l) in enumerate(zip(lengths, ids, seqs)): 100 | if j >= lower_threshold and j <= upper_threshold: 101 | if i != num_seqs - 1: 102 | out_fasta.append(f'{k}\n{l}\n') 103 | else: 104 | out_fasta.append(f'{k}\n{l}') 105 | else: 106 | for i, (j, k, l) in enumerate(zip(lengths, ids, seqs)): 107 | if j >= lower_threshold and j <= upper_threshold: 108 | if i != num_seqs - 1: 109 | out_fasta.append(f"{k}\n{l.replace('-','')}\n") 110 | else: 111 | out_fasta.append(f"{k}\n{l.replace('-','')}") 112 | 113 | 114 | num_seqs_out = len(out_fasta) 115 | print(f'Number of sequences in final sequence set: {num_seqs_out}') 116 | print() 117 | 118 | #Setting path for file outputs 119 | #If the user does not supply a name for the output, use input file name 120 | #If user used a file from the Inputs directory, must remove 'Inputs/' from path 121 | out_file_name = args.o 122 | if out_file_name == 
'out.txt': 123 | out_file_prefix = os.path.splitext(args.i)[0] 124 | if 'Inputs' in out_file_prefix: 125 | out_file_prefix = out_file_prefix.split('/')[1] 126 | else: 127 | out_file_prefix = out_file_name.split('.')[0] 128 | 129 | #Save figure if -f flag is not given 130 | if not args.f: 131 | fig, ax = plt.subplots() 132 | ax.hist(lengths, bins='doane', color='b', edgecolor='k', alpha=0.65) 133 | ax.set_xlabel('Sequence length') 134 | ax.set_ylabel('Count') 135 | ax.axvline(med_length - threshold*med_length, color='k', linestyle='dashed') 136 | ax.axvline(med_length + threshold*med_length, color='k', linestyle='dashed') 137 | fig.savefig(f'{path}Outputs/{out_file_prefix}_sequence_length_hist.png', bbox_inches = 'tight', dpi = 300) 138 | plt.close() 139 | 140 | #Save length filtered alignment 141 | with open(f'{path}Outputs/{out_file_prefix}_lengthFiltered.txt', 'w') as f: 142 | f.writelines(out_fasta) 143 | 144 | #Write summary of analysis to file 145 | with open(f'{path}Outputs/{out_file_prefix}_lengthFiltered_output.txt', 'w', newline='') as f: 146 | f.write(f'Filtered sequence set: {args.i}\n\n') 147 | f.write('Parameters used:\n') 148 | f.write(f'Sequence length filtering threshold: {threshold}\n\n') 149 | f.write(f'Number of sequences in initial alignment: {num_seqs}\n\n') 150 | f.write(f'Median sequence length: {med_length} residues\n') 151 | f.write(f'Sequence length lower boundary: {lower_threshold} residues\n') 152 | f.write(f'Sequence length upper boundary: {upper_threshold} residues\n\n') 153 | f.write(f'Number of sequences in final alignment: {num_seqs_out}\n\n') 154 | f.write(f'Wrote length filtered sequence set to: {out_file_prefix}_lengthFiltered.txt') 155 | 156 | print(f'Wrote filtered alignment to: Outputs/{out_file_prefix}_lengthFiltered.txt') 157 | -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- 1 | from 
'''Helper functions for reading, cleaning and summarising protein FASTA data.

Used by length_filter.py and consensus.py through the ``modules`` package.
'''
import collections

import numpy as np

# Standard 20 amino acid alphabet; every other character is treated as a gap.
_AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')
_GAP = '-'


def clean_seq(seq, strip_gap_flag):
    ''' Upper-cases a sequence and replaces every character that is not in the
    standard 20 amino acid alphabet with a gap ('-').
    :Arguments:
        - seq = sequence to be cleaned
        - strip_gap_flag = if True, '-' and '.' characters are removed before
          cleaning (length_filter); if False they are kept and end up as '-'
          (consensus)
    :Returns:
        - new_seq = a string object of the cleaned-up sequence
    '''
    seq = seq.upper()
    if strip_gap_flag:
        seq = seq.replace('-', '').replace('.', '')
    return ''.join(aa if aa in _AMINO_ACIDS else _GAP for aa in seq)


def read_fasta(in_seq_set, strip_gap_flag):
    ''' Reads FASTA-formatted lines and returns the sequences and ids found.
    Every header line ('>' in the first column) yields exactly one sequence,
    so the two returned lists always have the same length and stay in step,
    even when a record has no residues. Text before the first header is
    ignored, so input without any header returns two empty lists.
    :Arguments:
        - in_seq_set = iterable of lines (e.g. an open FASTA file)
        - strip_gap_flag = flag to strip gaps (length_filter) or not (consensus)
    :Returns:
        - fasta_seqs = list of cleaned sequences in the input
        - fasta_ids = list of ids (header lines, including the leading '>')
    '''
    fasta_seqs = []
    fasta_ids = []
    # None until the first header is seen; afterwards the pieces of the
    # record currently being read.
    chunks = None
    for line in in_seq_set:
        if line.startswith('>'):
            if chunks is not None:
                fasta_seqs.append(clean_seq(''.join(chunks), strip_gap_flag))
            fasta_ids.append(line.rstrip())
            chunks = []
        elif chunks is not None:
            chunks.append(line.rstrip())
    if chunks is not None:
        fasta_seqs.append(clean_seq(''.join(chunks), strip_gap_flag))
    return (fasta_seqs, fasta_ids)


def calc_lengths(in_seqs):
    ''' Calculates the lengths of all sequences in a list.
    :Arguments:
        - in_seqs = alignment sequences
    :Returns:
        - lengths = list of sequence lengths
    '''
    return [len(seq) for seq in in_seqs]


def is_fasta_aligned(in_seqs):
    ''' Tests if sequences are aligned by checking that they all have the same
    length.
    :Arguments:
        - in_seqs = alignment sequences
    :Returns:
        - aligned_boolean = True if all sequences have the same length (an
          empty list counts as aligned), False otherwise
    '''
    return len(set(calc_lengths(in_seqs))) <= 1


def marginal_frequencies(in_seqs, res):
    ''' Determines the residue frequencies at each position in the alignment.
    Returns a L x q matrix where L is # of positions in the alignment (taken
    from the first sequence) and q is the number of characters in the alphabet.
    :Arguments:
        - in_seqs = alignment sequences
        - res = amino acid alphabet (sequence of single characters)
    :Returns:
        - matrix = numpy array of residue frequencies at all positions; has
          zero rows when in_seqs is empty
    :Raises:
        - ValueError if a sequence contains a character that is not in res
        - IndexError if a sequence is shorter than the first sequence
    '''
    num_seqs = len(in_seqs)
    if num_seqs == 0:
        return np.zeros((0, len(res)))

    # Map each character to its column once; the first occurrence wins, which
    # matches list.index semantics.
    column_of = {}
    for idx, aa in enumerate(res):
        column_of.setdefault(aa, idx)

    len_seqs = len(in_seqs[0])
    matrix = np.zeros((len_seqs, len(res)))
    for i in range(len_seqs):
        counts = collections.Counter(seq[i] for seq in in_seqs)
        for aa, count in counts.items():
            if aa not in column_of:
                raise ValueError(f'{aa!r} is not in the residue alphabet')
            matrix[i][column_of[aa]] = count / num_seqs
    return matrix


def seq_entropy(marginals):
    ''' Calculates the sequence entropy (a measure of conservation of the
    position) of all positions in the alignment.
    Note: a base 2 log is used.
    Note: the last column of each row (the gap) is not included in the sum.
    :Arguments:
        - marginals = matrix of residue frequencies, gap frequency last
    :Returns:
        - entropies = list of the sequence entropy at each position
    '''
    entropies = []
    for position in marginals:
        ent = 0.0
        for freq in position[:-1]:
            # Zero frequencies are skipped: log2(0) is undefined and the term
            # contributes nothing to the entropy.
            if freq != 0:
                ent -= freq * np.log2(freq)
        entropies.append(ent)
    return entropies


def make_fasta(in_ids, in_seqs):
    ''' Converts list of sequences and list of sequence IDs to a format for
    export as a FASTA file.
    :Arguments:
        - in_ids = output sequence ids
        - in_seqs = output sequences
    :Returns:
        - out_fasta = list of 'id\\nsequence\\n' strings, one per record; the
          final record has no trailing newline
    '''
    out_fasta = [f'{seq_id}\n{seq}\n' for seq_id, seq in zip(in_ids, in_seqs)]
    if out_fasta:
        # Drop the newline from whichever record was actually emitted last.
        out_fasta[-1] = out_fasta[-1][:-1]
    return out_fasta