├── .gitattributes ├── .gitignore ├── README.md ├── data ├── cleavage │ ├── neuropred_dataset │ │ ├── annotated_seqs.lf │ │ ├── extra_tracks │ │ │ ├── seqs.acc │ │ │ ├── seqs.disorder │ │ │ ├── seqs.pssm │ │ │ └── seqs.ss │ │ └── filtered_seqs.fasta │ └── uniprot_dataset │ │ ├── extra_tracks │ │ ├── seqs.acc │ │ ├── seqs.disorder │ │ ├── seqs.pssm │ │ └── seqs.ss │ │ ├── filtered_seqs.fasta │ │ └── raw_data.xml └── phosphoserine │ ├── annotated_seqs.lf │ └── annotated_seqs_demo.lf ├── py ├── asap │ ├── __init__.py │ ├── classification.py │ ├── config.py │ ├── data.py │ ├── features.py │ ├── features_deps │ │ ├── AAScales.py │ │ ├── AAlphabets.py │ │ ├── Disorder.py │ │ └── __init__.py │ ├── parse.py │ ├── sklearn_extensions.py │ ├── util.py │ └── window_extraction.py ├── cleavepred │ ├── __init__.py │ ├── api.py │ ├── check_top_features.py │ ├── common.py │ ├── extract_uniprot_annotated_seqs_from_xml.py │ ├── extract_windows.py │ ├── get_disopred.py │ ├── produce_auto_files.py │ ├── project_paths.py │ ├── test_classifier.py │ ├── train_classifier.py │ └── util.py └── deeppred │ ├── __init__.py │ ├── api.py │ ├── check_top_features.py │ ├── common.py │ ├── extract_uniprot_annotated_seqs_from_xml.py │ ├── extract_windows.py │ ├── get_disopred.py │ ├── produce_auto_files.py │ ├── project_paths.py │ ├── test_classifier.py │ ├── train_classifier.py │ └── util.py └── web └── cleavage ├── context_processors.py ├── manage.py ├── settings.py ├── templates ├── base.html ├── cleavage-prediction.html └── home.html ├── urls.py ├── views.py └── wsgi.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | 
*.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | # cleavepred auto-generated files 61 | data/cleavage/uniprot_dataset/annotated_seqs.lf 62 | data/cleavage/neuropred_dataset/window_simple_features.csv 63 | data/cleavage/neuropred_dataset/window_advanced_features.csv 64 | data/cleavage/uniprot_dataset/window_simple_features.csv 65 | data/cleavage/uniprot_dataset/window_advanced_features.csv 66 | data/cleavage/simple_peptide_predictor.pkl 67 | data/cleavage/advanced_peptide_predictor.pkl 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # asap 2 | Amino-acid Sequence Annotation Predictor 3 | 4 | Please take a look at our Wiki for a quick 
tutorial: https://github.com/ddofer/asap/wiki/Getting-Started:-A-Basic-Tutorial 5 | 6 | Feel free to add or improve! 7 | 8 | If you use us (or our code), please cite us: 9 | 10 | > Brandes, N., Ofer, D., & Linial, M. (2016). ASAP: a machine learning framework for local protein properties. Database : the journal of biological databases and curation, 2016, baw133. https://doi.org/10.1093/database/baw133 11 | -------------------------------------------------------------------------------- /data/cleavage/uniprot_dataset/extra_tracks/seqs.acc: -------------------------------------------------------------------------------- 1 | >P0DMI8 2 | eeee--eee-e-eeeeeeeeeeee-ee--e--ee--e--eee-eeeee-eeee 3 | >Q8AYR6 4 | eeeeeeee--e--ee--ee-ee--e-ee-ee--eeeeeeeee-eee-eeee-ee-ee-eeeeee---e--ee--eeeeeeeeeeeeee-ee-----e-ee--e-eeeee 5 | >A7WNV3 6 | eeeeeeeeeeeeeeeeee-e--ee--e------ee--ee---e--eee 7 | >B3VZU0 8 | eeeeeeeeeeeeee-eeeeee--ee--e---ee-ee------eee 9 | >E7EKE0 10 | eeeeeeeeeeee--eee-eee--ee-ee---e-eee--ee--ee-e-e-eeee 11 | >Q2UXW0 12 | eeeeeeeeeeeeeeeee-e-e-eee--ee------ee-ee---e-eeee 13 | >P0CAQ4 14 | eeeeeeeeeeeee-eeeeee--e-eee-e---ee-eee-e-ee 15 | >P05222 16 | eeeee-e---e---e--ee--e---e--ee-eeeeeeeeeee-eeeeee-eee--e-------e--ee--e-------e-eeeeeeeeeee-eeeeee-eeee-e-------e--ee--e---ee-eeeeeeeeeeeeee-eeeeee-e------eeeeeee 17 | >B6D434 18 | eeeee-e-ee--ee--e---eeeeee------e-e-eeeeeeeeeeeee-e-e-ee-e-eeeeeeeeee-e-eeeee-ee------eeeeee--e---eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-ee--ee-ee--ee-eee-ee--ee--ee-ee--eeeee 19 | >Q01301 20 | eeeeeeeee-eee--eeeee---e--e--e---e--eeeee-eee-eeeee--ee-ee--ee--e------ee-ee--ee-e-eeee-eeeeeeeeeeeeeeeeeeeeee--eee-ee--eeeeeeee----e-----eeeeeeeeeee--e-e-e---eeee-eee-eeeeeeeee--eeee------eee-e---e--ee-eeeee---eeee-ee-eee-eee-e-------eeee--eeeeeeee----eeee-ee-ee--e---e--ee--ee-eeee 21 | >P0C8W0 22 | eeeeeeee-eeeee-ee-e-ee--ee-eeee-eeeee-eee--eeeeee---eee-e-e-eeeeeee-e-eeeeee-ee 23 | >D6C4I0 24 | eeeeeee-ee-eee--eee--eee--ee-eee-eee-e--ee---e-e-eeeee-ee-eee 
25 | >C7DQB7 26 | ee--eee----ee----------e-ee--e-e-eeeeeeee 27 | >C7DQB9 28 | eee-eeee---e------ee-e---e-eeeee-eeee--ee-ee--eeeee-ee--ee 29 | >D6C4J2 30 | eeeee-eee-eeee-eeeee 31 | >D6C4I9 32 | ee-e-eeeee---e-eee-ee--ee--eeeee-eee-eeee-eeeee 33 | >Q9BPJ1 34 | eeeeeeeeeeee-ee-eee-eeeeee--ee-ee--ee---e-ee-ee 35 | >Q9BPI6 36 | eeeeeeeeee-ee-eeee-eeeeeee-ee-ee--ee------e--ee 37 | >P0C1N7 38 | eeeeeeeeeee--ee-eeeeeee-eeeee--e-e-eee-e----e 39 | >C1J5M7 40 | eeeeeeeeeeeee-eee-eeeeee--eee--ee--eeeeeeee--e--eee-e--eee 41 | >P0C7I1 42 | eeeeeeeeeeee-ee-eee-eeeeee--eee-ee--eeeee--eee-eee---ee 43 | >Q5K0B9 44 | eeeeee-ee-ee-eeeeeeeee-ee-eee----ee-e--------e---------e-e 45 | >Q3YEF1 46 | eeeeeeee---eeeee--e-eee-ee--ee-eeee--e-e---------e--------------eee 47 | >Q9XZK9 48 | eeeeee--ee---e--ee-eeeeeee-eee-eeeeee-eeeee--------e-e 49 | >P0C8V5 50 | eeeeee-ee--ee-eee-eeee-ee-eeeee--ee------eee----e---e------ee 51 | >P0CB09 52 | eeeeee--e--ee-eee-eeee-ee-eee--eeeee--e--eee--ee-----e-e 53 | >P58913 54 | eeeeee-ee--ee-eee-eeee-ee-eeee---ee------ee-----e---e---eee 55 | >P69762 56 | eeeeee-ee--ee-eee-eeeeeee-eee--e--ee-----ee--ee--ee-e-e 57 | >Q9UA72 58 | eeeeeeeeeeee-e-eeeeee--ee-eeeee---eeee--e-e-eeee-e 59 | >Q3YEF4 60 | eeeeeeeee--e--eeeeeeeeeeeee-ee---e-eee-e--e-e-ee--eeee 61 | >Q3YEF9 62 | eeeeeeeeeeeee-e--eeeee-ee--eee-ee-------e-e--eee-ee---eee 63 | >G1AS83 64 | eeeeeeeeee-ee-eeeeeeeeee-ee--eeeeee-ee--e----e-eeee-ee 65 | >P0CY65 66 | eeeeeeeeeeee-eeeee-eeeeeee-eeeeee---ee-e---e-eeeeee-ee 67 | >D2Y4A1 68 | eeeeeeee--eeeee-e-ee------ee----e-e 69 | >B2KPN7 70 | eeeeeee-eee-eeeeee-eeeeee-ee--eeee-e-eeeeeee---eee 71 | >Q17AN4 72 | ee-e-eee-eeeeeeeeeeeeeeee--ee-e-e-eeeeee-ee--ee--eee-e-e--e---ee-ee--ee--ee-ee-------e-e--ee-eeee-e-eeeeeeee--ee--eee 73 | >A0NDK8 74 | ee-e-eee-eeeeeee-eeeeeeeeeeee-ee-e-ee--ee--eeee-ee-eeeeee--ee--eee-e-e-----ee-eeeee--e--eee-ee-eee------------eeeee------e-eeeee-eee-e-eeee-ee-eeeeeeee 75 | >Q9BH75 76 | eeeeeee-e----e--ee-ee--e---e-ee--eee---eee 77 | >Q9BPD6 
78 | eeeeee-eee--eeeeeeee-eeeee--------e---e--eeeee 79 | >Q9BH86 80 | eeeeeee-e--e-ee--ee--ee-eee-e--ee-e--eee 81 | >Q3YEH6 82 | eeeeeee-e-ee-eee-ee--e--eeeee--eeeee-ee 83 | >Q1A3R0 84 | eeeeeee-e-ee-eee-ee--ee--eeee--eee----ee 85 | >P0C641 86 | eeeeeeeeeeee-ee-eee--eee-eee--eee------ee 87 | >Q3YEH2 88 | eeeeeee-eeee--ee-ee--e--eeeee--eee---eeee 89 | >P69766 90 | eeeeeeee-e----eee-ee--ee-eeee----eee---e 91 | >Q9BPH2 92 | eeeeee-ee-eeeeee-ee----eee----ee---eee 93 | >O17512 94 | eeeeee----eee-ee--ee--ee-ee-ee---e--ee--ee-eeeee 95 | >P0DJC4 96 | eeeeeee-eeee-eee-eee-e-ee-ee---ee----eee 97 | >B5A9S9 98 | eeeeeeeeeeee-eeeeeee-eee--e-e-----eee 99 | >A4H222 100 | eeeeeee-eeeeee-eee-eeee--ee--ee------eeeeeeeeeeeeee-e-ee-eee--------e---ee-e-eeeeeeeee-eee-eee-eeee-eeee 101 | >Q8N687 102 | eee-ee--eee-ee-eee--eeee-e---eee-e---e-eeeee-ee-eeee-ee-e---e-ee-ee--e----e-e-e---eeeeeeeeeeeeee-ee-eee--eeeeee-ee-eeee-e-ee-eeeeee--eee 103 | >Q9BYW3 104 | ee--ee--ee-e--eee-eeee-e-ee---e--eeee---e-eeeee------eee-eee---e-e--ee----ee---ee-eeeeeeeee 105 | >Q9BEE3 106 | ee--ee--ee-e--eee-eeee-e-ee------eeee---e-eeeee------eeeeeee-------------------e---eee---e--e--e-e-eeee 107 | >Q9H1M4 108 | eee--ee--e-e-ee--e-ee-ee---ee-e----e-ee-ee-ee-eeeeeee-ee---e-eeeee--ee-ee-eeeee 109 | >P0C8A2 110 | eeeeee-ee--eeeee-eeeeee-------e--eeeeeeeeeee-----e-e-ee-eeeeee-eeee-e-e---eeee 111 | >P0C8A1 112 | eeeeeeeeeee-eeee-ee-ee---------eeeeeeeeeee-ee---e-e---e--ee-eee-eeeeee-e---eeee 113 | >Q17UZ0 114 | eeeeeeeeeeeeeeeeeeeeeee--e---e-ee--ee--ee-ee-eeeeeeeeeeeee 115 | >O93222 116 | eeeeeeeeeeeeeeeeeeeeeee--------------e---e--e-eeeee 117 | >O93224 118 | eeeeeeeeeeeeeeeeeeeeee---e--e---e---e---e--ee-eeee 119 | >O93454 120 | eeeeeeeeeeeeeeeeeeeeeeee---e--ee------e-----eeeee 121 | >O93453 122 | eeeeeeeeeeeeeeeeeeeeeee--ee-ee-eee-eeeeeeeeeee--ee--eeeeee 123 | >Q6XMH8 124 | eeeeeeeeeeeeeeeeee-e-eee-ee-eee-ee---eeeeeeee 125 | >Q19165 126 | 
eeeee-eee-ee-ee---e--eeeeeee-e-ee-ee---e--ee--eeeeee--eeee-e-ee-ee---e--eeeeeeeee-e-ee-e-------eeeeeeeeeeeeeeeeee-e--e------e--eeeeeeeee-e-eee-----e-ee 127 | >Q9XVX1 128 | eeeeeeeee-e-e--eeeee-e-e--eeeee--e--eeeeee-ee-eee-eee-e-eee-e--ee-eee 129 | >O44185 130 | eeeee-e---ee-eeeee--e--eeeeeeeeeeeeeeee-----ee-eeeee-----eeeeeee-----ee--eeee-----e-ee-ee-----ee-eeeee-----e-eeeee-----eeee---------eeee--e-eee 131 | >O17058 132 | ee-eee-ee-eee---e--e-ee-eeee-eeeee---e--eeee 133 | >P80398 134 | eeeeeeeeeeeee-eeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeee-ee-eeeeee 135 | >P85070 136 | eeeeeeeeeeeeeee-ee-ee-ee-------eeeee-----eeee 137 | >P61516 138 | eee-ee-eeeeeeeeee-eeeee-eeeeeeeee-----eeeeeee-e--e---e--eeee-e---ee 139 | >Q801Y3 140 | eeeee-eeeeeee-eeeeeeeeeeeeee-e-eee-e-eeee--e--e---e--eeee-----ee 141 | >A1Z0M0 142 | eeeeeeee-eee----eee-e-ee-e-e---eeeeeeeeee-e---e--eeeee-----ee 143 | >Q9VT52 144 | eee---ee-ee--ee---e---ee-eee-eeeeeee-ee-ee-e---e--eee-e---e--ee-e---ee--ee--e-ee--e--eeeeee 145 | >Q7KUD5 146 | eee-ee--ee--e--e---ee-------ee------e-ee--ee-eeeeeee-ee-ee-eee-ee--ee--eee-e-ee-eee-ee 147 | >Q9W4Z4 148 | eeeeeee-eeee---eee-ee--ee---e-e-e-ee--eeeeeeeeeee-ee-ee---eeee-e-ee--ee-ee 149 | >K7ZGS2 150 | eeeeeeeeeeeeeeee-e-e--ee--e---ee------eee 151 | >E4Z7G0 152 | eeeeeeeeeeeeeeeeeeeeeeeeeeeee--ee--ee-e-eee 153 | >C5J893 154 | eeeeeeee-e-eee-ee--e---eeeee-ee-eee-ee 155 | >Q16998 156 | eeeeeeee-ee-eeeee--e-ee-eeeee-ee-ee-ee-ee-ee-ee-ee-ee-eeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-eeeeeee-ee-ee-ee-ee-ee-ee-ee-ee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee 157 | >Q58T45 158 | eeeeeeeee-eeee-eeee--ee-ee-ee---e--ee--ee--ee--e--eeeeeeeeeee--ee-ee--ee-ee-e-eee-eeeeeee-eeee--e--eee-ee----------e--ee--eeee 159 | >C5J897 160 | eee---e--ee--ee--eeeeeeeeeeeeeee-ee--ee-eee-eee-ee--eeee 161 | >Q718F4 162 | ee----ee---------eeeeeee-eee-eeeee-eeeeee-ee--ee-eee 163 | >P0DJO3 164 | eee----ee-ee---ee--ee-ee-ee-e-ee--eee-ee---e--ee--e 165 | >P0DJ02 166 | e-e---ee-ee--eeee-ee-ee-eee-eee-eee--e--ee-ee 167 | 
>L0GCV8 168 | eee-e--ee-ee--eeee-ee-ee--e--eee-ee-ee--eee-ee-eee--ee-e 169 | >I0DEB3 170 | eee---e--ee--eeee-ee-ee-eee-eee-eee--e--ee-ee 171 | >P0C8W1 172 | e-e---e--ee--eeee-ee-e-ee-ee--eeee-eee--e--ee-ee 173 | >P0DJ03 174 | e-e---e--ee--eeee-ee-e-ee-ee--eee-eee--e--ee-ee 175 | >F1CJ89 176 | eeee--ee--eeee-ee-eeee 177 | >Q9U662 178 | eeeeeee--e---e-eee-eeeeeee-eee--ee--e-eeeee--ee----e-e 179 | >Q9U657 180 | eeeee---e--ee-eee-eeeeeee-eeeee-eee-e--e-e----e---e---------e 181 | >Q9BP77 182 | eeeeeeee-eeeee-eeeee-e--ee-eee-e--eeeeee----e--eeeee------eeee 183 | >Q9BPC6 184 | eeeeeeeee-eee-e---eeeee-ee--e-e-ee--e--eeeee--ee--e-e-eee 185 | >Q9BPC3 186 | eeeeeeeeeeeee-e--eeee-eeeeee-e---------eeeeee---e---e---ee 187 | >Q9BP62 188 | eeeeeeeeeeeee--e-eeeee-eee--eee-ee--eeeeee---e-eeee-e-----ee 189 | >Q9BP65 190 | eeeeeeeeeeee-e-eeeee-eee-eee-eee-eeeeee--eeeeeee-eee-ee 191 | >B6DT16 192 | ee------e--eeee-ee-eeee-ee-eeeeee-ee-eeee-ee-eee--ee-eee--e--eeeeeeee 193 | >P0DKU2 194 | e---e--e---eeee-ee-ee--eee--e--ee-ee-ee-e-eeee-e-e-ee-------e--ee--e--ee-eeeeeeeeeee--e----------e-e--e-e-eee-eee-ee-eee-ee-eee-ee 195 | >Q16N80 196 | eeeeeeeeee-ee--eeeee-ee-eee--------e--ee---ee--ee--ee-eee-----eee-ee--ee--ee----------e-eeeee------e-ee-eeeeeeeeeeeeee-e--ee-eee-e--ee-eee-e-e-eee-ee 197 | >Q0VZ39 198 | eeeeeeeeeeeeeeeeeeeeeeee--e--ee--ee--e--eeee 199 | >P85882 200 | eeeeeeeeeeeeeeeeeeeeeeee--e--ee--e--ee--eeee 201 | >A8B5P7 202 | eeeeeeeeeeeee-eeee-ee---e--eeee-ee--ee---e-eeee 203 | >P31394 204 | eeeeee----eeee-ee--eeee-ee--ee-eeee-eee-eeee-e-e---e--eeeee-ee---e-eeeee-eeeee--e-eee-eeeee-eeeeee-e-e-eee-eeee-e-e-e-ee--eeeee-----e-eeeeee-e--ee-e--eeeee-eeee--e--e-e-eee-e-ee-e---e-e-eeeee-----e-ee-ee-e---------eeeee-------ee-------e--eeeee---------eeee-ee-ee-e-eee-e-ee--eee----------eee-eeee--------ee---ee--eee-e-e-----eeeeeeeeeeeeeee-----e----eeee-eee-eee--e------eeeee----e---------e-ee----------e-ee-eee---------e--e--eeeeeeeeeeeeeeee 205 | >D3UA80 206 | 
eee-e-eeee-eee--ee-ee--ee--ee--ee-e-e-eee 207 | >P83719 208 | eeeeeeeeeeeee--eee-ee---e---eeeeeeee-eee 209 | >P08950 210 | e-ee--ee-ee-eeeee-eee-ee----e---eee-eeeeee-e---ee--eeee 211 | >Q99109 212 | eee-e-ee-eeeee-ee--eee-ee-ee-e---e--eeee-ee-ee--ee-eee-e----eeee--e-ee-e-e----eeeeeeeee-ee--e---e--eeeeee-------eeee-e--eee-e-eeeeee-ee-e-eeeeeeeeeeeeeee-e--e--ee-ee--ee-ee------e-e-eeeeee-----e---e--e--eeeeeee----eeeee-e-eeee------e------e-eeeeee----e-eee-e-eeee--ee-ee---e----ee-e-e-ee-eee-ee-ee-eee-e-e--eee---e-ee-eee-ee-eee-ee-e--ee-ee-eeeeeeee-e-eeeee--ee-eee-ee-eeeeeeee-eeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeee-eeee-----e--e-eeeee--e-eeeeee---e----e-e-e 213 | >A6MWS8 214 | ee-----eeeeeeeeeeeeeee--ee-e-ee----ee-eeee-eeeeeeeeee 215 | >Q9SE35 216 | ee-ee--eee-eeeeeeeeee-eeeeeeeeeeee-ee-eeeeee-eeee-ee-e-eeee-ee-e-eeee-ee-eeeeeeeeeee-e--eeeeeee----eeeeeee-ee-eeeee--ee--eeeeeee-ee-eeeeeeeee--eeeeeee-eeeeeeeee--eeeeeee-eeeeeeeeee-eeeeeee-eeeeeeeee--eeeeeee-eeeeeeeeee-eeeeeee-eeeeeeeee--eee-eeee 217 | >Q9YGH3 218 | eee-eeeeeeeeeeeee-ee-ee---e------e--ee-e-eee--e-e-eeeee-eee-eeee--ee-ee-eeeeeeee-ee---ee-eee 219 | >Q9YGH4 220 | eeeeee-e-e-e-ee--ee-ee-ee--ee--ee--ee--ee-e-ee-e-eee-ee--ee-ee-e-e-eee-eeeee-eeeeeeee-ee---ee-eee 221 | >Q91194 222 | e--eeeeeeee-e---ee--ee-e---e-eee--e---ee--ee-eeeeeeeeeee-eeeee---e--e--eeeee-eeeeeeee-ee---ee-eee 223 | >A7WNV7 224 | eeeeeeeeeeeeeeeee-e-e--ee--e---e--ee-eee 225 | >P79875 226 | eeeeeeeeeeeee-eeeeee-eee--e---e--e--eee 227 | >P00735 228 | 
ee--eeeee-ee--e--ee--ee--ee-eeee--ee--eee-eeee-ee--eeeeeeeeeeeeee--ee-eeeee-eee-ee-e---ee-e--e-eeee---e-e--e-eeee--e-e-eeeeeee-e-ee------ee-ee------eeeeeee-----e--eeeee--eeeee-eee---e--eeeee-eee-ee-e-ee-e-eeeee--e-e-ee-ee--e--eeeee-eeeee-----ee-ee-------ee-------e-e-eeeeee-e-ee-eee--eeeeeeeeeeeeee--e--eee---eeeee--------eeeee-e--e--eee-eeee--eeee-ee----------eeeeee---------ee-----------eeee-e-e-ee----------eee-eee-ee-e-ee--e-ee--eee-----------eee-eeeee-------eee--ee--eee----------eeeeeeeeeee-e-e----------eeee-ee--eee--e-------eeeeeee----e-----------eeee---------------eeee--------e--e--ee--eeeee 229 | >A5A3H1 230 | eeee-eee-eee-ee-ee-eeeee-eeeee-eeee-eeeeeeeeeeeee-e 231 | >D5KXG5 232 | eeeeeeeee--ee-ee--e--eeee-e-eeeeeeeee-------ee---e-eeeeeee 233 | >E2S064 234 | eee-ee-ee-eeee-eeeeeee-e-eeee-eee--ee-eeeeeee-eee-eee 235 | >P07198 236 | eeeeeeeeeeee-ee--e---ee-eee-ee--e-e-ee--eeeeee--ee-eeeeeeeeee 237 | >Q8AYR5 238 | eeeeeeeeee--ee---ee--------eeeee--e--eeeeee-eeee-ee-ee-eeee--e---e--ee-ee-eeeeee---e-e-----e-ee----eeeee 239 | >Q805D4 240 | eeeee-e-eeeeeee-ee-ee--eee-ee-eeeeeeeeee-eeeeee-eee-eeeeeeeeeeeeeeeeeeeeeee--eeee-eeee-e-eeee---e--ee--ee-ee-eeeeeeee-------e-ee--e-eeeee 241 | >Q805D3 242 | eeeeeeeee-eee-e-ee--e--eeeeeeeeeeeeee--e--ee-eee-e------e--eeeeeeeeeee-eeeeeeeee-------ee---e--eee 243 | >Q5SC60 244 | eeeee-e-ee-eeeeeeeeee-ee-e-eeee-e-eeeeeee-----eeee-e--------ee-------ee-e---------ee-ee-ee------eeeeeee-ee-eee-e-ee-ee-e-ee-e---ee-eee-eeee-----e-eeeeeeeeeeee---------ee----ee-e 245 | >P80111 246 | eeeee-eeeeeeeeeeee-ee-eeee--ee-eee-eee--ee--ee--ee-ee-eee-ee-eeee-----e--eeeee-eee 247 | >Q0MWV8 248 | eeeee-eee-eeee-e--eeeeeeeeee-eeee-eeeeee-eeeeee-ee-eee-ee--eee 249 | >P0C7P5 250 | eeeeeeeeeee-eeee-e-ee--eeeeeeee-ee-e-ee--eeeeeee-eeee-eeee-eee--e---ee-eee-eeee-e--eee-eeeee--e-----e--ee-eee-ee--ee-ee---------e----------e--ee--ee---e-----e-ee--eeeeeee 251 | >P68515 252 | 
eeeee-eeee-eeeeee-ee-e-ee--eeeeeeeeee-ee-e-ee-eee-eeeeeee-e--e--eeeeeeeeeeee-e-ee--eeeeeeee-eeee-ee--eeeeeee--eeee-eee--e---ee-eee-e--e-e----eeeeeeeee-e--eeeeeeee--e------e--ee-eee-ee--ee-ee----e------e----------e-eee-eeee-e-----e-ee--e-eeeee 253 | >B0VXV8 254 | eeeee-eeeeeeee-ee-e--e--eeeeeee-ee-eeee-eeeeeeeeee-eee------ee-eee-e--eeee--eeee-eeeee-ee-eee-eeee---------e--ee--ee-ee--ee-e-----e-----------------e-eee-eeee-e-----e-ee--eee-eee 255 | >P0CB12 256 | eeeeee-eeeeeeee-ee--e-eeeeee--e-ee-eeeeeee-e 257 | >E3PQQ8 258 | eeeeee--ee-e-eeeeeee-eeee-eee------e----eeeeeeee-eeeeee--e--eeee--ee----eeeee-eee-ee-eeeeeeeeeee-ee--e--ee--eeeeee-ee--eeeeeee-eeeeeeeeee--ee---------eeeee----e--ee-ee--eeeeeeee--ee-eeee 259 | >P50145 260 | eeeeee-eeee-e-eeeeeeeeeeeeeee-ee-eeeee-eeeeeeeeee--e---e--ee-eeeeee-e--e-ee-ee-e-eee-ee----------ee--ee-e-ee 261 | >E2E4E4 262 | eee-e-eeeee--ee-eeeeeeeeeeeee-ee-ee-eeeeeeeee--e---e--ee-ee-eeeee-eee-ee----------ee--ee-e-ee 263 | >D6C4K5 264 | eeeeeeeeeeeee-eee-eeeeeee-eee-eee-e---e----e----------e-ee 265 | >C7DQC2 266 | eeeeee--e-e-e----e--e-e-eee-e-eeeeee-ee--eee 267 | >C7DQX6 268 | eeeee-e-eeeee--ee----e--eee-eeeeeeee-ee--eeee 269 | >C7DQB8 270 | ee--eeee---ee-e--ee--------ee--eeeeeeeeeee 271 | >Q9BPJ8 272 | eeeeeeeeeeeeeee-eee--eeeeee-ee-ee--e-e-e-e--e-ee 273 | >Q9BPH5 274 | eee-ee-eeeeee-eeeee-ee--e--ee---------eee----e 275 | >Q5EHP2 276 | eee-ee-eeeeee-eeeeee----e-eeee--e-e--ee--e--ee 277 | >P60207 278 | eeeeeeee--eeeee-eee--eeeee--eee 279 | >D5L5Q7 280 | eeeeeee-ee-ee-eeeee--e-e-eeeeee-ee--e---eee 281 | >Q3YEG3 282 | eeeeeeee-eeeeeeeeeee-e--e--ee-eeeeee-ee-eee--e--e-eeeeee--ee 283 | >Q9U654 284 | eeeee-ee--ee-eee-eeee-ee-eee--eeeee-ee-eee--e-e----e-e 285 | >Q9UA95 286 | eeeeeeeeeeee-eeeeeeee-eee-eeeeee-eee-e---e--ee-eee-eeeeee 287 | >G1AS75 288 | eeeeeeeeee-e--eeeeeeeeee--e-e------e-eee-e--eee--ee--eeee 289 | >G1AS80 290 | eeeeeeeeeeeee-e--eeeeee-ee--eee--e--ee-eeeee--eee-eeee-eee 291 | >P10000 292 | 
e-eeeee-ee-eeeee--e--e--eee-eee-ee-eeeeeeeeeeeeeee------eeeeeeeeeeeeeeeeeeee--------eeee-eeeee-e---ee-eeeeeeeee-e-e-eee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-e-e----ee-eeeee-e---ee-eeeeeee--e--ee--eeeee 293 | >Q9XYR5 294 | ee-ee--ee--eee-e-e-------eeeeeeeeeeeeeeee---e--eeeeeee 295 | >Q9NDA7 296 | eeeeeeeeee-eeeeeeee--ee--e--eeee-eeeee-e 297 | >P0C907 298 | eeeeeee-e--e-eee--ee--eeee--ee-ee--e 299 | >P0C906 300 | eeeeeee-e-ee-eee-e-e-ee--eee--ee-----eeeee 301 | >Q3YEH1 302 | eeeeeeeeee-e-ee-eee-ee--eee-e-ee--eee---e--eeeee 303 | >P69767 304 | eeeeee-e-ee-eeeeee--ee--ee-e--eeeee--e 305 | >C0KYC3 306 | eeeeeee-e--e-eeeeeeeeeee--e-------e-e 307 | >Q1A3Q5 308 | eeee-e--eeeeeeeeee-e---e--eeeee--eeee-e--ee 309 | >Q9BPH1 310 | eeeeeee-eeee-eeeeeeeeeeeeee----ee-ee 311 | >D2Y493 312 | eeeeeeeeeeeeeeee-e--e-eeee-------e-e-ee-eeeee---ee--eeeee 313 | >D6C4L9 314 | eeeeeeeeeeeeee---------eeee-eeeee--eeee-e----ee-ee-ee-e 315 | >B3FIA6 316 | eeeeeeee--ee--ee-eeee-e-ee--ee-eeeee--eee-e-eeee-e--eeee 317 | >O77256 318 | eeeeeeeeeeeee-eee-eeeeeee-eeeeeeee 319 | >P0C8A9 320 | eeeeeeee-eee-eeeeeeeeee--ee-e--e-eeee--eeeee------ee-eeeeee-ee---e-eee--------ee--eeeeeee 321 | >O93456 322 | eeeeeeeeeeeeeeeeeeee-ee-ee-----eeeeeee-eeeeeeeeeeeeeeeeeeee-ee---e-eeeeeee-eeeeeeeeeeeeeeeeeeee-ee---e-eeeeeee-eeeeeeeeeeeeeeeeeeee-ee---e-eeee-ee-eeeeeeeeeeeeeeee--e------eee-eee 323 | >B3IUE0 324 | ee--e--ee-eeee------ee-e-e-----------e-eeeeee--e-eeeeeeeeeeee-ee---ee--e--eee-e-ee--ee-ee-eee--e--ee--ee-eeeeee-ee---e-ee 325 | >O93451 326 | eeeeeeeeeeeeeeeeeeeeeee-ee--ee-ee---e--e---ee-ee--eeeee 327 | >O93223 328 | eeeeeeeeeeeeeeeeeeee-eee--ee-ee--ee--ee--ee-ee--eeeee 329 | >P81490 330 | eeeeeeeeee-eeeeeeeeee-ee--ee--ee--ee--ee-ee--eeeee 331 | >Q9XWV7 332 | eeeeeeee-e-e-eeeeeeee--e---eeeeeeee---e--e--e--ee--e---e--ee--eeee---eee-eeeeee-eeeeee--e-eeeeee--e--eeeee--e-eeeeee--e-eee 333 | >Q9N4V0 334 | 
eeee--e-eeee-eee--e--e--eeeeee------ee-e--------eeee--eeeeeeeeeee-----e--e-e-eeeeee--------eee-eeeeee--------eeeeeeeeee--------e-eeeeeeeeeeee--------eee--------e-ee-eee--e--eeee-e---e-eee 335 | >Q8MPY9 336 | ee---e----eee-eee-eeee--eeee-eee-e-ee---e--eeeeeeee-e--ee----eeee 337 | >A8WU84 338 | eeeeeeee---eeee-ee--e---e-eeee-e-e-e--eeeeeeee----------ee-e-e--ee 339 | >Q90W78 340 | eeeeeeeeeeeeeeeeeeeeee-eee-ee-eeeee-ee--eee-eeeeee 341 | >Q9VT50 342 | eee-e-e-ee--ee-e-e---ee-ee--e-----------eeeeeeeeeeeeeee-eee-eee--e-ee-e-ee--------e-e---------ee-eeeee---ee--ee--eeee--e--eee 343 | >Q9VT51 344 | ee--ee-ee--ee--ee--e--eeeee-e--eee-ee-e-----eeeeeeeee-ee--e----e-e-eee--e------eeeeeeee--ee--eee-e-ee-ee--ee-ee 345 | >Q9VT53 346 | eee---e--ee--e-----e-eee-eeeeeeeeee-ee--e---e--ee-e-e-eeee-eeeeeeeeeeeeeeeeeee-ee-ee--ee--ee--eee-e-ee--ee-e 347 | >G3ETQ2 348 | eeeeeeeeeeee--eee-eeeee---e--ee--ee-e-e-eeee 349 | >P0DJ35 350 | eeee-eeeeeee-eeeee---e---eeeeeeee-eee-ee 351 | >P08947 352 | eeeeee-ee-eeee--ee---e----eeeee-eeeeeeeeee-e-----eeee-e-eeee 353 | >Q17093 354 | eeeeee---ee-eeeee----eeee-----ee-eeee-----eeee-----ee-e-ee-----eeee-----eeee-ee-----eeeeeeeeeee-----ee-e-ee-eeeee-eeeee-eeeeee 355 | >Q25060 356 | eeeeeeeeeeeeee-eeeeeeeee-eee-ee-eeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeeee-ee-eeee-eeeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeee-eee--eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeee-eeeeeeeeeeeeee-eeeeeee--eeeeeeee-eeeeeee-eeeeeeeee-eeeeeeeeeeee-eeee-eeeeeeeee-ee--eee--eeeeeeee-eeeeeeeeeeee 357 | >Q09180 358 | eeeee--e--ee-ee--e-----ee-eeeeeee-eeeeeeeeeeee--e--e---e--e-eeeeeee-eeeeeeeeeeee--e--e---e--e-e-eeeee-eeeeeeeeeeee--e--e------e-eeeeeee-eee-eeeeeeeeeeee--e---------eeeee-ee-e-eee-ee 359 | >C9X4J0 360 | ee-ee--ee--eee--ee-eee-ee--ee--eee-eeeee-eee-eeeeeeeeee 361 | >R4JNJ5 362 | ee-ee--ee-ee---eee-eeeee-ee--eee-eee--e--ee--e 363 | >I0DEB5 364 | 
eeee--ee--e--eeee-ee-ee-ee--eee-eee--e--ee-ee 365 | >D9U2B8 366 | ee---e------e--eeeeeeeeeee-eeeeee--eee-e-ee--ee-eee 367 | >G1FE62 368 | ee----ee--e--ee--eeeeeeee-eee-e--ee-eeeeee-ee--ee-eee 369 | >P0CI89 370 | eee---e------e--eeeee-e-eee-eeeeee-eeeeee-ee--ee-eee 371 | >B8XH50 372 | ee-e---e---ee--ee-e--e-----e-eeee-ee-ee--eee 373 | >L0GCJ6 374 | eeeee-e-eeeeeee--eeeeeeeee-e-eeeeeeeeeee--eeee-eee-------eeeeeeee 375 | >P43443 376 | ee-eee-ee-ee-eee------e--eeee-eee-eeeeee-eeee-eee--e--e-e-eeee-ee-eeeeeee-ee--ee--ee--eeeee 377 | >Q9BPA3 378 | eeeee--e---ee-eee-eee-eee-ee-eee------eeeee---------e-e 379 | >Q9BP87 380 | eeeeeeee-e--e-ee--ee-e-eee-ee-eee-e-e---ee-e-ee-e-ee-eee 381 | >Q9BPA0 382 | eee-eeeeeee--ee--ee-eeeeeeee-eeeeeee-eeeee---e---e--eee---e-e 383 | >Q9BP90 384 | eeeee-ee--eee-ee---e-eeeeee--ee------eeee--ee-------ee 385 | >Q9BPA7 386 | eeeeee-eee-ee-eeeeeeeeeeeeee-eee-ee-eeeee--e---eee-e 387 | >Q9BP70 388 | eeeeeeee-eee-e-e-e-eee-eeee-eee-ee--eeeeee--eeeeeee---e--ee 389 | >Q7PTL2 390 | ee-eeeeee-ee-eeeeeee-ee-e-eeeeeeeeeee-ee-eee--------e--ee---ee--ee--ee-eee-----eee-ee--ee--ee----------eeeeeee-----e-ee-eeeeeeeeeeee-e--ee-eee-e--ee-ee--eee-e---e-ee 391 | >P85799 392 | eeee-eee-ee-eee-eeeeee-ee-----eeeeeeeeeeeeee--ee-ee----e-e-ee-ee--eeeeeeeeeee-ee-----ee-e-ee--e-eee--e--e--e------eeeeeeeeeee--eeeeeee-e-eeeee-eeeeeeee-eeeeeee-ee-eeee-eee-e-eee---------eeeeeeee-ee--eee-ee-eeeee--eeeeeee-----e-ee--ee--e-ee--ee------e--ee--ee-eee-eee-eeee-eeee-e-e---e---eeeee-----eeeeee-ee-eee-----eeeeee-e-e---ee 393 | >P86040 394 | eeeeeeeeeeeeeeeeeee-eeee-ee-ee--ee-ee-ee-e-e------ee--eeee 395 | >B3VZU5 396 | eeeeeeeeeeeeeee-eee-ee---e--e---e--ee 397 | >Q99N14 398 | eeeeeeee---eeeeee------ee-e-ee-e-e-eeeeeee-ee------ee----e--e---e---ee---ee-e-eee-eeeeeeee-eeeee-eeeeeeeeeeeeeee 399 | >M9P2C1 400 | ee-e-eee-e---ee-eee-ee-ee--ee--eeee-ee------eeeeeee---eeeeeeee--ee-eeeee 401 | >A5A3H2 402 | eeeeee-eeeee-eee-eeeeee-eeeee-eeeee-eeee-eeeeeeeeeeeee-e 403 | >P0DKP4 404 | 
eeee-ee-ee-e-ee--e--eee--e--eeee-ee-ee-ee-e-ee--e--eeeeee-eeeee-ee-ee-ee-e-ee--e--eeeeeee-eeee-------eeeeee 405 | >A5LHG2 406 | eeeeeeeeeeeeeeeeee--e--e-eeee-ee--------eeeeeeeeeeeeeeeeee--eeeeeeeeee-eeeeeeeeeeeeee-eeee 407 | >Q868F8 408 | eeeeeeeee-ee-----eeee-e-e-e-e----e--ee---e-e-eee-ee-eee-ee-eeee---e--eee--ee-eeeeee-e-ee-e-ee-e-eee 409 | >P20968 410 | eeeee-ee-ee--eee-eee-eeee-eeee-eee--ee-eee-e-eeeeeeeeeeeeeeee--ee--ee--ee-ee-eeeeeeee-------e-ee--e-eeeee 411 | >Q805D8 412 | ee--e--eeee-ee-ee--ee-eee-eeeeeeeeeeee-eeeeeeeeeeeeee-eeeeeeeeee-eeeeeeeeeeeee-eee-ee--e-eeeee-e---e--ee--ee-----eeeee 413 | >Q25461 414 | eeeeeeeeeee-eeeeeeeee--e--eeee---e---eeee-ee-eeeeee--e-ee-eeeeeeeeeeeee--eeeee-eeeeeeee---e--ee--ee-e--eee-eeee---e--eeeeeeeeee---e--ee--ee-eee--ee-ee------e--ee-ee--ee-eeeee 415 | >P01021 416 | eeeee--eeeeeeee-eeeeeeee-ee-ee-ee-ee-e-eee-ee-eeeee-ee-eeeee-eeeeeeee-ee-eeeee-eee-eeee-ee-ee-eeeeeeee--e--eeee-eee------ee--ee-e--------e-e-eeeeee-ee--ee-eeeee-e----------e--eee-ee--ee-ee----e--------------e--e-eee-eeee-e-----e-ee--eee-eee 417 | >P21591 418 | ee--ee-eeeee-eeeeeeee---------eee-eeee-ee-ee--ee--ee--e--eeeeeeeeeeee-ee-----ee--ee--eeeee 419 | >A0AEI5 420 | eeeeeeeeeeeee--ee-ee---ee--ee--ee--ee--ee-e-e-eeee 421 | >P50983 422 | eeeee--eeee---e-e 423 | >P05226 424 | eeeeee-e---e---e--ee--e------ee-ee-ee-eeee---eeeee--------eeeeeee-eee--e-------e--ee--e-------e-eeeeeeeeeee--eeeee---------eeee-e--------eeeeeee-eee--e-------e--ee--e---ee-eeeeeeeeeeeeee-eeeeee-e------eeeeeee 425 | >U5KJZ2 426 | eee-e-ee--ee--ee--eeeeee------e-eeeeeeeeeeeeeee-e-e-ee-e-eeeeeeeeee-e-eeeee-ee-e-e-eeeeeee--e---eeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeee-ee-eee-ee-ee--ee-eee-eee-ee-eee-e-----eee 427 | >Q8JHB9 428 | eee-eeeeeee-eee-eeeeeeeeeeee-eeeeee-eeeeeeeee-ee-ee---e--ee-ee-eeee----e-ee-e-e-e-ee-e--------ee--ee-e-ee 429 | >D6C4H9 430 | eeeeeee--ee-e-ee-eee-ee-eee-ee--ee-eeeee--eee--ee---ee-ee-e 431 | >C7DQX9 432 | e-eeee---e---e--eee--eee-ee--eeeeeeeeee---e--eee 433 | >P16240 434 | 
eeee--ee-eeee-eeeee-eeee-eee-e--e---e--ee-eee--ee-eeeeeeeee-e--e---e-ee-eeeeeee--ee--e------e--ee--ee-eeee 435 | >Q9BPJ4 436 | eeeeeeeeee-ee-eee-eeeeee--ee-eeeeeee--e-ee-e-eeeeeee---ee---ee-ee-ee 437 | >Q5EHP4 438 | eee-ee-eeeeee-eeeee-e---e--eeee----ee-eee-e--e 439 | >Q9BPH3 440 | eee-ee--eeeee-eeeeeee---e--ee--ee-eeee-e-ee 441 | >Q3YEG4 442 | ee-eeeeee-e---eeeee-e-eee-eeeee--eeeee--ee--ee-eeeee 443 | >P0C833 444 | eeeeeeeeeeee-eeeeee-e--ee-eeeeee-eee-e--ee--ee-ee---e------eeeee 445 | >P0CH24 446 | eeeeeeeeeeeeeeeeeee-ee-ee-ee--ee-ee----eeeeeeeeee 447 | >Q2I2P1 448 | ee-eee-ee---e--eee--ee--eeeeee-e-eeee-e-e----e--eeeeeeeee-eeeee 449 | >Q9NDA6 450 | eeeeeeeeee-eeeeeeeeeeee--e--eeee----ee-e 451 | >Q9GU58 452 | ee-eee-eeee-eeeeee-eee-ee------eee-eee-eeeee-eee-----e--eeeee 453 | >Q800R2 454 | eeeeeeeeeeeeeeeeeeeeeeeeeee---e---e--ee--ee--e---eeee 455 | >Q9BPE7 456 | eeeeeee-e-ee-eee-ee--ee--eeee--eeee---eee 457 | >P0C667 458 | eeeeeee--ee-eee-ee--ee--e-ee--eeeee--e--eeee 459 | >P0C8A4 460 | ee-eee-eeeeee-e-eeeeee-e---eeee-ee-e-e--e-eeee-eeee---ee--e-eeeeeee-eeeee---ee 461 | >P0C8A5 462 | eeeee--e-eee-eee-e-eee-ee--eeeee---ee---eee 463 | >B3IUD8 464 | ee--e--ee-eeee------ee-e-e-----------e-eeeeeeeeeeeeee-ee---e---e---ee-eeeeeee-eeee-e--ee--ee-eee--e--eeee-eeee 465 | >A7WNV5 466 | eeeeeeeeeeeeeee--ee------ee--e---e---ee-ee-e-e----e-eeee 467 | >C6EVG1 468 | eeeeee-eeeeeeeeee--ee-eeeeeeeeeeeeeeeeeeeeeeeeee-eeee-eeeeeeeeee 469 | >Q23212 470 | eeeee-e---e--e----eeee-eeeee-------eeeeeeeeeeeeeeeee----------eee----------eeeeee--eeeeeeee-eeee--------e-eeeeee-------eeeee---------e--eeeee-----e-eeeeee---e-ee 471 | >Q7YX32 472 | ee-eee------ee-ee-eeeee--eeeeeeee-e---e-eee-e---e-eee-e---e--eeeeeee 473 | >Q21156 474 | eee-eeeeee-ee---e--e-eee-ee---e--eeeeeeeee-eeeee-eeeeeeeee---e--eeee-ee-ee--e--ee-eeeeee 475 | >Q8ML70 476 | 
eeeee-eeee------eeee------eee-e-e-ee-e-e-ee-eee-e---e-ee------eeeeeeeee----------e-eeee--ee-e--ee-eeeee---e-eee-ee-ee-e-----eeeeee-e-e-ee---e--eee-----eeeeee-e-e-ee-----eeee------eeeeee-e-eeee--ee-ee----eee----e-eeeee---eee-e-----eee--ee--e-ee--ee-e 477 | >E6ZBE2 478 | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeee-ee-----------eee 479 | >R4JQZ0 480 | ee----e---e--eeee-ee-ee-ee--eee-eee--e--ee-ee 481 | >P0DMF9 482 | ee-e---ee-eee-eeeeeeee--e--eee--ee-ee--eee 483 | >Q9BP85 484 | eeeeeeeee-e--e-ee--ee-eeeeee-eeeeee-eeee--ee-----eeee---eee 485 | >Q9U660 486 | eeeeee-ee--eeeeee-eeee-ee-eee--eeeee-e---ee--eee------e-ee 487 | >Q9NL82 488 | ee-e--eee-eee-ee-e---ee-e-ee-----------e---e--eeee-e-ee---ee-e-ee-e-ee---ee-e-e----ee---ee-eeee-e-ee---ee-e-ee---ee---ee-e-ee---ee---ee-e-ee---ee---ee-e-eee--ee---ee-eeee-e-eeee--eeeeee-ee---ee-eeee---ee---ee-eeee-e-ee-e-ee-eee---ee-eee----eeee-e 489 | >G3F828 490 | eeeeeeeeeeee--ee--eee--ee-ee--ee---e--ee-e-e-eeee 491 | >Q91826 492 | eeeeeeeeeeeeeee--e---ee--e--ee--e---ee-eeeee 493 | >P01170 494 | eeeeeeeeeee-e-e-e-ee--ee-ee-ee--ee--ee--ee--ee-e-ee-e-eee-ee--e-eeee-e-eee-eeeee-eeeeeeee-ee---ee-eee 495 | >P87385 496 | eeeeee-eeeeeee-ee--ee---e--ee-eeeeeee-ee-eee-eeeeeee-eee-e-ee-e-eeeeee-ee---ee-eee 497 | >P0DJK0 498 | eeeeee--ee-eeeeeeeeeeeee--e--e-eeeeee---e-eeeee--eeeeeeeeeeee-e-ee-eeee--e---ee---e-eeeeeeeeeee 499 | >A8YPR9 500 | eeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeee-eeeeeeeeee-eeeeeeee-e--eeeeeeeee-eeeeeeee-e-eeeeeeeeee-eeeeeeeeee-eeeeeeeeee--eeeeeee-e-eeeeeeeeee-eeeeeeeeee-eeeeeeeeee--eeeeeeeee-eeeeeeeeee-eeeeeeeeee-eeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeee-ee-eeeee 501 | >E7EKD0 502 | eeeeeeeeeeeeeeee-eee-eee---e---e--ee-eee 503 | >B3VZU3 504 | eeeeeeeeeeeeeeeeeeee-eee--e---e--ee-eee 505 | >S0F1M9 506 | ee-e-eee-----eeeee--eeee-eeeee--ee-eeeeeeeeeee-eeeee 507 | >Q6ECK6 508 | eeeeee---eee-e-------ee-e-ee-e-e-ee-eeee-ee------ee-ee-ee--ee--eee---ee-eeeeeeeee 509 | >P83580 510 | 
eee-ee-ee---ee-eeeeeeee-eeeeee-eeeee-eeeee-eeee-eeeeeeeeeeeee-e 511 | >Q7ZZY8 512 | eeeeee-e-e-ee-e--eeee--e-eeee-eee-----e-----------ee-eeee-e-eee--ee-eee---eeee-ee-----ee-eeeeee-e---e--e 513 | >Q09GK2 514 | eeeeeee-ee-eeee-e---ee-eee-e--ee--eeee-e-eee-ee-ee-eee-eeeeee--e-e-------e---ee-ee--ee-ee---ee-e-eeeeeee-ee----e-eeeeee-e-e-----e-ee--eee-eee 515 | >Q800I8 516 | eeeee-ee-ee--eee-eeeeeeeeeeeeeeeeeee-eeeee-eeeeee---e--ee--ee-eeeeeeeeeee-------e-ee--e-eeeee 517 | >Q805D5 518 | eeeeeeeeee--ee---ee---------eeee--ee-eeeee-eeeeeee-ee-e---eeeee--e---e--ee-ee-eeeeee---e-e-----e-ee----eeeee 519 | >Q76KW6 520 | eee-eeeeeeee--e---e---e--ee-eeeeeeeeeeee-eee-eee-ee--eeeee---eee-eeeeeeeeeeeee--ee--eeee--eee-ee-eeee-----e-ee--e-eeeee 521 | >P86093 522 | eeeeeeeeeeeeeeee-ee-ee-ee-eee-ee-eee 523 | >P20481 524 | eeeeeeeeee-e-e--ee--eeeeeee-e-eee-eeeee--e-e--ee-eeeee-e-e-eee-eeeee-e-e-eee-eeeee-e-ee-e--eeee-e-e-eee-eeeee-e-e-eee-eeeee-e-e-eee-eeeee-e-eeeee-eeeee-eeeeeee-eeeeeee-e--ee-eeeeeee-e-eee-eeeee-e-e-eee-eeeeee--e--e--eeeeee-e--eeeeeee-eee-eee-eeeee-e-e-ee--ee--e--e-eee-eeee-e-e-eeeeeeeeeee-e-eeeeeeeee-e-e-eee-ee-eeee-e-ee-e-eeeeee-e-eee-eeeee-e-e-ee-eeeeee-e-e-ee-eeeeee-e-e-ee-eeeeee-eee-eee-ee-ee-e-e-ee-eeeeee-e-e-ee-eeeeee-e-e-eeeeeeeee-eee-eeeeeeeee-eee-ee-eeeeeeee-eeeeeeee 525 | >O93464 526 | ee-eee-eeeee-e-e--eeeeeeeeee-eee--ee-ee-eeeee-eee--e---e--eeeeeeee-ee-e-ee-eee-e-ee-e--------ee--ee-e-ee 527 | >C7DQX7 528 | eeee------e-e----e--------e-eeeeeeee--ee-eee 529 | >D2DGD7 530 | eeeeeeeeeee-ee-eee--eee-ee-eeeeee-eee-e----------e--e------eee 531 | >C7DQC1 532 | eeee-ee--------------ee-eee-ee--ee-eeeee 533 | >D6C4J3 534 | ee-eeee-eee-ee--eee-eee-eee-eeeee-eee-ee 535 | >Q9BPH6 536 | eee-ee-eeeeee-eeeeeee---e--eeee--eee--e----ee-e 537 | >P01523 538 | eeeeeeeeeeee-ee-eee-eeee-e--eeeee--eeeee-eeee-eeee--eee 539 | >P0C1N6 540 | eee-ee-eeeeee-eeeee-e---e--ee---eee--e-ee-e--ee 541 | >P56529 542 | eeeeeeeeeeeeeee-e-e-eee-ee--eeeeee-----e-ee-ee-eeee--eee 543 | >P69769 544 | 
eeeeeeee-ee--ee-eee-eeeeee--eee-ee--e-e-ee-eeee-eeee--ee 545 | >D6C4L5 546 | ee--ee-eeeeee-eeee-eeeee--e-ee----eee---e-e 547 | >Q2I2P8 548 | eeeeeeeeeee-eeeeee--e--e--ee--eeeeeeee--ee--ee---e---eee-eeeeee 549 | >Q3YEG7 550 | eeeeeeeeeeee-eeeeee-e--ee-eeee-e-eee-e--ee--eeeee---eeeeee 551 | >Q86RA3 552 | eeeeeeeeeeee-eeeeee-e--ee-eeeeee-eeeee--eee-eee--e-e 553 | >Q9UA91 554 | eeeeeeeeee-e-eeeeeeee-eee-eeee-----------ee-eeeeeeee 555 | >Q3YEE1 556 | eeeeeeeeeee-eeeee-eeeee-e--ee--eee--------e---------e-ee----e-e 557 | >Q5K0C4 558 | eeeeeeee-e--e-eee-eee--ee--ee-eeeeee-eeeee--eee-eee---e--e 559 | >G1AS74 560 | eeeeeeeeee-e--eeee-e-ee--eeee-e-e-e-e-e-e--eee-ee---eee 561 | >Q3YEF8 562 | eeeeeeeee--ee-eee-eee-eeeeee-e-eee--e-e-eee--eeee 563 | >P0C1M8 564 | eeeee--eeee------e-eeeee--eee-eee--eee 565 | >P0CY72 566 | eeeeeeeee-ee-eeeeeeeeeeee-ee--ee-eeeee--eeeee--eee-eeeeeee-------ee--eee 567 | >D2Y495 568 | eeeeeeeeeee-eeeeeeeeeeeeee------ee-ee-eee-eeeeee--eee-e-e-eeeee--eee----e-e 569 | >Q9BH21 570 | eeeeeee-e--e-eee-ee--e--eeee-----eee-ee 571 | >Q3YEH0 572 | eeeeeee-e-ee-eee-ee--e--eeee--e-eeee--eee 573 | >Q9U6Z8 574 | eeeeeee-e-ee-eee-ee--ee-eee---eeeee--eee 575 | >Q3YEH3 576 | eeeeeee-e-ee-eee-ee--e---eeee--eee----ee 577 | >Q1A3Q7 578 | ee-eee-eee-eeeeee-eeeee-ee-ee-------eee---eeeeee 579 | >Q3YEH5 580 | eeeeeee-e-ee-eee-ee--ee--ee----eee-e--e 581 | >Q6PTD0 582 | eeeee-eeee-e--e--ee-eee-e-ee-eee--eee-e--ee 583 | >Q17313 584 | eeeeeee---eee-e---ee-ee--e---eeeee--e---eeee 585 | >P0C1D0 586 | eeeeeeee--e---ee--e---eeee-e-e-e 587 | >P0DJC6 588 | eeeeee-eeee---e-----e--e--eee-eeee-ee-eeeee-eeeee----e-e-e----e-------ee 589 | >P0DJC7 590 | eeeeee-eeeeeeeeeeee-eeeee-eeee-----eeeeee--e-----e-e 591 | >P0C8A7 592 | eeee-e--eeeeee-e-e-e-eeee-ee-e--------eeeeee 593 | >O93225 594 | eeeeeeeeeeeeeeeeeeeeeeee---e--e---e--ee--ee--eeeee 595 | >O93455 596 | eeeeeeeeeeeeeeeeeeee--ee--ee-ee--ee-eee--eee 597 | >C6EVG2 598 | ee-eee-eeeeeeeeee--e-eee-ee--eee--e--ee---ee--ee--eeeeeeeeeee 599 | 
>P42565 600 | ee-e-eeeee-----ee-e-----ee-e------e--------e--------e-------ee-------ee--------e--------e--------e--------e--------e--------e--------e--------e--------e--------e--------e-------------------ee--------e-----------------------e----------------------------------e------------------ee--------eeeeeeee-------e-e--------e-eee---eeee 601 | >Q18502 602 | eeeee-ee--e--eeeeeee--ee--e--eee-eeeeeee-e-eeeeee--e--eeee-----eeeeee-eeeeee--e-eee 603 | >Q1MX22 604 | eeeeeeee-eeeeeee-ee---e--e-e-eeeeeeeee--ee-eeeee-e---ee--e--ee-eee-eee-eee--e--e--ee-e--e--e--eeeee--eee-ee-eeeeee-ee---e--eeeee-eeeeeeeee-eeeeeeeeeeee-ee-e 605 | >A6P3B2 606 | eee-eee-eeee-ee--ee-e-eeeeeeee-ee-eee-ee-eeeee-eeeeee-e-eeeeee-e-e---ee-eeeeeeee------eeeeeeeeeeeee--e--ee-eee--e--eeeeeeee-ee--eeeeee-ee-ee-e-ee--eeeeeee-----eeeeeeeeeeee--e--eeeeeee-----eeeeee--e-eeeeeee--e-eeeeeee----eeeeeeee--e-eeeeeeee--e-eeeeeeee--e--eeeeeee--e-eeeeeeee--e--eeeeeee-----eeeeeee--e--eeeeeee----eeeee--e--eeeeeeee--e--eeeeeeee--e--ee-eeeeeeeee-ee 607 | >Q8AUU1 608 | eee--eeeeeeeeeeeee--eee-ee-eee-eeeeee---e---e-----eeee-ee--ee--e--ee--eeeeeee 609 | >Q8JIM3 610 | e-----eeee-ee-eeeee-ee--eeeee-eeeee-e-ee-ee-eeee--e--e-ee-e-e-eee-eeee--e---e---------eee-eee-eeeee-eee-ee--e------e-e-e-ee-ee--ee-ee-ee-e---e--ee--ee-eeeee-ee--ee-e---ee-eeeeeeee-eee-e---eeeeeeee 611 | >Q76IQ4 612 | eee--eeeeeeeeeeeeeeeee---e--ee--eeee-e-eeeeee---e---e-----eeee-ee--e---e--ee--eeeeeee 613 | >Q9PUR1 614 | eeeeeeeeeee-ee-eeeee--ee-------e--eee--ee--e---eeeeeeeeeeee--e--------e--eee--ee--e--eeeeeeeee-eee--eee-eee--ee--e--ee---ee--e---eee-eeeee 615 | >O12956 616 | eeeee-eeeeee-e-eeeeeeeeeee-eeeee--ee---e---e--eee--ee--e---eeeeeeeee-eeeeeee-eeeeeeee--eeeee-e-eeeeee-eeeeeeeeeeeeeeeeee-eeeeeeee-eee-ee-ee-ee---e-e--ee--e--ee---ee--e--eeee-eeee-eee-e 617 | >Q9GQV7 618 | eeeee-eee-eeeeeeeeeee---ee--eeeeeeeeee-eee-eeeee-e--eeeeeeeeee-eee-eee-eeeeeeee-eeeeeeeee--ee-eeeeeeeeeeee 619 | >G0LWV9 620 | eeeeeeeeeeeeeeeeeeeeeeeeee--ee--ee-ee--ee--e-eee 621 | 
>Q16992 622 | eeeeeeeeeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-eeeeeee--e-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-eee-ee-ee-ee-ee-ee-ee--eeeeeee-eeeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee--e-ee-ee-ee-ee-ee-ee-ee-eeeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee--e-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee--e-ee-ee-ee-ee-ee-ee-e-eeeeeeee-ee-ee-ee-ee-ee-eeee 623 | >H2CYR5 624 | eee---e---e--eee--ee-ee-ee--eee-eee--e--ee-ee 625 | >E4VP50 626 | eeee--e--eee-ee--eeeeeee--eee-ee--ee-eee-e-ee--ee-eee 627 | >Q09982 628 | eeee-e-e--e-e-----eee-eeee-ee-ee---------------------------------------------ee-eee 629 | >P86442 630 | eeeeeee-ee--e--e--ee-ee---e--eee-eeeee-eeeeeee--eeeee-ee--ee--eee 631 | >Q9BH84 632 | eeeeee-ee---e--ee-eee-ee-eee---ee-e----ee-e-e--e-----e-e 633 | >Q9BPA4 634 | eeeee--ee--ee--ee-eee-ee-eeeee-eee-e-----e-eee--------e-e 635 | >P21259 636 | eeee-ee--eee-eeeeeeeeeeeeeeeeeee-eeee--eeeeee-eee-eeeeeee--eee--eeeeeee-eee--eeeeee--eee--eeeeeeeeeee--eeeeeee-eee--eeeeee--eee--eeeeeeeeeee--eeeeeee-eee--eeeeee-eeee--eeeeee-eeee--eeeeeeeeeee--eeeeee-eeee--eeeeeee-eee-eeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeee-eeee 637 | >P82003 638 | eeeeeeeeeeeeeeeeee-eeeeeeeeeee-ee-eeeeee-e-ee-eeeeee-e-eeeeeeeeeee--e-eeeeeeee-ee-eeeeee-e-ee-eeeeeeeeeeeeeeeee-ee-eeeeee-e-ee-eeeeee-e-eeeeeeeeeee--e-eeeeeeee--e-eeeeee-e-ee-ee-eee-ee-e-ee-eeeeee-e-eeeeeeeee-eeeeeeeeeeeeeeeeeeeeeee-ee-eeeeeeeeeeeeeeeeeee 639 | >C5J8E3 640 | eeeeeeeeeeeeeeeee-ee-eee-ee----eeeeeee 641 | >Q2V2G5 642 | eeeee-eeee-e-eee--ee-eee-----eeeeeeeeeee-e----e--e---------e--ee-ee--eeeeeee-------eee 643 | >A0SIF1 644 | eeeeeeee--ee--ee--eee-e-e-e-ee-e-eee-eeeee-e-e--eeeeee-e-e-eeee--eeeeee-ee---e--ee-ee-ee-eee---eee-eee-ee--ee--e-e-e-e-eee-eeee---ee--ee-e-e-e--eee-e-e-eee--e-e-eeeeeeee-eeeeeeeee-e--eeeee-eeeeeeee--eeeeeeeeeeeee 645 | >A0SIX6 646 | 
eeeeeeee--ee--e----ee---e-e-ee-e-eee-eeeee-e-e--eeeeee-e-eee-eee-eeeeee--e-------e-ee-ee-eee--eeeee-eeee-eeeeee-e---eeee--ee--ee-e---e--eee-e-ee--eee-eee---e--eeeeeeeeeeee--ee-eee-e-eeeeeeeeeee 647 | >A8YPR6 648 | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eee-eeeeeeeeeee-eeeeeee-eeeeeeeeeee-eeeeeeeeeee-eeeeeeeeeeeeeee-eeeeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeee-eeeeeeee-ee--eeeeee-ee-eee--eeee-ee-eee----e 649 | >E7EKD4 650 | eeeeeeeeeeeee-e-e-eeee--e---e--ee-eee 651 | >Q86UU9 652 | eeeee-e--ee-e-------ee-e-ee-e-e-ee-eeee-ee------ee-eee-e--eeeee--e-ee---e---ee---eeeeeeeeeeee 653 | >P04560 654 | eeeeee-e-e-ee-e--eeee----eeee-eee-e---e----e------eee-e-eee--ee--ee---e-eee-eee-e-----ee-eeeeee-e---e--e 655 | -------------------------------------------------------------------------------- /py/asap/__init__.py: -------------------------------------------------------------------------------- 1 | from .features import FEATURE_KEY_OPTIONS, DEFAULT_FEATURE_KEYS 2 | from .parse import convert_lf_to_fasta 3 | from .window_extraction import META_WINDOW_HEADERS, WindowExtractionParams, extract_windows_from_file, extract_windows_from_seq 4 | from .classification import WindowClassifier, PeptidePredictor, train_window_classifier, get_top_features, get_windows_data 5 | from .sklearn_extensions import FeatureSelectionPipeline -------------------------------------------------------------------------------- /py/asap/classification.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from sklearn.utils import shuffle 8 | from sklearn.pipeline import Pipeline 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.feature_selection import VarianceThreshold, SelectFdr, RFECV 11 | from sklearn.cross_validation import StratifiedKFold 12 | from sklearn.metrics 
class WindowClassifier(object):

    '''
    Classifies windows extracted with their features.

    Wraps a trained sklearn classifier together with the list of features it was
    trained on (post feature-selection) and the preprocessing transformer, so that
    new windows can be classified from a raw windows data frame.
    '''

    def __init__(self, raw_classifier, used_features, transformer = DEFAULT_TRANSFORMER):
        '''
        @param raw_classifier (sklearn classifier): A raw classifier trained against a dataset of windows.
        @param used_features (list of strings): The names of the features used while training the raw classifier (after feature selection
        has been applied).
        @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler): The exact same transformer used
        when training the raw classifier (providing any other transformer is expected to give very poor results).
        '''
        self.raw_classifier = raw_classifier
        self.used_features = used_features
        self.transformer = transformer

    def classify_windows(self, windows_data_frame, proba = False):

        '''
        Classifies windows extracted with their features, given in a CSV format. Obviously, these windows don't need to have annotations/labels.
        Even if labels are given, they will be ignored.
        @param windows_data_frame (pandas.DataFrame):
            A data frame of the windows' CSV.
        @param proba (default False):
            Whether to return predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s).
        @return:
            A numpy array of the predicted labels for the given windows. The length of the returned array will correspond to the number of
            windows in the given data frame.
        '''

        if len(windows_data_frame) == 0:
            # No windows at all --> return an empty prediction array.
            return np.empty(shape = 0)
        else:

            # Keep only the features the classifier was trained on, in the same order.
            X = windows_data_frame[self.used_features].values
            X = self._transform(X)

            if proba:
                # Column 1 of predict_proba is the probability of the positive class.
                return self.raw_classifier.predict_proba(X)[:,1]
            else:
                return self.raw_classifier.predict(X)

    def test_performance(self, windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, scoring_method = f1_score):
        '''
        Tests the performance of this trained classifier, that was originally trained on a certain dataset, on a new dataset. The given dataset
        should be windows extracted with their features and annotations, given in a CSV format.
        The documentation of this method is partial and lacks some important details, as it's very similar to train_window_classifier, which
        already has a detailed documentation. Therefore, make sure to read the documentation of the other method in order to understand the full
        meaning of all the parameters.
        @param windows_data_frame (pandas.DataFrame):
            A data frame of the windows' CSV.
        @param drop_only_almost_positives (boolean, default False):
            Whether to drop only almost positive windows in the dataset before evaluating the performance of this classifier against it.
        @param drop_duplicates (boolean, default True):
            Whether to drop duplicating windows in the dataset, based on their neighbourhood property, before evaluating the performance
            of this classifier against it.
        @param scoring_method (function, default sklearn.metrics.f1_score):
            A scoring method to evaluate the classifiers by, just like in train_window_classifier.
        @return:
            A tuple of scores measuring the performance of this classifier against the given dataset in the format (score, roc, sensitivity,
            precision, specificity, cm), just like in train_window_classifier.
        '''
        LOGGER.info('Testing ' + str(type(self.raw_classifier)))
        # get_windows_data applies self.transformer (fit_transform) to the test set.
        features, X, y = get_windows_data(windows_data_frame, drop_only_almost_positives, drop_duplicates, self.transformer, \
                features = self.used_features)
        LOGGER.info('Predicting %d records...' % len(X))
        y_pred = self.raw_classifier.predict(X)
        return _get_prediction_scores(y, y_pred, scoring_method)

    def _transform(self, X):
        # NOTE(review): this calls fit_transform on the windows being classified, i.e.
        # the transformer is re-fitted on prediction-time data instead of reusing the
        # statistics learned during training (for a StandardScaler and a very small
        # input, e.g. a single window, this zeroes everything out). The same re-fitting
        # pattern is used by test_performance/get_windows_data, so it is preserved here
        # as-is -- confirm whether transform() was intended instead.
        if self.transformer is None:
            return X
        else:
            return self.transformer.fit_transform(X)
130 | @param window_extraction_params (WindowExtractionParams, default params by default): 131 | The exact same parameters that have been used to extract the windows on which the window classifier has been trained (providing 132 | any other set of parameters is expected to result very unpleasant errors). 133 | ''' 134 | self.window_classifier = window_classifier 135 | self.window_extraction_params = window_extraction_params 136 | 137 | def predict_annotations(self, seq, extra_tracks_data = {}, proba = False): 138 | 139 | ''' 140 | Predicts the annotations of a peptide. 141 | @param seq (string): 142 | The amino-acid sequence of the peptide to predict the annotations for, given in a 20 amino-acid alphabet. 143 | @param extra_tracks_data (dict, empty by default): 144 | A dictionary for providing extra tracks of the given peptide. Must receive the data for all the tracks that have been used to 145 | extract the windows for training this classifier. Specifically, if this predictor relies on a feature that relies on a certain 146 | track, then this track must be provided here. The given dictionary should map from track names to their sequence. 147 | @param proba (default False): 148 | Whether to return predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s). 149 | @return: 150 | If proba = False, will return a binary string (of 0's and 1's) representing the predicted annotations for the given peptide. If 151 | proba = True, will return a list of floats (between 0 to 1), representing the predicted probabilities. Either way, the length of 152 | the returned string/list will correspond to the length of the provided peptide sequence. 
def train_window_classifier(windows_data_frame, classifiers = DEFAULT_CLASSIFIERS, drop_only_almost_positives = False, \
        drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, feature_selector = DEFAULT_FEATURE_SELECTOR, n_folds = 5, \
        scoring_method = f1_score, select_best = True):

    '''
    Trains a window classifier using a CSV of windows with extracted features and annotations/labels (obtained by either
    window_extraction.extract_windows_from_file with extract_annotations = True or window_extraction.extract_windows_from_seq with a given
    annotation_mask). The evaluation of the classifiers will be based on the kfold procedure, during which various metrics will be calculated.
    The final training of the classifier will be based on the entire data set.
    @param windows_data_frame (pandas.DataFrame):
        A data frame of the windows' CSV.
    @param classifiers (list of sklearn classifiers, default Gaussian-kernel SVM and random forest):
        A list of classifiers to try training independently, from which the best classifier can be chosen.
    @param drop_only_almost_positives (boolean, default False):
        Whether to drop "only almost positive" windows in the dataset. An only almost positive window is a window with a false label in its
        hot index, but with a true label in either of the flanking indices. In some learning scenarios, the labeling of the residues (i.e.
        annotations) isn't so important in a strict manner, and it only matters whether larger regions contain a positive label. It's especially
        important in cases that the actual used dataset is only accurate up to +/-1 shifts of the labels. In such scenarios, using this parameter
        might enhance performance.
    @param drop_duplicates (boolean, default True):
        Whether to drop duplicating windows in the dataset, based on their neighbourhood property.
    @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler):
        A preprocessing transformer to use for the data before starting the kfold evaluation and final training of the classifiers. If None, will
        not perform any preprocessing transformation.
    @param feature_selector (sklearn feature selector, optional, default a pipeline of VarianceThreshold and SelectFdr):
        A feature selection procedure to apply during both the kfold evaluation and final training of each classifier. If None, will not perform
        feature selection (i.e. will use all features). Note that the given feature selector must implement the get_support method (hence sklearn's
        builtin Pipeline object cannot be used; if you want to pipeline then use FeatureSelectionPipeline of this project).
    @param n_folds (int, default 5):
        The number of folds to use during the kfold evaluation procedure.
    @param scoring_method (function, default sklearn.metrics.f1_score):
        A scoring method to evaluate the classifiers by. Expecting a method that receives two parameters (y_true and y_pred) and returns a float
        score. This score will be calculated for all classifiers, in addition to other metrics. Also, if select_best is set to True, this score
        will be used in order to choose the best classifier.
    @param select_best (boolean, default True):
        Whether to return only the best evaluated classifier or all of them.
    @return:
        For each classifier, will return a tuple of the trained WindowClassifier object and its metrics, as evaluated during the kfold procedure.
        The metrics are also a tuple of floats in the format (score, roc, sensitivity, precision, specificity, cm), where: score is the score
        calculated by scoring_method; roc is Area Under the Curve (AUC); cm stands for the 2X2 confusion matrix of the results. If select_best is
        set to True, will return only the tuple of the best classifier (based on the score). Otherwise, will return a list of tuples for all the
        classifiers, sorted by their score in a descending order.
    '''

    features, X, y = get_windows_data(windows_data_frame, drop_only_almost_positives, drop_duplicates, transformer)
    window_classifiers_and_results = []

    for classifier in classifiers:
        kfold_results = _get_classifier_kfold_results(classifier, X, y, n_folds, feature_selector, scoring_method)
        window_classifier = _get_trained_window_classifier(classifier, features, X, y, feature_selector, transformer)
        window_classifiers_and_results += [(window_classifier, kfold_results)]

    # Sort by the kfold score only (results[0]), as documented. Sorting by the entire
    # results tuple would, on an exact score tie, fall through to comparing the other
    # metrics and ultimately the numpy confusion matrices, which raises an "ambiguous
    # truth value" error.
    window_classifiers_and_results.sort(key = lambda window_classifier_and_results: window_classifier_and_results[1][0], reverse = True)

    if select_best:
        best_classifier, best_results = window_classifiers_and_results[0]
        LOGGER.info('The best classifier is %s with score %f.' % (str(type(best_classifier.raw_classifier)), best_results[0]))
        return best_classifier, best_results
    else:
        return window_classifiers_and_results
% (str(type(best_classifier.raw_classifier)), best_results[0])) 230 | return best_classifier, best_results 231 | else: 232 | return window_classifiers_and_results 233 | 234 | def get_top_features(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, \ 235 | classifier = RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER, n_folds = 3, step = 0.05, scoring = 'f1'): 236 | 237 | ''' 238 | Using sklearn.feature_selection.RFECV model in order to find the top features of given windows with features, given in a CSV format. 239 | @param windows_data_frame (pandas.DataFrame): 240 | A data frame of the windows' CSV. 241 | @param drop_only_almost_positives (boolean, default False): 242 | Same as in train_window_classifier. 243 | @param drop_duplicates (boolean, default True): 244 | Whether to drop duplicating windows in the dataset, based on their neighbourhood property, prior to RFECV. 245 | @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler): 246 | A preprocessing transformer to use for the data before applying RFECV. If None, will not perform any preprocessing transformation. 247 | @param classifier (sklearn classifier, default a special version of random forest suitable for RFECV): 248 | The classifier to use as the estimator of RFECV. 249 | @param n_folds (int, default 2): 250 | The n_folds to use in the kfold cross-validation as part of the RFECV process. 251 | @param step (default 0.05): 252 | See sklearn.feature_selection.RFECV 253 | @param scoring (default 'f1'): 254 | See sklearn.feature_selection.RFECV 255 | @return: 256 | A list of the top features, each represented as a string. 
def get_windows_data(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, \
        features = None):

    '''
    Extracts numeric vectorial data in numpy format, suitable for applying standard sklearn models on, from a CSV of windows with features.
    @param windows_data_frame (pandas.DataFrame):
        A data frame of the windows' CSV.
    @param drop_only_almost_positives (boolean, default False):
        Same as in train_window_classifier.
    @param drop_duplicates (boolean, default True):
        Whether to drop duplicating windows in the dataset, based on their neighbourhood property.
    @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler):
        A transformer to apply on the data (X). If None, will not perform any preprocessing transformation.
    @param features (list of strings, optional):
        The names of the features to extract from each window. If None, will extract all the features that appear in the given CSV.
    @return:
        A tuple comprised of:
        1. features - A list of strings corresponding to the names of the features extracted from the data.
        2. X - A numpy matrix of the extracted data points. Each row in the matrix represents a window, and each column a feature.
        3. y - A numpy array of binary integer values (0s and 1s), corresponding to the label of the extracted data points (windows). The
        length of y is equal to the number of rows in X.
    '''

    LOGGER.info('Given a data frame of %d records X %d columns.' % windows_data_frame.shape)

    if drop_only_almost_positives:
        windows_data_frame = windows_data_frame[windows_data_frame['window_only_almost_positive'] == 0]
        LOGGER.info('Dropped only almost positives. %d records remained.' % len(windows_data_frame))

    if drop_duplicates:
        # When we remove duplicates, we want to give priority to positives, so sort the
        # positives first before dropping. Rebinding to non-inplace results (instead of
        # inplace = True) avoids mutating the caller's data frame and the chained-
        # assignment warnings that inplace operations on a sliced frame produce.
        # NOTE(review): DataFrame.sort(columns = ...) is the legacy pandas API used
        # throughout this project; on modern pandas this is sort_values(by = ...).
        windows_data_frame = windows_data_frame.sort(columns = 'window_label', ascending = False)
        windows_data_frame = windows_data_frame.drop_duplicates(subset = 'window_neighbourhood')
        LOGGER.info('Dropped duplicates. %d records remained.' % len(windows_data_frame))

    if features is None:
        # Use every non-meta column as a feature.
        features = [header for header in windows_data_frame.columns if header not in window_extraction.META_WINDOW_HEADERS]

    LOGGER.info('%d features to process.' % len(features))
    X = windows_data_frame[features].values
    y = windows_data_frame['window_label'].values
    X, y = shuffle(X, y, random_state = SEED)

    if transformer is not None:
        X = transformer.fit_transform(X)
        LOGGER.info('Transformed the data.')

    LOGGER.info('Final data: samples = %d, features = %d' % X.shape)
    return features, X, y
Took %d seconds' % int(time_diff.total_seconds())) 325 | 326 | LOGGER.info('score = %f, roc = %f, sensitivity = %f, precision = %f, specificity = %f' % (score, roc, sensitivity, precision, specificity)) 327 | LOGGER.info('Confusion matrix:' + '\n' + str(cm)) 328 | return score, roc, sensitivity, precision, specificity, cm 329 | 330 | def _get_trained_window_classifier(classifier, features, X, y, feature_selector, transformer): 331 | LOGGER.info('Training ' + str(type(classifier))) 332 | X_reduced = feature_selector.fit_transform(X, y) 333 | used_features = util.apply_mask(features, feature_selector.get_support()) 334 | classifier.fit(X_reduced, y) 335 | return WindowClassifier(classifier, used_features, transformer) 336 | 337 | def _predict_using_kfold(X, y, classifier, n_folds, feature_selector): 338 | 339 | kfold = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED) 340 | y_pred = np.zeros(len(y)) 341 | 342 | for i, fold in enumerate(kfold): 343 | 344 | LOGGER.info('Running fold %d/%d...' % (i + 1, n_folds)) 345 | 346 | train_indices, test_indices = fold 347 | X_train = X[train_indices] 348 | X_test = X[test_indices] 349 | y_train = y[train_indices] 350 | 351 | if feature_selector is not None: 352 | feature_selector.fit(X_train, y_train) 353 | X_train = feature_selector.transform(X_train) 354 | X_test = feature_selector.transform(X_test) 355 | LOGGER.info('Selected features. Remained with %d features.' 
% X_train.shape[1]) 356 | 357 | classifier.fit(X_train, y_train) 358 | y_pred[test_indices] = classifier.predict(X_test) 359 | 360 | return y_pred 361 | 362 | def _get_prediction_scores(y_true, y_pred, scoring_method): 363 | 364 | cm = confusion_matrix(y_true, y_pred, labels = [0, 1]) 365 | roc = roc_auc_score(y_true, y_pred) 366 | score = scoring_method(y_true, y_pred) 367 | 368 | tn = float(cm[0][0]) 369 | tp = float(cm[1][1]) 370 | fp = float(cm[0][1]) 371 | fn = float(cm[1][0]) 372 | n = tn + fp 373 | p = tp + fn 374 | 375 | sensitivity = tp / p 376 | specificity = tn / n 377 | precision = tp / (tp + fp) 378 | 379 | return score, roc, sensitivity, precision, specificity, cm 380 | -------------------------------------------------------------------------------- /py/asap/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | AMINO_ACIDS = '_ACDEFGHIKLMNPQRSTVWY' 4 | REDUCED_AMINO_ACIDS = 'ACDGFIHKMNQPSRW' 5 | POSITIVE_AMINO_ACIDS = 'KR' 6 | NEGATIVE_AMINO_ACIDS = 'DE' 7 | 8 | SS_OPTIONS = '_HCE' # 3 state 2D structure 9 | ACC_OPTIONS = '_-e' # Binary 10 | DISORDER_OPTIONS = '.-^' 11 | PSSM_AMINO_ACIDS = 'ARNDCQEGHILKMFPSTWYV' # The order is super-important here! 12 | 13 | # Init logger 14 | LOG_FORMAT = '%(asctime)s [%(name)s:%(levelname)s] %(message)s' 15 | logging.basicConfig(format = LOG_FORMAT, level = 'INFO') 16 | -------------------------------------------------------------------------------- /py/asap/data.py: -------------------------------------------------------------------------------- 1 | from Bio.Seq import Seq 2 | from Bio.SeqRecord import SeqRecord 3 | from Bio.Alphabet import IUPAC 4 | 5 | from . import features 6 | 7 | class SequenceTrack(object): 8 | 9 | def __init__(self, name, seq, padding_value = '_'): 10 | ''' 11 | @param name: 12 | The name of the track (string). 13 | @param seq: 14 | The actual sequence (string). 
15 | ''' 16 | self.name = name 17 | self.seq = seq 18 | self.padding_value = padding_value 19 | 20 | def length(self): 21 | return len(self.seq) 22 | 23 | def get_subsequence(self, start, length): 24 | return SequenceTrack(self.name, self.seq[start:(start + length)]) 25 | 26 | def pad(self, prefix_length, suffix_length): 27 | prefix = prefix_length * self.padding_value 28 | suffix = suffix_length * self.padding_value 29 | self.seq = prefix + self.seq + suffix 30 | 31 | def __repr__(self): 32 | return '%s: %s' % (self.name, self.seq) 33 | 34 | class SequenceTracks(object): 35 | 36 | def __init__(self): 37 | self.tracks = {} 38 | 39 | def add_track(self, sequence_track): 40 | self.tracks[sequence_track.name] = sequence_track 41 | 42 | def get_track(self, name): 43 | return self.tracks[name] 44 | 45 | def get_subsequence(self, start, length): 46 | 47 | subsequence = SequenceTracks() 48 | 49 | for track in self.tracks.values(): 50 | subsequence.add_track(track.get_subsequence(start, length)) 51 | 52 | return subsequence 53 | 54 | def pad(self, prefix_length, suffix_length): 55 | for track in self.tracks.values(): 56 | track.pad(prefix_length, suffix_length) 57 | 58 | def length(self): 59 | 60 | used_track_name = None 61 | length = None 62 | 63 | for track in self.tracks.values(): 64 | if length is None: 65 | used_track_name = track.name 66 | length = track.length() 67 | elif length != track.length(): 68 | raise Exception('Track lengths don\'t match (%s: %d, %s: %d)' % (used_track_name, length, track.name, track.length())) 69 | 70 | return length 71 | 72 | class DataRecord(object): 73 | 74 | def __init__(self, sequence_tracks): 75 | ''' 76 | @param sequence_tracks (SequenceTracks): 77 | All the sequence tracks used for this data record. 
        '''
        self.sequence_tracks = sequence_tracks
        # Track how much artificial padding was added on each side, so window
        # offsets can be mapped back to original (unpadded) coordinates.
        self.padding_prefix_length = 0
        self.padding_suffix_length = 0

    def length(self):
        # Current (possibly padded) length, delegated to the underlying tracks.
        return self.sequence_tracks.length()

    def pad(self, prefix_length, suffix_length):
        # Pad all tracks and remember the accumulated padding amounts.
        self.padding_prefix_length += prefix_length
        self.padding_suffix_length += suffix_length
        self.sequence_tracks.pad(prefix_length, suffix_length)

    def get_track_seq(self, name):
        # Raw sequence of a named track (e.g. 'aa', 'annotation').
        return self.sequence_tracks.get_track(name).seq

    def get_aa_seq(self):
        return self.get_track_seq('aa')

    def get_annotation_mask(self):
        # Binary '0'/'1' string marking annotated positions.
        return self.get_track_seq('annotation')

    def get_available_tracks(self):
        return self.sequence_tracks.tracks.keys()

    def has_annotation_mask(self):
        return 'annotation' in self.get_available_tracks()

class FullDataRecord(DataRecord):

    # A DataRecord enriched with FASTA identity metadata (id/name/description).

    def __init__(self, id, name, description, sequence_tracks):
        '''
        @see DataRecord
        @param id (string):
            The record's ID from FASTA
        @param name (string):
            The record's name from FASTA
        @param description (string):
            The record's description from FASTA
        '''
        DataRecord.__init__(self, sequence_tracks)
        self.id = id
        self.name = name
        self.description = description

    def to_fasta_record(self):
        # NOTE(review): Bio.Alphabet.IUPAC was removed from Biopython >= 1.78;
        # this call assumes an older Biopython — confirm the pinned version.
        return SeqRecord(Seq(self.get_aa_seq(), IUPAC.protein),
                id = self.id,
                name = self.name,
                description = self.description)

    def get_windows(self, window_size):
        # Yield every window of the given size; original_index is the window
        # start expressed in unpadded coordinates (may be negative inside the
        # padded prefix).
        for i in range(self.length() - window_size + 1):
            yield Window(self, i, i - self.padding_prefix_length, self.sequence_tracks.get_subsequence(i, window_size))

    def __repr__(self):
        return 'Record %s' % self.id

class Window(DataRecord):

    # A fixed-size view over a FullDataRecord, starting at `offset`.

    def __init__(self, full_record, offset, original_index, sequence_tracks):
        DataRecord.__init__(self, sequence_tracks)
        self.full_record = full_record
        self.offset = offset
        self.original_index = original_index

    def get_left_context_track_seq(self, name):
        # Everything in the parent record before this window.
        return self.full_record.get_track_seq(name)[:self.offset]

    def get_right_context_track_seq(self, name):
        # Everything in the parent record after this window.
        return self.full_record.get_track_seq(name)[(self.offset + self.length()):]

    def get_neighbourhood(self, hot_index, neighbourhood_prefix, neighbourhood_suffix):
        # Slice of the window's AA sequence around the hot index (inclusive of
        # the suffix position).
        return self.get_aa_seq()[(hot_index - neighbourhood_prefix):(hot_index + neighbourhood_suffix + 1)]

    def get_label(self, hot_index):
        # True iff the hot position is annotated ('1' in the mask).
        return self.get_annotation_mask()[hot_index] == '1'

    def is_only_almost_positive(self, hot_index):
        '''
        @return:
            whether the hot index is negative, but one of the flanking indices is positive. If so, this window shouldn't
            be considered during the learning process (we treat it neither positive nor negative), assuming that default
            configuration is used.
        '''
        mask = self.get_annotation_mask()
        # NOTE(review): for hot_index == 0, mask[hot_index - 1] wraps to the
        # LAST mask position (Python negative indexing) — confirm callers never
        # pass 0, or that wraparound is acceptable here.
        return mask[hot_index] == '0' and (mask[hot_index - 1] == '1' or mask[hot_index + 1] == '1')

    def get_features(self, hot_index, feature_keys = features.DEFAULT_FEATURE_KEYS):
        return features.get_features(self, hot_index, feature_keys)

    def __repr__(self):
        return 'Window %d of %s' % (self.offset, self.full_record.id)
--------------------------------------------------------------------------------
/py/asap/features_deps/AAScales.py:
--------------------------------------------------------------------------------
'''
AA Propensity Scales.

TODO: (Add/Note "combined" metrics: Georgiev scales. Kidera factors.. )
Some from BioPython.
(BioPython stored them as dictionaries, e.g: Bio.SeqUtils.ProtParamData.kd).

May need to be SCALED to 0-1 range ?!?

Data initially acquired from BioPython:
https://github.com/biopython/biopython/blob/master/Bio/SeqUtils/ProtParamData.py
Bio.SeqUtils.ProtParamData

Some more descriptors:
https://github.com/ddofer/Protein-Descriptors/blob/master/src/csdsML/Descriptors.py
'''

import numpy as np

# Kyte & Doolittle {kd} index of hydrophobicity
# (higher = more hydrophobic)
hp = {'A': 1.8, 'R':-4.5, 'N':-3.5, 'D':-3.5, 'C': 2.5,
      'Q':-3.5, 'E':-3.5, 'G':-0.4, 'H':-3.2, 'I': 4.5,
      'L': 3.8, 'K':-3.9, 'M': 1.9, 'F': 2.8, 'P':-1.6,
      'S':-0.8, 'T':-0.7, 'W':-0.9, 'Y':-1.3, 'V': 4.2 }

# Flexibility
# Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
Flex= {'A': 0.984, 'C': 0.906, 'E': 1.094, 'D': 1.068,
       'G': 1.031, 'F': 0.915, 'I': 0.927, 'H': 0.950,
       'K': 1.102, 'M': 0.952, 'L': 0.935, 'N': 1.048,
       'Q': 1.037, 'P': 1.049, 'S': 1.046, 'R': 1.008,
       'T': 0.997, 'W': 0.904, 'V': 0.931, 'Y': 0.929}

# Hydrophilicity
# 1 Hopp & Wood
# Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981).
hw = {'A':-0.5, 'R': 3.0, 'N': 0.2, 'D': 3.0, 'C':-1.0,
      'Q': 0.2, 'E': 3.0, 'G': 0.0, 'H':-0.5, 'I':-1.8,
      'L':-1.8, 'K': 3.0, 'M':-1.3, 'F':-2.5, 'P': 0.0,
      'S': 0.3, 'T':-0.4, 'W':-3.4, 'Y':-2.3, 'V':-1.5 }

# Surface accessibility {"em"}
# 1 Emini Surface fractional probability
sa = {'A': 0.815, 'R': 1.475, 'N': 1.296, 'D': 1.283, 'C': 0.394,
      'Q': 1.348, 'E': 1.445, 'G': 0.714, 'H': 1.180, 'I': 0.603,
      'L': 0.603, 'K': 1.545, 'M': 0.714, 'F': 0.695, 'P': 1.236,
      'S': 1.115, 'T': 1.184, 'W': 0.808, 'Y': 1.089, 'V': 0.606 }

# 2 Janin Interior to surface transfer energy scale
ja = {'A': 0.28, 'R':-1.14, 'N':-0.55, 'D':-0.52, 'C': 0.97,
      'Q':-0.69, 'E':-1.01, 'G': 0.43, 'H':-0.31, 'I': 0.60,
      'L': 0.60, 'K':-1.62, 'M': 0.43, 'F': 0.46, 'P':-0.42,
      'S':-0.19, 'T':-0.32, 'W': 0.29, 'Y':-0.15, 'V': 0.60 }

# Disorder Propensity scale
#"TOP-IDP-Scale: A New Amino Acid Scale Measuring Propensity for Intrinsic Disorder"
#Campen, Uversky, Dunker et al. Protein Pept Lett. 2008

# Positive values indicate protein (or windows) are likely to be ordered, etc' /
# NOTE(review): the comment above looks inverted — in the TOP-IDP scale,
# HIGHER values mark disorder-promoting residues (P = 0.987) and negative
# values order-promoting ones (W = -0.884). Confirm against Campen et al. 2008.
TOP_IDP ={'A':0.06, 'R':0.180, 'N':0.007, 'D':0.192, 'C': 0.02,
          'Q':0.318, 'E':0.736, 'G': 0.166, 'H':0.303, 'I': -0.486,
          'L': -0.326, 'K':0.586, 'M': -0.397, 'F': -0.697, 'P':0.987,
          'S':0.341, 'T':0.059, 'W': -0.884, 'Y':-0.510, 'V': -0.121 }

# https://github.com/ddofer/Protein-Descriptors/blob/master/src/csdsML/Descriptors.py
polarizability= {'A':0.046,'R':0.291,'N':0.134,'D':0.105,'C': 0.128,'Q':0.180,
                 'E':0.151,'G':0.000,'H':0.230,'I':0.186,'L':0.186,'K':0.219,'M':0.221,
                 'F':0.290,'P':0.131,'S':0.062,'T':0.108,'W':0.409,'Y':0.298,'V':0.140}

# Accessible surface area of residue X in a G-X-G tripeptide context.
ASAInTripeptide = {'A':115,'R':225,'N':160,'D':150,'C':135,'Q':180,
                   'E':190,'G':75,'H':195,'I':175,'L':170,'K':200,'M':185,
                   'F':210,'P':145,'S':115,'T':140,'W':255,'Y':230,'V':155}
Volume = {'A':52.6,'R':109.1,'N':75.7,'D':68.4,'C':68.3,'Q':89.7,
          'E':84.7,'G':36.3,'H':91.9,'I':102.0,'L':102.0,'K':105.1,'M':97.7,
          'F':113.9,'P':73.6,'S':54.9,'T':71.2,'W':135.4,'Y':116.2,'V':85.1}

StericParam = {'A':0.52,'R':0.68,'N':0.76,'D':0.76,'C':0.62,'Q':0.68,
               'E':0.68,'G':0.00,'H':0.70,'I':1.02,'L':0.98,'K':0.68,'M':0.78,
               'F':0.70,'P':0.36,'S':0.53,'T':0.50,'W':0.70,'Y':0.70,'V':0.76}
Mutability = {'A':100,'R':65,'N':134,'D':106,'C':20,'Q':93,
              'E':102,'G':49,'H':66,'I':96,'L':40,'K':56,'M':94,
              'F':41,'P':56,'S':120,'T':97,'W':18,'Y':41,'V':74}

# Hydrophobicity_kd_TMD = Bio.SeqUtils.ProtParamData.kd
# Hydrophilicity = Bio.SeqUtils.ProtParamData.hw
# Surface_access = Bio.SeqUtils.ProtParamData.em
# Ja_transfer_energy = Bio.SeqUtils.ProtParamData.ja
# flexibility = Bio.SeqUtils.ProtParamData.Flex


'GeorgievScales:'
#Acquired from georgiev's paper of AAscales using helper script "GetTextData.py".
+ RegEx cleaning
# gg_1 .. gg_19: one dict per Georgiev factor (per the comment above);
# values transcribed from the source publication.
gg_1 = {'Q': -2.54, 'L': 2.72, 'T': -0.65, 'C': 2.66, 'I': 3.1, 'G': 0.15, 'V': 2.64, 'K': -3.89, 'M': 1.89, 'F': 3.12, 'N': -2.02, 'R': -2.8, 'H': -0.39, 'E': -3.08, 'W': 1.89, 'A': 0.57, 'D': -2.46, 'Y': 0.79, 'S': -1.1, 'P': -0.58}
gg_2 = {'Q': 1.82, 'L': 1.88, 'T': -1.6, 'C': -1.52, 'I': 0.37, 'G': -3.49, 'V': 0.03, 'K': 1.47, 'M': 3.88, 'F': 0.68, 'N': -1.92, 'R': 0.31, 'H': 1, 'E': 3.45, 'W': -0.09, 'A': 3.37, 'D': -0.66, 'Y': -2.62, 'S': -2.05, 'P': -4.33}
gg_3 = {'Q': -0.82, 'L': 1.92, 'T': -1.39, 'C': -3.29, 'I': 0.26, 'G': -2.97, 'V': -0.67, 'K': 1.95, 'M': -1.57, 'F': 2.4, 'N': 0.04, 'R': 2.84, 'H': -0.63, 'E': 0.05, 'W': 4.21, 'A': -3.66, 'D': -0.57, 'Y': 4.11, 'S': -2.19, 'P': -0.02}
gg_4 = {'Q': -1.85, 'L': 5.33, 'T': 0.63, 'C': -3.77, 'I': 1.04, 'G': 2.06, 'V': 2.34, 'K': 1.17, 'M': -3.58, 'F': -0.35, 'N': -0.65, 'R': 0.25, 'H': -3.49, 'E': 0.62, 'W': -2.77, 'A': 2.34, 'D': 0.14, 'Y': -0.63, 'S': 1.36, 'P': -0.21}
gg_5 = {'Q': 0.09, 'L': 0.08, 'T': 1.35, 'C': 2.96, 'I': -0.05, 'G': 0.7, 'V': 0.64, 'K': 0.53, 'M': -2.55, 'F': -0.88, 'N': 1.61, 'R': 0.2, 'H': 0.05, 'E': -0.49, 'W': 0.72, 'A': -1.07, 'D': 0.75, 'Y': 1.89, 'S': 1.78, 'P': -8.31}
gg_6 = {'Q': 0.6, 'L': 0.09, 'T': -2.45, 'C': -2.23, 'I': -1.18, 'G': 7.47, 'V': -2.01, 'K': 0.1, 'M': 2.07, 'F': 1.62, 'N': 2.08, 'R': -0.37, 'H': 0.41, 'E': 0, 'W': 0.86, 'A': -0.4, 'D': 0.24, 'Y': -0.53, 'S': -3.36, 'P': -1.82}
gg_7 = {'Q': 0.25, 'L': 0.27, 'T': -0.65, 'C': 0.44, 'I': -0.21, 'G': 0.41, 'V': -0.33, 'K': 4.01, 'M': 0.84, 'F': -0.15, 'N': 0.4, 'R': 3.81, 'H': 1.61, 'E': -5.66, 'W': -1.07, 'A': 1.23, 'D': -5.15, 'Y': -1.3, 'S': 1.39, 'P': -0.12}
gg_8 = {'Q': 2.11, 'L': -4.06, 'T': 3.43, 'C': -3.49, 'I': 3.45, 'G': 1.62, 'V': 3.93, 'K': -0.01, 'M': 1.85, 'F': -0.41, 'N': -2.47, 'R': 0.98, 'H': -0.6, 'E': -0.11, 'W': -1.66, 'A': -2.32, 'D': -1.17, 'Y': 1.31, 'S': -1.21, 'P': -1.18}
gg_9 = {'Q': -1.92, 'L': 0.43, 'T': 0.34, 'C': 2.22, 'I': 0.86, 'G': -0.47, 'V': -0.21, 'K': -0.26, 'M': -2.05, 'F': 4.2, 'N': -0.07, 'R': 2.43, 'H': 3.55, 'E': 1.49, 'W': -5.87, 'A': -2.01, 'D': 0.73, 'Y': -0.56, 'S': -2.83, 'P': 0}
gg_10 = {'Q': -1.67, 'L': -1.2, 'T': 0.24, 'C': -3.78, 'I': 1.98, 'G': -2.9, 'V': 1.27, 'K': -1.66, 'M': 0.78, 'F': 0.73, 'N': 7.02, 'R': -0.99, 'H': 1.52, 'E': -2.26, 'W': -0.66, 'A': 1.31, 'D': 1.5, 'Y': -0.95, 'S': 0.39, 'P': -0.66}
gg_11 = {'Q': 0.7, 'L': 0.67, 'T': -0.53, 'C': 1.98, 'I': 0.89, 'G': -0.98, 'V': 0.43, 'K': 5.86, 'M': 1.53, 'F': -0.56, 'N': 1.32, 'R': -4.9, 'H': -2.28, 'E': -1.62, 'W': -2.49, 'A': -1.14, 'D': 1.51, 'Y': 1.91, 'S': -2.92, 'P': 0.64}
gg_12 = {'Q': -0.27, 'L': -0.29, 'T': 1.91, 'C': -0.43, 'I': -1.67, 'G': -0.62, 'V': -1.71, 'K': -0.06, 'M': 2.44, 'F': 3.54, 'N': -2.44, 'R': 2.09, 'H': -3.12, 'E': -3.97, 'W': -0.3, 'A': 0.19, 'D': 5.61, 'Y': -1.26, 'S': 1.27, 'P': -0.92}
gg_13 = {'Q': -0.99, 'L': -2.47, 'T': 2.66, 'C': -1.03, 'I': -1.02, 'G': -0.11, 'V': -2.93, 'K': 1.38, 'M': -0.26, 'F': 5.25, 'N': 0.37, 'R': -3.08, 'H': -1.45, 'E': 2.3, 'W': -0.5, 'A': 1.66, 'D': -3.85, 'Y': 1.57, 'S': 2.86, 'P': -0.37}
gg_14 = {'Q': -1.56, 'L': -4.79, 'T': -3.07, 'C': 0.93, 'I': -1.21, 'G': 0.15, 'V': 4.22, 'K': 1.78, 'M': -3.09, 'F': 1.73, 'N': -0.89, 'R': 0.82, 'H': -0.77, 'E': -0.06, 'W': 1.64, 'A': 4.39, 'D': 1.28, 'Y': 0.2, 'S': -1.88, 'P': 0.17}
gg_15 = {'Q': 6.22, 'L': 0.8, 'T': 0.2, 'C': 1.43, 'I': -1.78, 'G': -0.53, 'V': 1.06, 'K': -2.71, 'M': -1.39, 'F': 2.14, 'N': 3.13, 'R': 1.32, 'H': -4.18, 'E': -0.35, 'W': -0.72, 'A': 0.18, 'D': -1.98, 'Y': -0.76, 'S': -2.42, 'P': 0.36}
gg_16 = {'Q': -0.18, 'L': -1.43, 'T': -2.2, 'C': 1.45, 'I': 5.71, 'G': 0.35, 'V': -1.31, 'K': 1.62, 'M': -1.02, 'F': 1.1, 'N': 0.79, 'R': 0.69, 'H': -2.91, 'E': 1.51, 'W': 1.75, 'A': -2.6, 'D': 0.05, 'Y': -5.19, 'S': 1.75, 'P': 0.08}
gg_17 = {'Q': 2.72, 'L': 0.63, 'T': 3.73, 'C': -1.15, 'I': 1.54, 'G': 0.3, 'V': -1.97, 'K': 0.96, 'M': -4.32, 'F': 0.68, 'N': -1.54, 'R': -2.62, 'H': 3.37, 'E': -2.29, 'W': 2.73, 'A': 1.49, 'D': 0.9, 'Y': -2.56, 'S': -2.77, 'P': 0.16}
gg_18 = {'Q': 4.35, 'L': -0.24, 'T': -5.46, 'C': -1.64, 'I': 2.11, 'G': 0.32, 'V': -1.21, 'K': -1.09, 'M': -1.34, 'F': 1.46, 'N': -1.71, 'R': -1.49, 'H': 1.87, 'E': -1.47, 'W': -2.2, 'A': 0.46, 'D': 1.38, 'Y': 2.87, 'S': 3.36, 'P': -0.34}
gg_19 = {'Q': 0.92, 'L': 1.01, 'T': -0.73, 'C': -1.05, 'I': -4.18, 'G': 0.05, 'V': 4.77, 'K': 1.36, 'M': 0.09, 'F': 2.33, 'N': -0.25, 'R': -2.57, 'H': 2.17, 'E': 0.15, 'W': 0.9, 'A': -4.22, 'D': -0.03, 'Y': -3.43, 'S': 2.67, 'P': 0.04}

# Atch_1 .. Atch_5: five-factor amino-acid scores — presumably the Atchley
# et al. (2005) factor solution; confirm source before citing.
Atch_1 = {'A': 0.591, 'C': 1.343, 'E': 1.357, 'D': 1.05, 'G': 0.384, 'F': 1.006, 'I': 1.239, 'H': 0.336, 'K': 1.831, 'M': 0.663, 'L': 1.019, 'N': 0.945, 'Q': 0.931, 'P': 0.189, 'S': 0.228, 'R': 1.538, 'T': 0.032, 'W': 0.595, 'V': 1.337, 'Y': 0.26}
Atch_2 = {'A': 1.302, 'C': 0.465, 'E': 1.453, 'D': 0.302, 'G': 1.652, 'F': 0.59, 'I': 0.547, 'H': 0.417, 'K': 0.561, 'M': 1.524, 'L': 0.987, 'N': 0.828, 'Q': 0.179, 'P': 2.081, 'S': 1.399, 'R': 0.055, 'T': 0.326, 'W': 0.009, 'V': 0.279, 'Y': 0.83}
Atch_3 = {'A': 0.733, 'C': 0.862, 'E': 1.477, 'D': 3.656, 'G': 1.33, 'F': 1.891, 'I': 2.131, 'H': 1.673, 'K': 0.533, 'M': 2.219, 'L': 1.505, 'N': 1.299, 'Q': 3.005, 'P': 1.628, 'S': 4.76, 'R': 1.502, 'T': 2.213, 'W': 0.672, 'V': 0.544, 'Y': 3.097}
Atch_4 = {'A': 1.57, 'C': 1.02, 'E': 0.113, 'D': 0.259, 'G': 1.045, 'F': 0.397, 'I': 0.393, 'H': 1.474, 'K': 0.277, 'M': 1.005, 'L': 1.266, 'N': 0.169, 'Q': 0.503, 'P': 0.421, 'S': 0.67, 'R': 0.44, 'T': 0.908, 'W': 2.128, 'V': 1.242, 'Y': 0.838}
Atch_5 = {'A': 0.146, 'C': 0.255, 'E': 0.837, 'D': 3.242, 'G': 2.064, 'F': 0.412, 'I': 0.816, 'H': 0.078, 'K': 1.648, 'M': 1.212, 'L': 0.912, 'N': 0.933, 'Q': 1.853, 'P': 1.392, 'S': 2.647, 'R': 2.897, 'T': 1.313, 'W': 0.184, 'V': 1.262, 'Y': 1.512}

# Minimal scale set — used when features are computed on short subsegments.
MinScales_Dict = {'hp':hp, 'hw':hw,
                  'sa':sa, 'TOP_IDP':TOP_IDP,
                  'Atch_1':Atch_1,'Atch_2':Atch_2,
                  'Atch_3':Atch_3,'Atch_4':Atch_4,
                  'Atch_5':Atch_5}
# Some scales removed from "full" scales dict, due to redundancy, particularly if MinScales dict is used on subsegments of sequence.
# If MinScales dict is NOT used, then it's HIGHLY recommended to re-add these features!
Scales_Dict = {'hp':hp, 'ja':ja,
               'polarizability':polarizability,'Mutability':Mutability,'Volume':Volume,
               'ASAInTripeptide':ASAInTripeptide,
               'gg_1' : gg_1,'gg_2' : gg_2,'gg_3' : gg_3,'gg_4' : gg_4,'gg_5' : gg_5,
               'gg_6' : gg_6,'gg_7' : gg_7,'gg_8' : gg_8,'gg_9' : gg_9,'gg_10' : gg_10,'gg_11' : gg_11}
#,'gg_12' : gg_12
#,'gg_13' : gg_13,'Atch_1':Atch_1,'Atch_2':Atch_2,'Atch_3':Atch_3,'Atch_4':Atch_4,'Atch_5':Atch_5,
#,'gg_14' : gg_14,'gg_15' : gg_15,
#,'gg_16' : gg_16,'gg_17' : gg_17,'gg_18' : gg_18,'gg_19' : gg_19}

# Idea: ICA or PCA of ALL the above scales. Get 7-11 scales from them..
# Scale subset tuned for PTM (e.g. phosphosite) prediction features.
PTMScales_Dict = {
    # 'hp': hp,
    'hw': hw,
    'sa': sa,
    'TOP_IDP': TOP_IDP,
    'Atch_1': Atch_1,
    'Atch_2': Atch_2,
    'Atch_3': Atch_3,
    'Atch_4': Atch_4,
    'Atch_5': Atch_5,
    'polarizability': polarizability,
    "ASAInTripeptide": ASAInTripeptide,
}

# Per-scale MEDIAN value (despite the 'Avg' name) — used as a neutral
# fill-in value for each scale.
# PTMScales_Avg = {scale: np.median(PTMScales_Dict[scale].values()) for scale in PTMScales_Dict} #ORIG - Py 2.7
PTMScales_Avg = {str(scale): np.median(list(PTMScales_Dict[scale].values())) for scale in PTMScales_Dict}


########################################################################################
'''
From PyPro,
Authors: Dongsheng Cao and Yizeng Liang.
:
'''
# def _mean(listvalue):
#     """
#     ########################################################################################
#     The mean value of the list data.
#
#     Usage:
#
#     result=_mean(listvalue)
#     ########################################################################################
#     """
#     return sum(listvalue)/len(listvalue)
# ##############################################################################################
# def _std(listvalue,ddof=1):
#     """
#     ########################################################################################
#     The standard deviation of the list data.
#
#     Usage:
#
#     result=_std(listvalue)
#     ########################################################################################
#     """
#     mean=_mean(listvalue)
#     temp=[math.pow(i-mean,2) for i in listvalue]
#     res=math.sqrt(sum(temp)/(len(listvalue)-ddof))
#     return res
# ##############################################################################################
"TODO: Fix to use proper way of normalizing, AND scaling. (Maybe sci-kit learn's preprocessor?"
def NormalizeAAP(AAP):
    """
    ########################################################################################
    Centralize and normalize amino acid indices (Scales) before calculations.

    Usage:

    result=NormalizeEachAAP(AAP)

    Input: AAP is a dict containing the properties of 20 amino acids.

    Output: result is a dict form containing the normalized properties.
201 | ######################################################################################## 202 | """ 203 | if len(AAP.values())!=20: 204 | print ('Some Amino Acids are missing') 205 | else: 206 | Result={} 207 | for i,j in AAP.items(): 208 | Result[i]=(j-_mean(AAP.values()))/_std(AAP.values(),ddof=0) 209 | 210 | return Result 211 | ######################################################################################## 212 | '''GetAAindex1 Requires the GetAAIndex.py from PyPro: ''' 213 | # def GetAAindex1(self,name,path='.'): 214 | # """ 215 | # Get the amino acid property values from aaindex1 216 | 217 | # Usage: 218 | 219 | # result=GetAAIndex1(name) 220 | 221 | # Input: name is the name of amino acid property (e.g., KRIW790103) 222 | 223 | # Output: result is a dict form containing the properties of 20 amino acids 224 | # """ 225 | 226 | # return GetAAIndex1(name,path=path) 227 | 228 | -------------------------------------------------------------------------------- /py/asap/features_deps/AAlphabets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Check to make alphabets, dicts, strings 3 | are persistant and not recalculated each time this method called!! 4 | 5 | Amino acid groupings from 6 | 'Reduced amino acid alphabets improve the sensitivity...' by 7 | Peterson, Kondev, et al. 
8 | http://www.rpgroup.caltech.edu/publications/Peterson2008.pdf 9 | 10 | Other alphabets from 11 | http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html 12 | 13 | """ 14 | 15 | 'TODO:' 16 | 'Add AA Propensities (From BioPython, articles, comp.profiler, etc) - eg, AAindex, http://bioinf.icm.uu.se/kbib/project13/convertAAstoProperties/' 17 | 18 | 19 | from collections import defaultdict 20 | 21 | # ambiguous amina acids: [ 'aspartic acid or asparagine', 'leucine or isoleucine', 22 | # 'glutamic acid[E] or glutamine[Q]'] : 23 | ambiguous_aa = 'BJZX' 24 | # special amino acids - 'selenocysteine', 'pyrralysine' 25 | aa_special_alph = 'UO' 26 | UNKNOWN_AA = "Z" #'unknown amino acid', 27 | 28 | ''' 29 | ILLEGALS = [c for c in ambiguous_aa+aa_special_alph+'Z'] 30 | ''' 31 | ILLEGALS = ['B', 'J', 'Z', 'X', 'U', 'O', 'Z'] 32 | # print(ILLEGALS) 33 | 34 | def TransDict_from_list(groups): 35 | ''' 36 | Given a list of letter groups, returns a dict mapping each group to a 37 | single letter from the group - for use in translation. 38 | >>> alex6=["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"] 39 | >>> trans_a6 = TransDict_from_list(alex6) 40 | >>> print(trans_a6) 41 | {'V': 'A', 'W': 'F', 'T': 'D', 'R': 'D', 'S': 'D', 'P': 'P', 42 | 'Q': 'D', 'Y': 'F', 'F': 'F', 43 | 'G': 'G', 'D': 'D', 'E': 'D', 'C': 'C', 'A': 'A', 44 | 'N': 'D', 'L': 'A', 'M': 'A', 'K': 'D', 'H': 'D', 'I': 'A'} 45 | ''' 46 | transDict = dict() 47 | 48 | result = {} 49 | for group in groups: 50 | g_members = sorted(group) #Alphabetically sorted list 51 | for c in g_members: 52 | # print('c' + str(c)) 53 | # print('g_members[0]' + str(g_members[0])) 54 | result[c] = str(g_members[0]) #K:V map, use group's first letter as represent. 55 | # print(result) 56 | return result 57 | 58 | def translate_sequence (seq, TranslationDict): 59 | ''' 60 | Given (seq) - a string/sequence to translate, 61 | Translates into a reduced alphabet, using a translation dict provided 62 | by the TransDict_from_list() method. 
63 | Returns the string/sequence in the new, reduced alphabet. 64 | Remember - in Python string are immutable.. 65 | 66 | ''' 67 | from_list = [] 68 | to_list = [] 69 | for k,v in TranslationDict.items(): 70 | from_list.append(k) 71 | to_list.append(v) 72 | # TRANS_seq = seq.translate(str.maketrans(zip(from_list,to_list))) 73 | TRANS_seq = seq.translate(str.maketrans(TranslationDict)) 74 | return TRANS_seq 75 | 76 | def Get_Letters (TranslationDict): 77 | ''' 78 | Given a TranslationDict, 79 | return, as string, the letters retained after translation 80 | by that dict. 81 | ''' 82 | e = set(TranslationDict.values()) 83 | res = sorted (e) 84 | return ("".join(res)) 85 | 86 | 87 | AA20 = 'ACDEFGHIKLMNPQRSTVWY' #"Standard alphabet" 88 | 89 | 'Invented, based roughly on Ofer8, for Dibasic cleavage prediction' 90 | OferKR = TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "R","K","H", "DE", "STNQ"]) 91 | 92 | ofer14=TransDict_from_list(["A", "D", "KR","E", "N", "TS","Q", 93 | "YF", "LIVM", "C", "W", "H", "G", "P"]) 94 | ofer13=TransDict_from_list(["A", "DE", "KR", "N", "TS","Q", 95 | "YF", "LIVM", "C", "W", "H", "G", "P"]) 96 | 97 | "modifed from wang-wang, Clustering of the Protein Design Alphabets by Using Hierarchical SOM " 98 | ofer_w8 = TransDict_from_list(["FIL", "CY", "MVW", "HAT", "GP", "RK", "QSN", "DE"]) 99 | # ofer14=TransDict_from_list(['LIVM', 'D', 'G', 'A', 'C', 'N', 'H', 'KE','R', 'W', 'P', 'TSQ', 'YF']) 100 | 101 | # Ofer7=TransDict_from_list(["C", "G", "P", "FYW", "AVILM","KR", "STNQHDE"]) 102 | 103 | 'Look at: http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1594927&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D1594927' 104 | ofer_tail =TransDict_from_list(["FAILV","TS","C","G","P","KR","DE","MWY","NQH"]) 105 | 106 | ofer8=TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "RKH", "DE", "STNQ"]) 107 | 108 | ofer_gbm5 = TransDict_from_list(["ANTSQ", "YFLIVMCWH","DKER" "G", "P"]) 109 | gbm4 = 
TransDict_from_list(["ADKERNTSQ", "YFLIVMCWH", "G", "P"])
sdm12 =TransDict_from_list(
    ["A", "D", "KER", "N", "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"] )

hsdm17 =TransDict_from_list(
    ["A", "D", "KE", "R", "N", "T", "S", "Q", "Y", "F", "LIV",
    "M", "C", "W", "H", "G", "P"])

alex6=TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"])

shen7 =TransDict_from_list(["AGV","ILFP","YMTS","HNQW","RK","DE","C"])
'''
"Shen 7" From: "Predicting protein-protein interactions based only on sequences information.",
Shen J,Jiang H. et al. PNAS. 2007.
Suggested ese as trimers, and/or with RNA 4-mers for predicting Protein-interaction,
(Protein-RNA idea, from: "Predicting RNA-Protein Interactions Using Only Sequence Information",
BMC Bioinformatics. 2011; Dobbs et al)
'''


#hydrophilic vs. hydrophobic
hp2 =TransDict_from_list(["AGTSNQDEHRKP", "CMFILVWY"])
#Hydrophilic, Hydrophobic, Charged. (Custom Ofer)
hp3 = TransDict_from_list(["AGTSNQP", "CMFILVWY", "RKHED"])
#Hydrophilic, Hydrophobic, Positively Charged. (Custom Ofer)
hp3_Plus = TransDict_from_list(["AGTSNQPHED", "CMFILVWY", "RK"])

murphy10 =TransDict_from_list( ["LVIM", "C", "A", "G", "ST",
    "P", "FYW", "EDNQ", "KR", "H"])

aromatic2 =TransDict_from_list(["FHWY", "ADKERNTSQLIVMCGP"])

hp_aroma_4 =TransDict_from_list(["H", "CMILV", "FWY", "ADKERNTSQGP"])

# 'ofer13KR transdict:'
# Precomputed translation dict (ofer13 variant that keeps K and R distinct).
ofer13KR = {'V': 'I', 'E': 'D', 'G': 'G', 'D': 'D', 'N': 'N', 'L': 'I', 'F': 'F', 'S': 'S', 'P': 'P', 'R': 'R', 'K': 'K', 'Y': 'F', 'T': 'S', 'C': 'C', 'A': 'A', 'Q': 'Q', 'M': 'I', 'H': 'H', 'W': 'W', 'I': 'I'}

# https://github.com/biopython/biopython/blob/master/Bio/Alphabet/Reduced.py
murphy15 = {"L": "L", "V": "L", "I": "L",
            "M": "L", "C": "C", "A": "A",
            "G": "G", "S": "S", "T": "T",
            "P": "P", "F": "F", "Y": "F",
            "W": "W", "E": "E",
            "D": "D", "N": "N", "Q": "Q",
            "K": "K", "R": "K", "H": "H"}

murphy_8 = {"L": "L", "V": "L", "I": "L",
            "M": "L",
            "C": "L", "A": "A", "G": "A",
            "S": "S", "T": "S",
            "P": "P", "F": "F", "Y": "F",
            "W": "F", "E": "E",
            "D": "E", "N": "E", "Q": "E",
            "K": "K", "R": "K", "H": "H"}
pc5 = {"I": "A", # Aliphatic
       "V": "A", "L": "A",
       "F": "R", # Aromatic
       "Y": "R", "W": "R", "H": "R",
       "K": "C", # Charged
       "R": "C", "D": "C", "E": "C",
       "G": "T", # Tiny
       "A": "T", "C": "T", "S": "T",
       "T": "D", # Diverse
       "M": "D", "Q": "D", "N": "D",
       "P": "D"}



### ProFEAT propensity based scales: ####
# modified from ProFEAT + CTD. (Intended for letter: number use there)
Disorder_3=TransDict_from_list(['ARSQEGKP','ILNCFYVW', 'DHMT'])
Hydrophobicity_3 = TransDict_from_list(['RKEDQN','GASTPHY','CLVIMFW'])
# #'1'stand for Polar; '2'stand for Neutral, '3' stand for Hydrophobicity
Polarity_3 = TransDict_from_list(['LIFWCMVY','PATGS','HQRKNED']) #ProFeat based
# #'1'stand for (4.9-6.2); '2'stand for (8.0-9.2), '3' stand for (10.4-13.0)
Polarizability_3 = TransDict_from_list(['GASDT','CPNVEQIL','KMHFRYW'])
# #'1'stand for (0-0.108); '2'stand for (0.128-0.186), '3' stand for (0.219-0.409)
Charge_3 = TransDict_from_list(['KR','ANCQGHILMFPSTWYV','DE'])
# #'1'stand for Positive; '2'stand for Neutral, '3' stand for Negative
SecondaryStr_3 = TransDict_from_list(['EALMQKRH','VIYCWFT','GNPSD']) #Orig
# #1'stand for Helix; '2'stand for Strand, '3' stand for coil
NormVDWV_3 = TransDict_from_list(['GASTPDC','NVEQIL','MHKFRYW'])
# #1'stand for (0-2.78); '2'stand for (2.95-4.0), '3' stand for (4.03-8.08)
SolventA_3 = TransDict_from_list(['ALFCGIVW','RKQEND','MPSTHY'])
# #1'stand for Buried; '2'stand for Exposed, '3' stand for Intermediate
SurfaceTension_3 = TransDict_from_list(['GQDNAHR','KTSEC','ILMFPWYV'])
# Hierarchical Classification of Protein Folds Using a Novel Ensemble Classifier. PLoS ONE

THREE_LETTER_ALPH_NAMES = ['Disorder_3','Hydrophobicity_3',
    'Polarity_3','Polarizability_3','Charge_3','SecondaryStr_3',
    'NormVDWV_3','SolventA_3','SurfaceTension_3']

'Call alphabet by name from this dict, then feed value into translator func:'
REDUCED_ALPHABETS_TRANSDICTS = {
    'ofer14':(ofer14),
    'ofer_w8':ofer_w8,
    'ofer13':(ofer13),
    'ofer8':ofer8,
    'ofer_tail':ofer_tail,
    'gbm4':(gbm4),
    'murphy10':(murphy10),
    'hp_aroma_4':(hp_aroma_4),
    'hp2':(hp2),
    'hp3':(hp3),
    'alex6':(alex6),
    'sdm12':(sdm12),
    'hsdm17':(hsdm17),
    'murphy15':murphy15,
    'pc5':pc5,
    'Disorder_3':Disorder_3,
    'Hydrophobicity_3':Hydrophobicity_3,
    'Polarity_3':Polarity_3,
    'Polarizability_3':Polarizability_3,
    'Charge_3':Charge_3,
    'SecondaryStr_3':SecondaryStr_3,
    'NormVDWV_3':NormVDWV_3,
    'SolventA_3':SolventA_3,
    'hp3_Plus':hp3_Plus,
    'ofer_gbm5':ofer_gbm5,
    'shen7':shen7
    }


def Get_Alph_Letters(REDUCED_ALPHABETS_TRANSDICTS):
    # Build {alphabet_name: retained letters} for every registered alphabet.
    # (NOTE: the parameter shadows the module-level dict of the same name.)
    REDUCED_ALPHABETS_LETTERS = defaultdict(str)
    for k,v in REDUCED_ALPHABETS_TRANSDICTS.items():
        REDUCED_ALPHABETS_LETTERS[k]=Get_Letters(v)
    REDUCED_ALPHABETS_LETTERS['AA20'] = 'ACDEFGHIKLMNPQRSTVWY' #Include full, nonreduced alphabet.
    return REDUCED_ALPHABETS_LETTERS

'Make this run once! Not every time method is called! (Potentially)'
# Computed once at import time.
REDUCED_ALPHABETS_LETTERS = Get_Alph_Letters(REDUCED_ALPHABETS_TRANSDICTS)

##############################################################################

if __name__=="__main__":
    # Smoke-test demo: translate a sample protein with a couple of alphabets.
    print("ofer13KR transdict:")
    print(ofer13KR)
    print()
    '''Check this all works..'''
    # print(Reduced_Alphabets)
    protein="MQNEEDACLEAGYCLGTTLSSWRLHFMEEQSQSTMLMGIGIGALLTLAFVGIFFFVYRRVRRLRRAEDQQGTDDESDYQTEYEEELPAIPKETYADFQSTGIELDSDSEYEPSMLQGPPSLTSPEQSQDSFPWLPNQDDQGPRLEHPS"
    print(REDUCED_ALPHABETS_TRANSDICTS['gbm4'])
    print(translate_sequence(protein,REDUCED_ALPHABETS_TRANSDICTS['gbm4']))
    print(REDUCED_ALPHABETS_LETTERS)
    print(REDUCED_ALPHABETS_LETTERS['ofer14'])
    # for k,v in REDUCED_ALPHABETS_LETTERS.items():
    #     print (str(k), str(len(set(v))))
    print(translate_sequence(protein,REDUCED_ALPHABETS_TRANSDICTS['Charge_3']))



'''
#Internet:
import string
s='abracadabra'
from_list='abcdr'
to_list='?*!@|'
print s.translate(string.maketrans(from_list,to_list)),
# ?*|?!?@?*|?
'''
--------------------------------------------------------------------------------
/py/asap/features_deps/Disorder.py:
--------------------------------------------------------------------------------
__author__ = 'DanaLab'
'''
Look at using:
Get_ParamScales() from protfeat - and import scales from AAScales.py?
(Also, calc. normalized KD scale, and save it (modify file) to AAScales.py and import from there
= performance.
Also, AAScales should hold (import) the TDP-IDP scale. - Dan. )

netCharge; calculateAminoAcidCharge - why not use built in ones from main ProtFeat?
(also - names of methods here/there are SAME!
=> asking for bugs when importing, calling methods..) | Change method names.

Look at using different PHs for netcharge calcing. 
(This would be a Different feature of
course; i.e diff key name in res-dict)
'''
from collections import Counter
# from ProtFeat import pKa

# Sidechain/terminal pKa values and formal charge signs used by netCharge().
pKa = {'D':3.9, 'E':4.3, 'H':6.1, 'C':8.3, 'Y':10.1, 'K':10.5, 'R':12, 'N-term':8, 'C-term':3.1}
charges = {'D':-1, 'E':-1, 'H':+1, 'C':-1, 'Y':-1, 'K':1, 'R':1, 'N-term':1, 'C-term':-1}


# @staticmethod
def netCharge(seq,pH = 7.2): #maybe ReName, to "subseq_ .." , to avoid confusion with "calculateProteinCharge", get_netCharge, From ProtFeat.py ? (OR use them directly) - D
    """
    Henderson-Hasselbalch-style net charge of seq at the given pH.

    :param seq: amino-acid sequence (string)
    :param pH: pH at which to evaluate the charge (default 7.2)
    :return: summed fractional charge (float)

    NOTE(review): Counter(seq) counts single characters only, so the
    'N-term'/'C-term' entries of pKa always get a count of 0 -- terminal
    charges never actually contribute. Confirm whether that is intended.
    """
    aa_counts = Counter(seq)
    # pH = 7.2
    res = 0.0

    def calculateAminoAcidCharge(amino_acid, pH):
        # Fraction of the charged species for this residue at the given pH.
        ratio = 1 / (1 + 10 ** (pH - pKa[amino_acid]))
        if charges[amino_acid] == 1:
            return ratio
        else:
            return ratio - 1

    for amino_acid in pKa:
        res += aa_counts[amino_acid] * calculateAminoAcidCharge(amino_acid, pH)
    return res


# @staticmethod
def hydrophobicity(seq):
    # Kyte-Doolittle hydropathy values (includes ambiguous codes B/X/Z and U).
    hydropathy = {'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
                  'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8, 'T': -0.7, 'U': 0.0, 'V': 4.2, 'W': -0.9,
                  'Y': -1.3, 'B': -3.5, 'X': -0.49, 'Z': -3.5}
    WINDOW_SIZE = 5
    NORME = True # NORME = ?

    def normalizeHydropathy():
        """
        Min-max rescale the hydropathy table (in place) to the 0-1 range.
        """
        minimum = min(hydropathy.values())
        maximum = max(hydropathy.values())
        for key in hydropathy.keys():
            oldVal = hydropathy[key]
            hydropathy[key] = (oldVal - minimum) / (maximum - minimum)

    def kyteDoolittle(seq, windowSize, normaliz):
        """
        Sliding-window Kyte-Doolittle hydropathy average over seq.

        :param seq: sequence to score
        :return: (mean of window scores over len(seq), per-position window scores)
        """
        seq = seq.strip()
        if normaliz:
            normalizeHydropathy()
        maxJump = int((windowSize - 1) / 2) #MOD
        result = 0
        subResultsArr = []
        # Edge positions (no full window) get a score of 0.
        for i in range(maxJump):
            subResultsArr.append(0)
        for i in range(maxJump, len(seq) - maxJump):
            summ = 0
            for j in range(-maxJump, maxJump + 1):
                key = seq[i + j]
                if key in hydropathy.keys():
                    summ += hydropathy[key]
                else:
                    #print(key)
                    pass
            subResultsArr.append(summ / windowSize)
            result += summ / windowSize

        result /= len(seq)
        return result, subResultsArr

    return kyteDoolittle(seq, WINDOW_SIZE, NORME)[0]


def uversky(seq): #As implemented, this gets the foldindex for WHOLE seq; vs segments/window.
    '''
    FoldIndex method prediction of disorder.
    '''
    #Why use sep. Seq? Use seq=self.seq for consistency/less confusion, ne?
    R = netCharge(seq) #Maybe have this for different PHs.
    H = hydrophobicity(seq)
    uScore = (2.785 * float(H) - 1.151 - float(R))
    return uScore

def getDisordered(seq,segments=5):
    '''
    Get predicted disorder for protein, divided into segments (default=5).,
    predicted individually for entirety of each segment; using:
    A) FoldIndex (Uversky) method.

    #I'd add other method(s) for getting predicted disordered here also. (EG, TDP-IDP scale, whether separate or "joint"feature) - D
    '''

    # seq = self.seq

    length = len(seq)
    window_size = int(length / segments) # window size 20% of the protein length
    pos = 0
    scores = [0 for _ in range(segments)]

    # Score the first (segments - 1) equal windows; the last window absorbs
    # any remainder of the sequence.
    for i in range(segments-1):
        scores[i] = uversky(seq[pos:pos + window_size])
        pos += window_size
    scores[-1] = uversky(seq[pos:])

    res = {}
    key = "Disordered window "

    for i, uscore in enumerate(scores):
        res[key + str(i)] = 1 if uscore < 0 else 0 #ORIG
        # res[key + str(i)] = 1 if uscore != 0 else 0
        'Binary feature - presence of ANY disordered window:'
        # NOTE(review): this key is overwritten on every iteration, so it
        # reflects only the LAST window rather than "any" window -- confirm intent.
        if uscore < (-0.1):
            res['AnyDISORDER_'+str(segments)]=1
        else:
            res['AnyDISORDER_'+str(segments)]=0

    return res
--------------------------------------------------------------------------------
/py/asap/parse.py:
--------------------------------------------------------------------------------
import logging

from Bio import SeqIO

from . import util
from . import config
from . import data

LOGGER = logging.getLogger('PARSE')

def convert_lf_to_fasta(source, output_file):

    '''
    Converts a .lf file, which also contains annotations, to a .fasta file, which contains only the amino-acid sequences
    of the records.
    @param source (file handle):
        The source .lf file to read.
    @param output_file (file handle):
        A file handle with writing permissions to write the output FASTA into.
20 | ''' 21 | 22 | full_records = parse_records_from_file(source, extract_annotations = True) 23 | fasta_records = [full_record.to_fasta_record() for full_record in full_records] 24 | SeqIO.write(fasta_records, output_file, 'fasta') 25 | 26 | def parse_records_from_file(source, extract_annotations, relevant_ids = None, extra_tracks = {}): 27 | 28 | ''' 29 | Parses full data records from a file (either .lf format, which also includes annotation masks, 30 | or a simple fasta ) 31 | @param source (file): 32 | A file handle to parse the records from. 33 | @param extract_annotations (bool): 34 | Whether to expect a .lf format which also contains annotations, or a simple .fasta format. 35 | @param relevant_ids (collection, optional): 36 | An optional list of ids. If None, will do nothing. If provided with a collection, 37 | will return only records with the given ids. 38 | @param extra_tracks (dict, empty by default): 39 | Extra tracks to give the records, given in the following format: 40 | { 41 | track_name : { 42 | record_id: (seq, padding_value), 43 | ... 44 | } 45 | ... 46 | } 47 | @return: 48 | A generator for the parsed records (each of type FullDataRecord). 49 | ''' 50 | 51 | seqs = list(SeqIO.parse(source, 'fasta')) 52 | LOGGER.info('Parsing %d sequencess...' % len(seqs)) 53 | 54 | for seq in seqs: 55 | if relevant_ids is None or _format_id(seq.id) in relevant_ids: 56 | yield _parse_fasta_record(seq, extra_tracks, extract_annotations) 57 | 58 | def get_record_from_seq(seq, annotation_mask = None, extra_tracks = {}): 59 | 60 | ''' 61 | Creates a full data record from sequences. 62 | @param seq (string): 63 | The amino-acid sequence of the record 64 | @param annotation_mask (string, optional): 65 | A binary mask (made of 0's and 1's) in the same size of the given sequence to use 66 | as an annotation mask. If not provided, the record will not have an annotation mask. 
67 | @param extra_tracks (dict, empty by default): 68 | Extra tracks to give the record, given in the following format: 69 | { 70 | track_name: (seq, padding_value), 71 | ... 72 | } 73 | @return: 74 | A FullDataRecord created from the provided data. 75 | ''' 76 | 77 | sequence_tracks = data.SequenceTracks() 78 | sequence_tracks.add_track(data.SequenceTrack('aa', seq)) 79 | 80 | if annotation_mask is not None: 81 | sequence_tracks.add_track(data.SequenceTrack('annotation', annotation_mask)) 82 | 83 | for track_name, track_data in extra_tracks.items(): 84 | track_seq, track_padding_value = track_data 85 | sequence_tracks.add_track(data.SequenceTrack(track_name, track_seq, track_padding_value)) 86 | 87 | return data.FullDataRecord('N/A', 'N/A', 'N/A', sequence_tracks) 88 | 89 | def parse_track_from_file(source, type): 90 | 91 | ''' 92 | Parses the track data of multiple records from a FASTA file . 93 | @param source (file): 94 | The file handle to parse (in FASTA format) 95 | @param type (string): 96 | The type of the track to parse (options: seq, disorder, pssm) 97 | @return: 98 | A dictionary of the following format: 99 | { 100 | record_id: (seq, padding_value), 101 | ... 102 | } 103 | ''' 104 | 105 | track_file_parser, track_seq_parser, padding_value = _TRACK_TYPE_TO_PARSERS_AND_PADDING[type] 106 | track_data = {} 107 | 108 | for record_id, seq in track_file_parser(source): 109 | track_data[_format_id(record_id)] = (seq, padding_value) 110 | 111 | return track_data 112 | 113 | def parse_track_from_seq(seq, type): 114 | 115 | ''' 116 | Parses the track data of a single record from a raw sequence. 117 | @param seq (string): 118 | The raw sequence to parse 119 | @param type (string): 120 | The type of the track to parse (options: seq, disorder, pssm) 121 | @return: 122 | A tuple containing the parsed track sequence and its padding value. 
123 | ''' 124 | 125 | track_file_parser, track_seq_parser, padding_value = _TRACK_TYPE_TO_PARSERS_AND_PADDING[type] 126 | return track_seq_parser(seq), padding_value 127 | 128 | def _parse_fasta_record(fasta_seq, extra_tracks, extract_annotations): 129 | 130 | record_id = _format_id(fasta_seq.id) 131 | 132 | if extract_annotations: 133 | raw_seq_and_mask = str(fasta_seq.seq) 134 | mask_start_index = util.find_first_index_of(raw_seq_and_mask, '01') 135 | aa_seq = _fix_aa_seq(raw_seq_and_mask[:mask_start_index]) 136 | annotation_mask = raw_seq_and_mask[mask_start_index:] 137 | else: 138 | aa_seq = fasta_seq.seq 139 | annotation_mask = None 140 | 141 | sequence_tracks = data.SequenceTracks() 142 | sequence_tracks.add_track(data.SequenceTrack('aa', aa_seq)) 143 | 144 | if annotation_mask is not None: 145 | sequence_tracks.add_track(data.SequenceTrack('annotation', annotation_mask)) 146 | 147 | for track_name, extra_track_data in extra_tracks.items(): 148 | if record_id in extra_track_data: 149 | track_seq, track_padding_value = extra_track_data[record_id] 150 | sequence_tracks.add_track(data.SequenceTrack(track_name, track_seq, track_padding_value)) 151 | else: 152 | raise Exception('No record for %s in track %s' % (record_id, track_name)) 153 | 154 | return data.FullDataRecord(record_id, fasta_seq.name, fasta_seq.description, sequence_tracks) 155 | 156 | def _parse_seq_track_from_file(source): 157 | for seq in SeqIO.parse(source, 'fasta'): 158 | yield seq.id, str(seq.seq) 159 | 160 | def _parse_seq_track_from_seq(seq): 161 | return seq 162 | 163 | def _parse_disorder_track_from_file(source): 164 | for seq in SeqIO.parse(source, 'fasta'): 165 | yield seq.id, _parse_disorder_track_from_seq(str(seq.seq)) 166 | 167 | def _parse_disorder_track_from_seq(seq): 168 | disorder_start_index = util.find_first_index_of(seq, config.DISORDER_OPTIONS) 169 | return seq[disorder_start_index:] 170 | 171 | def _parse_pssm_track_from_file(source): 172 | for raw_record in 
source.read().split('>')[1:]: 173 | lines = raw_record.splitlines() 174 | record_id = lines[0].split(' ')[0] 175 | pssm = _parse_pssm(lines[1:]) 176 | yield record_id, pssm 177 | 178 | def _parse_pssm_track_from_seq(seq): 179 | return _parse_pssm(seq.splitlines()) 180 | 181 | def _parse_pssm(lines): 182 | 183 | pssm = [] 184 | 185 | for line in lines: 186 | freqs_vector = map(float, line.split(' ')[1:]) 187 | freqs_dict = dict(zip(config.PSSM_AMINO_ACIDS, freqs_vector)) 188 | freqs_dict['_'] = 0.0 189 | pssm += [freqs_dict] 190 | 191 | return pssm 192 | 193 | def _fix_aa_seq(seq): 194 | 195 | fixed_seq = '' 196 | 197 | for aa in seq: 198 | if aa in config.AMINO_ACIDS: 199 | fixed_seq += aa 200 | else: 201 | fixed_seq += '_' 202 | 203 | return fixed_seq 204 | 205 | def _format_id(id): 206 | return id.replace('|', '__').replace('-', '_') 207 | 208 | _TRACK_TYPE_TO_PARSERS_AND_PADDING = { 209 | 'seq': (_parse_seq_track_from_file, _parse_seq_track_from_seq, '_'), 210 | 'disorder': (_parse_disorder_track_from_file, _parse_disorder_track_from_seq, '_'), 211 | 'pssm': (_parse_pssm_track_from_file, _parse_pssm_track_from_seq, [dict([(aa, 0.0) for aa in config.PSSM_AMINO_ACIDS] + [('_', 1.0)])]), 212 | } 213 | -------------------------------------------------------------------------------- /py/asap/sklearn_extensions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This module contains extensions required by our project that we wish sklearn supported. 3 | ''' 4 | 5 | from sklearn.ensemble import RandomForestClassifier 6 | 7 | class RandomForestClassifierWithCoef(RandomForestClassifier): 8 | 9 | ''' 10 | A small hack required to make sklearn.ensemble.RandomForestClassifier support sklearn.feature_selection.RFECV. 
11 | ''' 12 | 13 | def fit(self, *args, **kwargs): 14 | ''' 15 | @see sklearn.ensemble.RandomForestClassifier.fit 16 | ''' 17 | super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs) 18 | self.coef_ = self.feature_importances_ 19 | 20 | class FeatureSelectionPipeline(object): 21 | 22 | ''' 23 | Like sklearn.pipeline.Pipeline, but suitable for feature selection only. 24 | Unfortunately we can't use sklearn.pipeline.Pipeline as it is, because it doesn't have the get_support method that we need. 25 | This class isn't intended for general purpose, and we do not recommend using it outside the context of this project. 26 | ''' 27 | 28 | def __init__(self, feature_selectors): 29 | 30 | ''' 31 | @param feature_selectors (list of feature selectors): 32 | The list of feature selectors to pipeline together. 33 | ''' 34 | 35 | if len(feature_selectors) == 0: 36 | raise Exception('Cannot pipeline an empty list of feature selectors') 37 | 38 | for feature_selector in feature_selectors: 39 | for method_name in ['fit', 'transform', 'fit_transform', 'get_support']: 40 | if not hasattr(feature_selector, method_name): 41 | raise Exception('Feature selectors must have a %s method' % method_name) 42 | 43 | self.feature_selectors = feature_selectors 44 | 45 | def fit(self, X, y): 46 | 47 | for feature_selector in self.feature_selectors[:-1]: 48 | X = feature_selector.fit_transform(X, y) 49 | 50 | self.feature_selectors[-1].fit(X, y) 51 | 52 | def transform(self, X): 53 | 54 | for feature_selector in self.feature_selectors: 55 | X = feature_selector.transform(X) 56 | 57 | return X 58 | 59 | def fit_transform(self, X, y): 60 | 61 | for feature_selector in self.feature_selectors: 62 | X = feature_selector.fit_transform(X, y) 63 | 64 | return X 65 | 66 | def get_support(self): 67 | 68 | support = self.feature_selectors[0].get_support() 69 | 70 | for feature_selector in self.feature_selectors[1:]: 71 | support = _embed_vector_in_mask(feature_selector.get_support(), support) 72 | 73 
| return support 74 | 75 | def _embed_vector_in_mask(vector, mask): 76 | 77 | ''' 78 | Embedding a vector inside the positive indices of a boolean mask. 79 | For example, if given the vector [x1, x2, x3] and the mask [0, 0, 0, 1, 0, 0, 1, 1], then the returned value will 80 | be [0, 0, 0, x1, 0, 0, x2, x3]. 81 | Note that the number of 1's in the mask must be equal to the length of the vector. 82 | ''' 83 | 84 | _validate_boolean(mask) 85 | 86 | if len(vector) != sum(mask): 87 | raise Exception('Cannot embed a vector of size %d in a mask with %d 1\'s' % (len(vector), sum(mask))) 88 | 89 | result = [0] * len(mask) 90 | vector_index = 0 91 | 92 | for i, flag in enumerate(mask): 93 | if flag: 94 | result[i] = vector[vector_index] 95 | vector_index += 1 96 | 97 | return result 98 | 99 | def _validate_boolean(mask): 100 | for flag in mask: 101 | if flag not in [0, 1]: 102 | raise Exception('Expecting a boolean mask, given %s element' % repr(flag)) 103 | 104 | -------------------------------------------------------------------------------- /py/asap/util.py: -------------------------------------------------------------------------------- 1 | def apply_mask(array, mask): 2 | if len(array) == len(mask): 3 | return [element for element, flag in zip(array, mask) if flag] 4 | else: 5 | raise Exception('Cannot apply a mask of a different length') 6 | 7 | def bit_to_bool(bit): 8 | return bit == '1' 9 | 10 | def find_first_index_of(string, character_list): 11 | for i, c in enumerate(string): 12 | if c in character_list: 13 | return i 14 | 15 | def format_as_csv_value(value): 16 | if type(value) == bool: 17 | if value: 18 | return '1' 19 | else: 20 | return '0' 21 | if type(value) == float: 22 | return '%.4f' % value 23 | else: 24 | return str(value) 25 | 26 | def write_csv_line(csv_writer, line): 27 | csv_writer.writerow(map(format_as_csv_value, line)) 28 | -------------------------------------------------------------------------------- /py/asap/window_extraction.py: 
-------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | import datetime 3 | import csv 4 | import logging 5 | 6 | from . import util 7 | from . import features 8 | from . import parse 9 | 10 | BASIC_HEADERS = [ 11 | 'peptide_id', 12 | 'window_hot_index', 13 | 'window_seq', 14 | 'window_neighbourhood', 15 | ] 16 | 17 | ANNOTATION_HEADERS = [ 18 | 'window_annotation_mask', 19 | 'window_label', 20 | 'window_only_almost_positive', 21 | ] 22 | 23 | META_WINDOW_HEADERS = BASIC_HEADERS + ANNOTATION_HEADERS 24 | 25 | LOGGER = logging.getLogger('EXTRACTION') 26 | 27 | class WindowExtractionParams(object): 28 | 29 | ''' 30 | Parameters that should be used when extracting windows and their features from full records. 31 | ''' 32 | 33 | def __init__(self, window_prefix = 9, window_suffix = 9, neighbourhood_prefix = 5, neighbourhood_suffix = 5, \ 34 | windows_filter = None, feature_keys = features.DEFAULT_FEATURE_KEYS): 35 | 36 | ''' 37 | @param window_prefix, window_suffix (int, both default 7): 38 | The number of residues before and after the hot index in each window, where the hot index is the position determining the label of 39 | the window (i.e. the window's label is the value of the annotation mask in the hot index). It follows that the total window size 40 | is (window_prefix + window_suffix + 1). 41 | @param neighbourhood_prefix, neighbourhood_suffix (int, both default 5): 42 | The number of residues before and after the hot index in determining the neighbourhood of the window. The neighbourhood can be used 43 | during the training process in order to avoid duplicates of very similar windows. 44 | @param windows_filter (function, optional): 45 | A function to filter the extracted windows by. The function will receive Window objects and should return a bool stating whether to 46 | include them or not. If not provided, no filtration will take place in the windows level, and all windows will be extracted. 
47 | @param feature_keys (list, optional, default features.DEFAULT_FEATURE_KEYS): 48 | A list of features to extract for each window. You can see the full list of optional keywords in features.FEARURE_KEY_OPTIONS, where 49 | documentation is also provided for each of the feature keys. If not provided, all features will be extracted by default (i.e. will 50 | use all of the features in features.FEARURE_KEY_OPTIONS). By default will use features.DEFAULT_FEATURE_KEYS, which is another list 51 | containing most of the features, but not all of them, as there are some features that it doesn't make much sense to use at the same 52 | time (e.g. both 'aa' and 'aa_reduced'). We anticipate that using the default features should give pretty good results in most 53 | scenarios, so fine-tuning the exact used features can be left for late stages of a project. 54 | Important note: Features that rely on extra tracks (ss, acc, disorder, pssm) will not be extracted if the tracks are not provided, 55 | even if those features are explicitly given in this list. 56 | ''' 57 | 58 | self.window_prefix = window_prefix 59 | self.window_suffix = window_suffix 60 | self.neighbourhood_prefix = neighbourhood_prefix 61 | self.neighbourhood_suffix = neighbourhood_suffix 62 | self.windows_filter = windows_filter 63 | self.feature_keys = feature_keys 64 | 65 | self.window_size = window_prefix + window_suffix + 1 66 | self.window_hot_index = window_prefix 67 | self.neighbourhood_size = neighbourhood_prefix + neighbourhood_suffix + 1 68 | 69 | def extract_windows_from_file(source, extract_annotations = False, seqs_filtration_file = None, \ 70 | extra_tracks_files = {}, csv_output_file = None, window_extraction_params = WindowExtractionParams()): 71 | 72 | ''' 73 | Parses a given file with peptide sequences and breaks it into windows with features, outputs a CSV with a row for each window 74 | and a column for each feature (along with a few other meta headers). 
75 | @param source (file): 76 | A file handle to parse the peptide sequences from. Can be either a .fasta or .lf format, depending on the extract_annotations 77 | parameter. 78 | @param extract_annotations (boolean, default False): 79 | Whether to expect finding annotation masks inside the given file of sequences. If set to True, will expect getting a .lf file 80 | which also contains annotations. If set to False, will expect getting a .fasta file that contains only the sequences. Annotations 81 | are required only if one plans using the extracted windows to train a new classifier, rather than using an existing one. 82 | @param seqs_filtration_file (file, optional): 83 | A fasta format file handle to use for filtering records. If given, will use only sequences with an ID that is also present in the 84 | ids of this FASTA file. If not given, will not perform any filtration. 85 | @param extra_tracks_files (dict, empty by default): 86 | A dictionary for providing extra tracks to extract data from (beside the actual amino-acid sequence and annotations mask). The 87 | given dictionary should map from a track name to a file handle containing the track data for each of the records in a FASTA 88 | format. The currently supported extra tracks are: ss (secondary-structure), acc (accessibility), disorder and pssm (position-specific 89 | scoring matrix). 90 | @param csv_output_file (file, optional): 91 | A file handle with writing permissions to write the output CSV into. If not provided, will return a StringIO object from which the 92 | output CSV can be read. 93 | @param window_extraction_params (WindowExtractionParams, default params by default): 94 | Parameters to use for extracting the windows. 95 | @return: 96 | If csv_output_file is given, will return nothing. If csv_output_file is not given, will return a a StringIO object from which the 97 | output CSV can be read. 
98 | ''' 99 | 100 | relevant_ids = _get_relevant_ids(seqs_filtration_file) 101 | extra_tracks = _get_extra_tracks_from_files(extra_tracks_files) 102 | full_records = list(parse.parse_records_from_file(source, extract_annotations, relevant_ids, extra_tracks)) 103 | parse.LOGGER.info('Final records: %d' % len(full_records)) 104 | _pad_records(full_records, window_extraction_params) 105 | return _extract_windows(full_records, csv_output_file, window_extraction_params) 106 | 107 | def extract_windows_from_seq(seq, annotation_mask = None, extra_tracks_data = {}, csv_output_file = None, \ 108 | window_extraction_params = WindowExtractionParams()): 109 | 110 | ''' 111 | Breaking a peptide sequence into windows with features, outputting a CSV with a row for each window and a column for each feature 112 | (along with a few other meta headers). 113 | @param seq (string): 114 | The peptide sequence to use, given in a 20 amino-acid alphabet. 115 | @param annotation_mask (string, optional): 116 | An annotation mask to use as a labeling for each position along the sequence. Expecting a binary sequence (of 0's and 1's) in 117 | the same length of the given amino-acid sequence. If not provided, the extracted windows won't have labels, meaning they cannot 118 | be used for training a new classifier (only fed to an already trained classifier). 119 | @param extra_tracks_data (dict, empty by default): 120 | A dictionary for providing extra tracks to extract data from (beside the actual amino-acid sequence and annotations mask). The 121 | given dictionary should map from track names to their sequence. Currently supported extra tracks are: ss (secondary-structure), 122 | acc (accessibility), disorder and pssm (position-specific scoring matrix). 123 | @param csv_output_file (file, optional): 124 | A file handle with writing permissions to write the output CSV into. If not provided, will return a StringIO object from which 125 | the output CSV can be read. 
126 | @param window_extraction_params (WindowExtractionParams, default params by default): 127 | Parameters to use for extracting the windows. 128 | @return: 129 | If csv_output_file is given, will return nothing. If csv_output_file is not given, will return a a StringIO object from which the 130 | output CSV can be read. 131 | ''' 132 | 133 | extra_tracks = _get_extra_tracks_from_raw_data(extra_tracks_data) 134 | full_record = parse.get_record_from_seq(seq, annotation_mask, extra_tracks) 135 | _pad_record(full_record, window_extraction_params) 136 | return _extract_windows([full_record], csv_output_file, window_extraction_params) 137 | 138 | def _get_relevant_ids(seqs_filtration_file): 139 | if seqs_filtration_file is None: 140 | return None 141 | else: 142 | relevant_ids = parse.parse_track_from_file(seqs_filtration_file, 'seq').keys() 143 | parse.LOGGER.info('%d records are in the filtration FASTA file' % len(relevant_ids)) 144 | return relevant_ids 145 | 146 | def _get_extra_tracks_from_files(extra_tracks_files): 147 | 148 | extra_tracks = {} 149 | 150 | for track_name, track_source in extra_tracks_files.items(): 151 | if track_name in _TRACK_NAME_TO_TYPE: 152 | track_type = _TRACK_NAME_TO_TYPE[track_name] 153 | extra_tracks[track_name] = parse.parse_track_from_file(track_source, track_type) 154 | else: 155 | raise Exception('Unknown track name: ' + str(track_name)) 156 | 157 | return extra_tracks 158 | 159 | def _get_extra_tracks_from_raw_data(extra_tracks_data): 160 | 161 | extra_tracks = {} 162 | 163 | for track_name, raw_track_seq in extra_tracks_data.items(): 164 | if track_name in _TRACK_NAME_TO_TYPE: 165 | track_type = _TRACK_NAME_TO_TYPE[track_name] 166 | extra_tracks[track_name] = parse.parse_track_from_seq(raw_track_seq, track_type) 167 | else: 168 | raise Exception('Unknown track name: ' + str(track_name)) 169 | 170 | return extra_tracks 171 | 172 | def _pad_records(records, window_extraction_params): 173 | for record in records: 174 | 
_pad_record(record, window_extraction_params) 175 | 176 | def _pad_record(record, window_extraction_params): 177 | record.pad(window_extraction_params.window_prefix - 1, window_extraction_params.window_suffix - 1) 178 | 179 | def _extract_windows(full_records, csv_output_file, window_extraction_params): 180 | if csv_output_file is None: 181 | csv_buffer = StringIO() 182 | _extract_windows_to_csv(full_records, csv_buffer, window_extraction_params) 183 | csv_buffer.seek(0) 184 | return csv_buffer 185 | else: 186 | _extract_windows_to_csv(full_records, csv_output_file, window_extraction_params) 187 | 188 | def _extract_windows_to_csv(full_records, output_file, window_extraction_params): 189 | 190 | LOGGER.info('Extracting windows with features in CSV format...') 191 | start = datetime.datetime.now() 192 | csv_writer = csv.writer(output_file) 193 | feature_headers = None 194 | include_annotations = None 195 | 196 | for record in full_records: 197 | for window in record.get_windows(window_extraction_params.window_size): 198 | feature_headers, include_annotations = _process_window_to_csv(window, csv_writer, window_extraction_params, \ 199 | feature_headers, include_annotations) 200 | 201 | time_diff = datetime.datetime.now() - start 202 | LOGGER.info('Done. Extraction took %d seconds.' 
% time_diff.total_seconds()) 203 | 204 | def _process_window_to_csv(window, csv_writer, window_extraction_params, feature_headers, include_annotations): 205 | 206 | if feature_headers is None: 207 | features = window.get_features(window_extraction_params.window_hot_index, window_extraction_params.feature_keys) 208 | feature_headers = list(sorted(features.keys())) 209 | include_annotations = window.has_annotation_mask() 210 | util.write_csv_line(csv_writer, _get_meta_headers(include_annotations) + feature_headers) 211 | 212 | if window_extraction_params.windows_filter is None or window_extraction_params.windows_filter(window): 213 | meta_values = _get_window_meta_values(window, window_extraction_params, include_annotations) 214 | features = window.get_features(window_extraction_params.window_hot_index, window_extraction_params.feature_keys) 215 | feature_values = [features[header] for header in feature_headers] 216 | util.write_csv_line(csv_writer, meta_values + feature_values) 217 | 218 | return feature_headers, include_annotations 219 | 220 | def _get_meta_headers(include_annotations): 221 | if include_annotations: 222 | return BASIC_HEADERS + ANNOTATION_HEADERS 223 | else: 224 | return BASIC_HEADERS 225 | 226 | def _get_window_meta_values(window, window_extraction_params, include_annotations): 227 | 228 | hot_index = window.original_index + window_extraction_params.window_hot_index 229 | neighbourhood = window.get_neighbourhood(window_extraction_params.window_hot_index, window_extraction_params.neighbourhood_prefix, \ 230 | window_extraction_params.neighbourhood_suffix) 231 | meta_values = [window.full_record.id, hot_index, window.get_aa_seq(), neighbourhood] 232 | 233 | if include_annotations: 234 | label = window.get_label(window_extraction_params.window_hot_index) 235 | is_only_almost_positive = window.is_only_almost_positive(window_extraction_params.window_hot_index) 236 | meta_values += [window.get_annotation_mask(), label, is_only_almost_positive] 237 | 238 
| return meta_values 239 | 240 | _TRACK_NAME_TO_TYPE = { 241 | 'ss': 'seq', 242 | 'acc': 'seq', 243 | 'disorder': 'disorder', 244 | 'pssm': 'pssm', 245 | } 246 | -------------------------------------------------------------------------------- /py/cleavepred/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import simple_cleavage_predictor, advanced_cleavage_predictor -------------------------------------------------------------------------------- /py/cleavepred/api.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from . import project_paths 4 | 5 | class CleavagePredictor(object): 6 | 7 | ''' 8 | A predictor trained to predict the cleavage of peptides. 9 | There should be only two instances of this class: 10 | 1. simple_cleavage_predictor - Uses only the basic features derived from the amino-acid sequence of peptides. 11 | 2. advanced_cleavage_predictor - Used also features derived from external tools (ss, acc, disorder and pssm). 12 | ''' 13 | 14 | def __init__(self, advanced): 15 | self.advanced = advanced 16 | self._peptide_predictor = None 17 | 18 | def predict(self, seq, extra_tracks_data = {}, proba = False): 19 | ''' 20 | Predicts cleavage for a given peptide. 21 | @param seq (string): 22 | The amino-acid sequence of the peptide to predict the annotations for, given in a 20 amino-acid alphabet. 23 | @param extra_tracks_data (dict, empty by default): 24 | A dictionary for providing extra tracks of the given peptide. If using the simple predictor (i.e. advanced = False), it can 25 | be left empty. If using the advanced predictor (i.e. advanced = True), must receive all tracks (i.e. ss, acc, disorder 26 | and pssm). The given dictionary should map from track names to their sequence. 27 | @param proba (default False): 28 | Whether to return mask of predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s). 
29 | @return: 30 | A tuple composed of: 31 | 1. cleavage_mask - If proba = False, it will be a binary string (0's and 1's) representing whether each residue is a cleavage 32 | site (1) or not (0). If proba = True, it will be a list of floats (between 0 to 1) representing the probability of each residue 33 | to be a cleavage site. Either way, the length of the returned string/list will correspond to the length of the provided peptide 34 | sequence. 35 | 2. cleavage_products - A list of strings, each representing the amino-acid sequence of a predicted cleavage product. 36 | ''' 37 | cleavage_mask = self.get_peptide_predictor().predict_annotations(seq, extra_tracks_data = extra_tracks_data, proba = proba) 38 | cleavage_products = _get_cleavage_products(seq, cleavage_mask) 39 | return cleavage_mask, cleavage_products 40 | 41 | def get_peptide_predictor(self): 42 | 43 | ''' 44 | @return: 45 | The PeptidePredictor object associated with this cleavage predictor. 46 | ''' 47 | 48 | if self._peptide_predictor is None: 49 | self._peptide_predictor = self._load_peptide_predictor() 50 | 51 | return self._peptide_predictor 52 | 53 | def _load_peptide_predictor(self): 54 | 55 | predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(self.advanced), 'rb') 56 | 57 | try: 58 | return pickle.load(predictor_dump_file) 59 | finally: 60 | predictor_dump_file.close() 61 | 62 | simple_cleavage_predictor = CleavagePredictor(False) 63 | advanced_cleavage_predictor = CleavagePredictor(True) 64 | 65 | def _get_cleavage_products(seq, cleavage_mask): 66 | 67 | products = [] 68 | current_product = '' 69 | 70 | for i in range(len(seq)): 71 | 72 | current_product += seq[i] 73 | 74 | # When we have continuous positive cleavage sites, we consider only the most C-terminus one. 
75 | if _is_cleavage(cleavage_mask[i]) and (i >= len(seq) - 1 or not _is_cleavage(cleavage_mask[i + 1])): 76 | _add_if_not_empty(products, current_product) 77 | current_product = '' 78 | 79 | _add_if_not_empty(products, current_product) 80 | return products 81 | 82 | def _is_cleavage(label): 83 | if isinstance(label, str): 84 | return label == '1' 85 | elif isinstance(label, int) or isinstance(label, float): 86 | return int(round(label)) == 1 87 | else: 88 | raise Exception('Unknown label type: ' + str(type(label))) 89 | 90 | def _add_if_not_empty(array, string): 91 | if len(string) > 0: 92 | array += [string] 93 | -------------------------------------------------------------------------------- /py/cleavepred/check_top_features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Checks the top features predicted for a given dataset. 3 | Arguments: 4 | - dataset_name (string): The name of the dataset to use. 5 | - advanced (boolean): Whether to use all or only some features, corresponding to simple and advanced classifiers respectively. 
6 | ''' 7 | 8 | import sys 9 | import logging 10 | 11 | import pandas as pd 12 | 13 | from asap import get_top_features 14 | 15 | from cleavepred import util 16 | from cleavepred import project_paths 17 | 18 | logger = logging.getLogger('FEATURES') 19 | 20 | ### Parse arguments ### 21 | 22 | project_paths.dataset_name = sys.argv[1].lower() 23 | advanced = util.parse_bool(sys.argv[2]) 24 | 25 | ### Get top features ### 26 | 27 | windows_file = None 28 | 29 | def open_files(): 30 | global windows_file 31 | windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb') 32 | 33 | def close_files(): 34 | util.close_files([windows_file]) 35 | 36 | def get_advanced_label(): 37 | if advanced: 38 | return 'advanced' 39 | else: 40 | return 'simple' 41 | 42 | def check_top_features(): 43 | windows_data_frame = pd.read_csv(windows_file) 44 | logger.info('Checking top features over %s dataset with %s features...' % (project_paths.dataset_name, get_advanced_label())) 45 | top_features = get_top_features(windows_data_frame, drop_only_almost_positives = True) 46 | logger.info('Top features: ' + ', '.join(top_features)) 47 | 48 | if __name__ == '__main__': 49 | try: 50 | open_files() 51 | check_top_features() 52 | finally: 53 | close_files() 54 | -------------------------------------------------------------------------------- /py/cleavepred/common.py: -------------------------------------------------------------------------------- 1 | from asap import FEATURE_KEY_OPTIONS, WindowExtractionParams 2 | from asap.config import POSITIVE_AMINO_ACIDS 3 | 4 | AVAILABLE_TRACKS = [ 5 | 'ss', 6 | 'acc', 7 | 'disorder', 8 | 'pssm', 9 | ] 10 | 11 | # Here we prefer using 'aa_reduced' over 'aa'. We give up on some other features. 
12 | USED_FEATURES = set(FEATURE_KEY_OPTIONS).difference(['aa', 'accum_charge_left', 'accum_charge_right', 'accum_pos_charge_left', 'accum_pos_charge_right']) 13 | 14 | def windows_filter(window): 15 | ''' 16 | We consider only windows with a positively charged amino-acid (i.e. K/R) in the hot index (only then it can be a 17 | cleavage candidate). 18 | ''' 19 | return window.get_aa_seq()[window_extraction_params.window_hot_index] in POSITIVE_AMINO_ACIDS 20 | 21 | window_extraction_params = WindowExtractionParams(window_prefix = 11, window_suffix = 8, neighbourhood_prefix = 5, \ 22 | neighbourhood_suffix = 5, windows_filter = windows_filter, feature_keys = USED_FEATURES) 23 | -------------------------------------------------------------------------------- /py/cleavepred/extract_uniprot_annotated_seqs_from_xml.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extract a .lf file, containing sequences and cleavage annotation masks, out of a UniProt's XML file. 3 | Arguments: 4 | - output_file_path (file path, optional): The path to write the output .lf file to. If not provided, will update the project's relevant file. 
'''
Extract a .lf file, containing sequences and cleavage annotation masks, out of a UniProt's XML file.
Arguments:
- output_file_path (file path, optional): The path to write the output .lf file to. If not provided, will update the project's relevant file.
'''

import sys
import re
import xml.etree.ElementTree as et
# Note: the unused, Python-2-only 'from StringIO import StringIO' import was removed.

from cleavepred import util
from cleavepred import project_paths

# This script is only meaningful for the UniProt dataset.
project_paths.dataset_name = 'uniprot'

if len(sys.argv) > 1:
    output_file_path = sys.argv[1]
else:
    output_file_path = project_paths.get_annotated_seqs_file_path()

def get_unique(element, xpath):
    '''
    Return the single subelement matching xpath, or None when there is no match.
    Raises if the match is ambiguous (more than one subelement).
    '''
    subelements = element.findall(xpath)
    if len(subelements) == 0:
        return None
    if len(subelements) == 1:
        return subelements[0]
    else:
        raise Exception('%d subelements: %s' % (len(subelements), xpath))

def parse_uniprot_xml(raw_xml_path):
    # Strip the XML namespace declaration so plain (unqualified) xpaths work below.
    raw = util.read_file(raw_xml_path)
    fixed_raw = re.sub(r'xmlns="[^"]*"', '', raw)
    return et.fromstring(fixed_raw)

def get_proteins_with_cleavage_sites(raw_xml_path):
    '''
    Yield (accession, seq, cleavage_sites, signal_peptide_end) for every UniProt entry
    that has usable cleavage annotations.
    '''
    root = parse_uniprot_xml(raw_xml_path)

    for entry in root.findall('./entry'):

        accession = entry.findall('./accession')[0].text
        raw_seq = get_unique(entry, './sequence').text
        seq = re.sub(r'\s', '', raw_seq)

        signal_peptide_end = 0
        cleavage_sites = set()
        skip_protein = False

        for feature in entry.findall('./feature'):

            # Renamed from 'type' to avoid shadowing the builtin.
            feature_type = feature.get('type').lower()

            if feature_type in ['peptide', 'chain', 'propeptide', 'signal peptide']:

                # Positions may be missing (get_unique -> None -> AttributeError),
                # absent attributes (int(None) -> TypeError) or malformed (ValueError).
                # The original bare 'except:' clauses were narrowed accordingly.
                try:
                    begin = int(get_unique(feature, './location/begin').get('position'))
                except (AttributeError, TypeError, ValueError):
                    begin = None

                try:
                    end = int(get_unique(feature, './location/end').get('position'))
                except (AttributeError, TypeError, ValueError):
                    end = None

                if feature_type == 'signal peptide':
                    if end is None:
                        print ('%s: no end to signal peptide. We will ignore this protein.' % accession)
                        skip_protein = True
                        break
                    else:
                        signal_peptide_end = max(signal_peptide_end, end)
                else:

                    if begin is not None:
                        cleavage_sites.add(begin - 1)
                        cleavage_sites.add(begin - 2)

                    if end is not None:
                        if feature_type == 'propeptide':
                            cleavage_sites.add(end - 1)
                        else:
                            cleavage_sites.add(end)

        if skip_protein:
            continue

        # Keep only K/R sites safely inside the mature sequence.
        cleavage_sites = set([i for i in cleavage_sites if i >= signal_peptide_end + 3 and i < len(seq) - 3 and seq[i] in 'KR'])
        cleavage_sites_to_remove = set([i - 1 for i in cleavage_sites]) # If 11, we take only the second
        cleavage_sites = cleavage_sites.difference(cleavage_sites_to_remove)

        if cleavage_sites: # we don't want samples with no cleavages at all - it's probably a mistake
            yield accession, seq, cleavage_sites, signal_peptide_end

def cleavage_sites_to_mask(seq_length, cleavage_sites):
    # Binary mask string of seq_length characters, '1' at each cleavage-site index.
    mask = ['0'] * seq_length
    for cleavage_site in cleavage_sites:
        mask[cleavage_site] = '1'
    return ''.join(mask)

def remove_xs(seq, mask):
    # Drop unknown residues ('x'/'X') together with their mask labels, keeping both aligned.
    revised_seq = ''
    revised_mask = ''
    for aa, label in zip(seq, mask):
        if aa.lower() != 'x':
            revised_seq += aa
            revised_mask += label
    return revised_seq, revised_mask

def space_seq(seq, chunk_length = 10):
    # Format a sequence in space-separated chunks, as expected by the .lf format.
    return ' '.join(util.split_to_chunks(seq, chunk_length))

def write_fasta_like_record(file, accession, seq, mask):
    file.write('>' + accession + '\n')
    file.write(space_seq(seq) + '\n')
    file.write(space_seq(mask) + '\n')
    file.write('\n')

if __name__ == '__main__':

    output_file = open(output_file_path, 'wb')

    try:
        for accession, seq, cleavage_sites, signal_peptide_end in get_proteins_with_cleavage_sites(project_paths.get_raw_data_xml_file_path()):
            # The signal peptide is chopped off; the mask must be sliced the same way.
            mask_to_write = cleavage_sites_to_mask(len(seq), cleavage_sites)[signal_peptide_end:]
            seq_to_write = seq[signal_peptide_end:]
            seq_to_write, mask_to_write = remove_xs(seq_to_write, mask_to_write)
            write_fasta_like_record(output_file, accession, seq_to_write, mask_to_write)
    finally:
        output_file.close()

    print('Done.')
'''
A script to extract the window features for a given dataset.
Arguments:
- dataset_name (string): The name of the dataset to extract the window features for.
- advanced (boolean): Whether to use the extra tracks when extracting the windows, or extracting only the simple sequence-based features.
- output_file_path (file path, optional): The path to write the output CSV to. If not provided, will update the project's relevant file.
'''

import sys

import asap

from cleavepred import util
from cleavepred import project_paths
from cleavepred.common import window_extraction_params

### Parse arguments ###

project_paths.dataset_name = sys.argv[1]
advanced = util.parse_bool(sys.argv[2])

if len(sys.argv) > 3:
    output_file_path = sys.argv[3]
else:
    output_file_path = project_paths.get_window_features_file_path(advanced)

### Extract the windows ###

# File handles, populated by open_files() and released by close_files().
annotated_seqs_file = None
seqs_filtration_file = None
csv_output_file = None
extra_tracks_files = {}

def open_files():
    global annotated_seqs_file, seqs_filtration_file, csv_output_file, extra_tracks_files
    annotated_seqs_file = open(project_paths.get_annotated_seqs_file_path(), 'rb')
    seqs_filtration_file = open(project_paths.get_filtered_seqs_file_path(), 'rb')
    csv_output_file = open(output_file_path, 'wb')
    if advanced:
        # Advanced features also require the extra per-residue annotation tracks.
        extra_tracks_files.update({name: open(path, 'rb') for name, path in project_paths.get_track_file_paths().items()})

def close_files():
    util.close_files([annotated_seqs_file, seqs_filtration_file, csv_output_file] + list(extra_tracks_files.values()))

def extract_windows():
    asap.extract_windows_from_file(annotated_seqs_file, extract_annotations = True, seqs_filtration_file = seqs_filtration_file, \
            extra_tracks_files = extra_tracks_files, csv_output_file = csv_output_file, window_extraction_params = window_extraction_params)

if __name__ == '__main__':
    try:
        open_files()
        extract_windows()
    finally:
        close_files()
'''
Script to run disopred3 on a multifasta file, then parse and collate the output.
Steps:
1. Extract each fasta from a copy of original multifasta file. (Stored in a seperate directory)
2. (Opt?) Remove original multifasta file.
3. Run disopred on each fasta.
4. i. gather the data/output for all the fastas. (output file name is the fasta's name).
4. ii. Clean the format (for each file), and save to file: "output_feat.diso", (in a format like lf/ss/acc)
4.iii. Save this output to the external features folder.
5. Import from the standard pipeline (config) / not here.
'''

import os
import subprocess
from subprocess import call
import sys
import csv
import glob
import pandas as pd
from Bio import SeqIO


# Location of the directory containing "run_disopred.pl"
DISOPRED_LOCATION = r'/cs/stud/danofer/Desktop/danofer/Software/DISOPRED'
DISO_PROG = 'run_disopred.pl'

# Hard-coded data locations (site-specific paths).
FASTA_LOCATION = '/a/fr-05/vol/protein/danofer/imac/Desktop/DFTP/'
FASTA_TARGET = 'D3_TEST_50_FILT_mod.fasta'

SPLIT_FASTAS_DIR = 'splitfasta1'
split_fastas_dir = os.path.join(FASTA_LOCATION, SPLIT_FASTAS_DIR)

file_in = os.path.join(FASTA_LOCATION, FASTA_TARGET)

# Destination for fastas that still need predictions when resuming an interrupted run.
ALT_DIR_OUTPUT = os.path.join(FASTA_LOCATION, 'altFiltered')


def parse_DISOPRED():
    '''
    Parse all pbdat files in a dir, (Output of DISOPRED)
    save their output in a format like hssp/ss/acc
    '''
    os.chdir(split_fastas_dir)

    files = glob.glob('*.pbdat')
    print('amount of .pbdat files:', len(files))

    output_path = os.path.join(FASTA_LOCATION, 'output_feat.DISO')
    print('Joined results will be saved to ', output_path)

    with open(output_path, 'w') as output_file:
        for pbdat_path in files:
            accession = str('>' + os.path.splitext(os.path.basename(pbdat_path))[0])
            seq = []
            diso_state = []
            # Fixed: input files are now closed (the original left every handle open).
            with open(pbdat_path) as pbdat_file:
                for line in pbdat_file:
                    parts = line.strip(' \n \t').split(' ')
                    if parts[0] != '#':
                        # Column 1 is the residue, column 2 the disorder state.
                        seq += parts[1]
                        diso_state += parts[2]
            output_file.write(accession + '\n')
            output_file.write(''.join(seq) + '\n')
            output_file.write(''.join(diso_state) + '\n')
    print("Saved to output_feat.DISO")


def split_fasta(filter = False):
    '''
    https://py4bio.wordpress.com/2009/07/22/split_fasta_file/
    This script takes a fasta file and split it in one file per fasta entry.
    It saves the outputs fastas in a new directory
    '''
    os.chdir(os.path.dirname(FASTA_LOCATION))
    print("Current working Directory:", os.getcwd())
    filter_fastas = []
    file_in = os.path.join(FASTA_LOCATION, FASTA_TARGET)
    split_output_dir = SPLIT_FASTAS_DIR

    if filter:
        # Skip sequences that already have DISOPRED output; write the rest elsewhere.
        filter_fastas = filter_fasta_queries(fastas_dir = split_fastas_dir)
        split_output_dir = ALT_DIR_OUTPUT

    if not os.path.exists(split_output_dir):
        os.makedirs(split_output_dir)

    os.chdir(split_output_dir)

    i = 0
    read_counts = 0
    for record in SeqIO.parse(open(file_in), "fasta"):
        read_counts += 1
        if record.id not in filter_fastas:
            f_out = (record.id + '.fasta')
            print('save to:', f_out)
            with open(f_out, "w") as handle:
                SeqIO.write([record], handle, "fasta")
            i += 1

    print(read_counts, " = Fastas in the original multifasta-file")
    print(i, " = # Splitted Fasta files made")


def call_DISOPRED(split_fastas_list):
    # Run DISOPRED once per split fasta file, from within the DISOPRED directory.
    os.chdir(os.path.dirname(DISOPRED_LOCATION))
    print(os.getcwd())
    print("In DisoPred folder")
    for i, fasta in enumerate(split_fastas_list):
        print(i)
        print(fasta)
        subprocess.call([os.path.join(DISOPRED_LOCATION, DISO_PROG), fasta])
        print()


def filter_fasta_queries(fastas_dir = split_fastas_dir):
    '''
    If Disopred job was interrupted in midway -
    This lets us continue (in a new dir) for
    only those sequences that do not have
    Disopred predictions = *.pbdat
    '''
    os.chdir(fastas_dir)
    files = glob.glob('*.pbdat')
    print('# .pbdat files = fastas that were processed succesfully, previously:', len(files))
    ids = [os.path.splitext(os.path.basename(f))[0] for f in files]
    print('len(ids)', len(ids))
    return ids


def find_missing():
    '''
    Disopred seems to "miss" some fastas.
    This helps us find them, assuming the
    disopred output is in the same dirr as
    the (split / "filtered") our fasta candidates
    '''
    os.chdir(split_fastas_dir)

    fastas = [f for f in os.listdir('.') if f.endswith('.fasta')]
    ids = [os.path.splitext(os.path.basename(f))[0] for f in fastas]
    print('# Fastas present:', str(len(ids)))

    # Fixed: the original wrote 'preds = fastas = [...]', accidentally rebinding 'fastas'.
    preds = [f for f in os.listdir('.') if f.endswith('.pbdat')]
    dis_ids = [os.path.splitext(os.path.basename(f))[0] for f in preds]
    print('# Predictions present:', str(len(dis_ids)))

    missing = [a for a in ids if a not in dis_ids]
    print('Missing IDs:')
    print(missing)
    return missing


if __name__ == '__main__':

    SPLIT_F = False
    CALL_DISO = False
    USE_FILTERED = False

    RESUME_DISO_PARTIAL = True

    if SPLIT_F:
        split_fasta()

    split_fastas_list = glob.glob(os.path.join(split_fastas_dir, '*.fasta'))
    print('\n split_fastas_list in dir: ', len(split_fastas_list))

    if USE_FILTERED:
        filt_split_fastas_list = glob.glob(os.path.join(split_fastas_dir, '*.fasta'))
        print('\n filt_split_fastas_list ', len(filt_split_fastas_list))
        print(filt_split_fastas_list[0])
        print(filt_split_fastas_list[1])
        call_DISOPRED(filt_split_fastas_list)

    if CALL_DISO:
        call_DISOPRED(split_fastas_list)
        print("\n DISOPRED DONE! \n")

    parse_DISOPRED()

    missing_ID = find_missing()
    if RESUME_DISO_PARTIAL:
        print("\n Calling disopred on missing IDs")
        print("Missing: \n", missing_ID)
        call_DISOPRED(split_fastas_list)
        print("\n DISOPRED DONE! \n")
'''
A master script to produce all the required auto-generated files:
1. Uniprot's annotated seqs .lf file
2. CSVs of the windows with features
3. Pickle dump files of trained predictors.
Just execute this script as it is, with no arguments. Make sure to be in the py/ directory when running it.
'''

import sys
import logging

# For logger initialization
import asap

LOGGING_PREFIX = '********** '

logger = logging.getLogger('EXEC')

# Build the full run plan up-front: (log description, argv to patch in, script path).
# Each sub-script reads its inputs from sys.argv, so it is patched before each run.
runs = [('Running extract_uniprot_annotated_seqs_from_xml.py', [''], 'cleavepred/extract_uniprot_annotated_seqs_from_xml.py')]

# CSVs of windows with features, for every dataset / feature-set combination.
for dataset in ['neuropred', 'uniprod' if False else 'uniprot']:
    for advanced in ['false', 'true']:
        runs.append(('Running extract_windows.py with dataset="%s" and advanced="%s"' % (dataset, advanced), ['', dataset, advanced], 'cleavepred/extract_windows.py'))

# Pickle dump files of trained predictors.
for advanced in ['false', 'true']:
    runs.append(('Running train_classifier.py with advanced="%s"' % advanced, ['', advanced, 'auto'], 'cleavepred/train_classifier.py'))

# execfile must stay at module top level so the executed scripts' top-level
# definitions land in a real module namespace.
for description, argv, script_path in runs:
    logger.info(LOGGING_PREFIX + description)
    sys.argv = argv
    execfile(script_path)

logger.info(LOGGING_PREFIX + 'Finished executing scripts. All auto-generated files should now be updated.')
import os

from .common import AVAILABLE_TRACKS

# A global variable to update whenever looking to work on another dataset
dataset_name = None

# <repo root> and its data/cleavage subdirectory, resolved relative to this module.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
DATA_DIR = os.path.join(BASE_DIR, 'data/cleavage')

def get_dataset_dir():
    return os.path.join(DATA_DIR, '%s_dataset' % dataset_name)

def get_peptide_predictor_dump_file_path(advanced):
    # The trained predictors live at the top of the data dir, not inside a dataset dir.
    file_name = 'advanced_peptide_predictor.pkl' if advanced else 'simple_peptide_predictor.pkl'
    return os.path.join(DATA_DIR, file_name)

def get_window_features_file_path(advanced):
    file_name = 'window_advanced_features.csv' if advanced else 'window_simple_features.csv'
    return os.path.join(get_dataset_dir(), file_name)

def get_raw_data_xml_file_path():
    # Relevant only for when dataset_name = 'uniprot'
    return os.path.join(get_dataset_dir(), 'raw_data.xml')

def get_annotated_seqs_file_path():
    return os.path.join(get_dataset_dir(), 'annotated_seqs.lf')

def get_filtered_seqs_file_path():
    return os.path.join(get_dataset_dir(), 'filtered_seqs.fasta')

def get_track_file_paths():
    return {track: os.path.join(get_dataset_dir(), 'extra_tracks/seqs.%s' % track) for track in AVAILABLE_TRACKS}
'''
A script to test a classifier that was trained on NeuroPred's dataset against UniProt's dataset
Arguments:
- advanced (boolean): Whether to test the advanced or simple classifier.
'''

import sys
import pickle
import logging

import pandas as pd

from cleavepred import util
from cleavepred import project_paths

### Parse arguments ###

advanced = util.parse_bool(sys.argv[1])

### Configuration ###

# We use UniProt's dataset for testing our predictors.
project_paths.dataset_name = 'uniprot'

logger = logging.getLogger('TEST')

### Test the classifier ###

# File handles, populated by open_files() and released by close_files().
predictor_dump_file = None
windows_file = None

def open_files():
    global predictor_dump_file, windows_file
    predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(advanced), 'rb')
    windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')

def close_files():
    util.close_files([predictor_dump_file, windows_file])

def test_classifier():
    # Load the pickled predictor (trained on NeuroPred) and evaluate it on UniProt windows.
    peptide_predictor = pickle.load(predictor_dump_file)
    windows_data_frame = pd.read_csv(windows_file)
    results = peptide_predictor.window_classifier.test_performance(windows_data_frame, drop_only_almost_positives = True)
    score, roc, sensitivity, precision, specificity, cm = results
    logger.info('score = %f, roc = %f, sensitivity = %f, precision = %f, specificity = %f' % (score, roc, sensitivity, precision, specificity))
    logger.info('Confusion matrix:' + '\n' + str(cm))

if __name__ == '__main__':
    try:
        open_files()
        test_classifier()
    finally:
        close_files()
'''
A script to train a classifier from NueroPred's dataset. Will log performance to stdout.
Arguments:
- advanced (boolean): Whether to use the advanced features (extracted with the extra tracks), or just the simple ones.
- predictor_dump_path (file path, optional): The path to dump the trained PeptidePredictor. If not provided, will not dump it at all. If provided
with the keyword "auto", will dump it to the project's relevant file.
'''

import sys
import pickle

import pandas as pd

from sklearn.feature_selection import VarianceThreshold, SelectFdr
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from mlxtend.classifier import EnsembleClassifier

from asap import train_window_classifier, PeptidePredictor, FeatureSelectionPipeline

from cleavepred import util
from cleavepred import project_paths
from cleavepred.common import window_extraction_params

### Parse arguments ###

advanced = util.parse_bool(sys.argv[1])

if len(sys.argv) > 2:
    raw_dump_path = sys.argv[2]
    # The keyword "auto" means the project's standard dump location.
    if raw_dump_path.lower() == 'auto':
        predictor_dump_path = project_paths.get_peptide_predictor_dump_file_path(advanced)
    else:
        predictor_dump_path = raw_dump_path
else:
    predictor_dump_path = None

### Configuration ###

# We use NeuroPred's dataset for training/validation of our predictors.
project_paths.dataset_name = 'neuropred'

# The final model is a majority-vote ensemble of three heterogeneous classifiers.
ensemble_classifiers = [
    LogisticRegressionCV(Cs = 16, n_jobs = -2, class_weight = 'auto'),
    RandomForestClassifier(n_estimators = 250, bootstrap = True, criterion = 'gini', n_jobs = -2, class_weight = 'auto'),
    SVC(kernel = 'rbf', C = 3.798, probability = True, cache_size = 2400, class_weight = 'auto'),
]
classifiers = [EnsembleClassifier(clfs = ensemble_classifiers, voting = 'hard')]

# Cheap variance filter followed by an FDR-controlled univariate test.
feature_selector = FeatureSelectionPipeline([
    VarianceThreshold(0.03),
    SelectFdr(alpha = 0.1),
])

### Train the classifier and dump the predictor ###

windows_file = None
predictor_dump_file = None

def open_files():
    global windows_file, predictor_dump_file
    windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
    # util.open_file returns None when no dump path was requested.
    predictor_dump_file = util.open_file(predictor_dump_path, 'wb')

def close_files():
    util.close_files([windows_file, predictor_dump_file])

def dump_predictor(predictor):
    if predictor_dump_file is not None:
        pickle.dump(predictor, predictor_dump_file)

def train_classifier():
    windows_data_frame = pd.read_csv(windows_file)
    window_classifier, classifier_performance = train_window_classifier(windows_data_frame, classifiers = classifiers, \
            drop_only_almost_positives = True, feature_selector = feature_selector, n_folds = 10)
    peptide_predictor = PeptidePredictor(window_classifier, window_extraction_params = window_extraction_params)
    dump_predictor(peptide_predictor)

if __name__ == '__main__':
    try:
        open_files()
        train_classifier()
    finally:
        close_files()
def split_to_chunks(array, chunk_size):
    '''
    Yield consecutive slices of at most chunk_size elements from array.
    '''
    # range (rather than the Python-2-only xrange) keeps this working on both Python 2 and 3.
    for i in range(0, len(array), chunk_size):
        yield array[i:(i + chunk_size)]

def parse_bool(raw_value):
    '''
    Parse a boolean command-line argument ('true'/'yes'/'1' or 'false'/'no'/'0', case-insensitive).
    Raises an Exception for any other value.
    '''
    lowered = raw_value.lower()
    if lowered in ['true', 'yes', '1']:
        return True
    elif lowered in ['false', 'no', '0']:
        return False
    else:
        raise Exception('Unrecognized boolean value: ' + str(raw_value))

def open_file(path, *args, **argv):
    '''
    Like open(), but returns None when path is None (for optional files).
    '''
    if path is None:
        return None
    else:
        return open(path, *args, **argv)

def read_file(path):
    '''
    Read and return the whole (binary-mode) content of the file at path.
    '''
    f = open(path, 'rb')
    try:
        return f.read()
    finally:
        f.close()

def close_file(file):
    # Tolerates None, so optional files can be closed unconditionally.
    if file is not None:
        file.close()

def close_files(files):
    for file in files:
        close_file(file)
import pickle

from . import project_paths

class CleavagePredictor(object):

    '''
    A predictor trained to predict the cleavage of peptides.
    There should be only two instances of this class:
    1. simple_cleavage_predictor - Uses only the basic features derived from the amino-acid sequence of peptides.
    2. advanced_cleavage_predictor - Used also features derived from external tools (ss, acc, disorder and pssm).
    '''

    def __init__(self, advanced):
        # The underlying PeptidePredictor is loaded lazily, on first use.
        self.advanced = advanced
        self._peptide_predictor = None

    def predict(self, seq, extra_tracks_data = {}, proba = False):
        '''
        Predicts cleavage for a given peptide.
        @param seq (string):
            The amino-acid sequence of the peptide to predict the annotations for, given in a 20 amino-acid alphabet.
        @param extra_tracks_data (dict, empty by default):
            A dictionary for providing extra tracks of the given peptide. If using the simple predictor (i.e. advanced = False), it can
            be left empty. If using the advanced predictor (i.e. advanced = True), must receive all tracks (i.e. ss, acc, disorder
            and pssm). The given dictionary should map from track names to their sequence.
        @param proba (default False):
            Whether to return mask of predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s).
        @return:
            A tuple composed of:
            1. cleavage_mask - If proba = False, a binary string (0's and 1's) stating whether each residue is a cleavage site;
            if proba = True, a list of per-residue cleavage probabilities (floats between 0 to 1). Either way its length
            corresponds to the length of the provided peptide sequence.
            2. cleavage_products - A list of strings, each representing the amino-acid sequence of a predicted cleavage product.
        '''
        predictor = self.get_peptide_predictor()
        cleavage_mask = predictor.predict_annotations(seq, extra_tracks_data = extra_tracks_data, proba = proba)
        return cleavage_mask, _get_cleavage_products(seq, cleavage_mask)

    def get_peptide_predictor(self):
        '''
        @return:
            The PeptidePredictor object associated with this cleavage predictor.
        '''
        if self._peptide_predictor is None:
            self._peptide_predictor = self._load_peptide_predictor()
        return self._peptide_predictor

    def _load_peptide_predictor(self):
        # Unpickle the predictor trained for the requested feature set (simple/advanced).
        dump_path = project_paths.get_peptide_predictor_dump_file_path(self.advanced)
        predictor_dump_file = open(dump_path, 'rb')
        try:
            return pickle.load(predictor_dump_file)
        finally:
            predictor_dump_file.close()

simple_cleavage_predictor = CleavagePredictor(False)
advanced_cleavage_predictor = CleavagePredictor(True)

def _get_cleavage_products(seq, cleavage_mask):
    # Split seq at predicted cleavage sites; empty fragments are discarded.
    products = []
    current_product = ''
    for i in range(len(seq)):
        current_product += seq[i]
        # When we have continuous positive cleavage sites, we consider only the most C-terminus one.
        last_of_run = i >= len(seq) - 1 or not _is_cleavage(cleavage_mask[i + 1])
        if _is_cleavage(cleavage_mask[i]) and last_of_run:
            _add_if_not_empty(products, current_product)
            current_product = ''
    _add_if_not_empty(products, current_product)
    return products

def _is_cleavage(label):
    # Accepts either a character label ('0'/'1') or a numeric probability/label.
    if isinstance(label, str):
        return label == '1'
    elif isinstance(label, (int, float)):
        return int(round(label)) == 1
    else:
        raise Exception('Unknown label type: ' + str(type(label)))

def _add_if_not_empty(array, string):
    if string:
        array.append(string)
'''
Checks the top features predicted for a given dataset.
Arguments:
- dataset_name (string): The name of the dataset to use.
- advanced (boolean): Whether to use all or only some features, corresponding to simple and advanced classifiers respectively.
'''

import sys
import logging

import pandas as pd

from asap import get_top_features

# Fixed: this module lives in the deeppred package but imported cleavepred's util and
# project_paths (a copy-paste leftover); it now uses deeppred's own modules.
from deeppred import util
from deeppred import project_paths

logger = logging.getLogger('FEATURES')

### Parse arguments ###

project_paths.dataset_name = sys.argv[1].lower()
advanced = util.parse_bool(sys.argv[2])

### Get top features ###

# Handle to the window-features CSV; set by open_files() and released by close_files().
windows_file = None

def open_files():
    global windows_file
    windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')

def close_files():
    util.close_files([windows_file])

def get_advanced_label():
    # Human-readable tag for log messages.
    if advanced:
        return 'advanced'
    else:
        return 'simple'

def check_top_features():
    # Load the extracted windows and report the most informative features.
    windows_data_frame = pd.read_csv(windows_file)
    logger.info('Checking top features over %s dataset with %s features...' % (project_paths.dataset_name, get_advanced_label()))
    top_features = get_top_features(windows_data_frame, drop_only_almost_positives = True)
    logger.info('Top features: ' + ', '.join(top_features))

if __name__ == '__main__':
    try:
        open_files()
        check_top_features()
    finally:
        close_files()
from asap import FEATURE_KEY_OPTIONS, WindowExtractionParams
from asap.config import POSITIVE_AMINO_ACIDS

# Extra per-residue annotation tracks that may accompany a dataset.
AVAILABLE_TRACKS = [
    'ss',
    'acc',
    'disorder',
    'pssm',
]

# Here we prefer using 'aa_reduced' over 'aa'. We give up on some other features.
_EXCLUDED_FEATURES = ['aa', 'accum_charge_left', 'accum_charge_right', 'accum_pos_charge_left', 'accum_pos_charge_right']
USED_FEATURES = set(FEATURE_KEY_OPTIONS).difference(_EXCLUDED_FEATURES)

def windows_filter(window):
    '''
    We consider only windows with a positively charged amino-acid (i.e. K/R) in the hot index (only then it can be a
    cleavage candidate).
    '''
    hot_residue = window.get_aa_seq()[window_extraction_params.window_hot_index]
    return hot_residue in POSITIVE_AMINO_ACIDS

window_extraction_params = WindowExtractionParams(window_prefix = 11, window_suffix = 8, neighbourhood_prefix = 5, \
        neighbourhood_suffix = 5, windows_filter = windows_filter, feature_keys = USED_FEATURES)
'''
Extract a .lf file, containing sequences and cleavage annotation masks, out of a UniProt's XML file.
Arguments:
    - output_file_path (file path, optional): The path to write the output .lf file to. If not provided, will update the project's relevant file.
'''

import sys
import re
import xml.etree.ElementTree as et
from StringIO import StringIO  # NOTE(review): unused here (Python-2-only import); kept to avoid touching module imports

from cleavepred import util
from cleavepred import project_paths

# This script always works on the UniProt dataset.
project_paths.dataset_name = 'uniprot'

if len(sys.argv) > 1:
    output_file_path = sys.argv[1]
else:
    output_file_path = project_paths.get_annotated_seqs_file_path()

def get_unique(element, xpath):
    '''
    Return the single sub-element of `element` matching `xpath`, or None when there
    is no match. Raises ValueError when the match is ambiguous (more than one).
    '''
    subelements = element.findall(xpath)

    if len(subelements) == 0:
        return None
    elif len(subelements) == 1:
        return subelements[0]
    else:
        # ValueError (instead of a bare Exception) lets the position-parsing code
        # below swallow it together with the other "couldn't parse" errors.
        raise ValueError('%d subelements: %s' % (len(subelements), xpath))

def parse_uniprot_xml(raw_xml_path):
    '''Parse the raw UniProt XML file into an ElementTree root.'''
    raw = util.read_file(raw_xml_path)
    # Strip the default namespace declaration so plain (un-prefixed) XPath works.
    fixed_raw = re.sub(r'xmlns="[^"]*"', '', raw)
    return et.fromstring(fixed_raw)

def get_proteins_with_cleavage_sites(raw_xml_path):
    '''
    Yield (accession, seq, cleavage_sites, signal_peptide_end) for every protein
    entry that has at least one usable cleavage site. Proteins whose signal
    peptide has no annotated end are skipped entirely.
    '''
    root = parse_uniprot_xml(raw_xml_path)

    for entry in root.findall('./entry'):

        accession = entry.findall('./accession')[0].text
        raw_seq = get_unique(entry, './sequence').text
        seq = re.sub(r'\s', '', raw_seq)

        signal_peptide_end = 0
        cleavage_sites = set()
        skip_protein = False

        for feature in entry.findall('./feature'):

            # Renamed from `type` (shadowed the builtin).
            feature_type = feature.get('type').lower()

            if feature_type in ['peptide', 'chain', 'propeptide', 'signal peptide']:

                try:
                    begin = int(get_unique(feature, './location/begin').get('position'))
                except (AttributeError, TypeError, ValueError):
                    # Missing (None) element, missing attribute, ambiguous match or
                    # a non-numeric position all mean "no usable begin position".
                    begin = None

                try:
                    end = int(get_unique(feature, './location/end').get('position'))
                except (AttributeError, TypeError, ValueError):
                    end = None

                if feature_type == 'signal peptide':
                    if end is None:
                        print ('%s: no end to signal peptide. We will ignore this protein.' % accession)
                        skip_protein = True
                        break
                    else:
                        signal_peptide_end = max(signal_peptide_end, end)
                else:

                    if begin is not None:
                        # Two candidate offsets right before the mature peptide.
                        cleavage_sites.add(begin - 1)
                        cleavage_sites.add(begin - 2)

                    if end is not None:
                        if feature_type == 'propeptide':
                            cleavage_sites.add(end - 1)
                        else:
                            cleavage_sites.add(end)

        if skip_protein:
            continue

        # Keep only K/R sites comfortably inside the mature sequence (at least 3
        # residues away from both the signal peptide and the C-terminus).
        cleavage_sites = set([i for i in cleavage_sites if i >= signal_peptide_end + 3 and i < len(seq) - 3 and seq[i] in 'KR'])
        cleavage_sites_to_remove = set([i - 1 for i in cleavage_sites]) # If 11, we take only the second
        cleavage_sites = cleavage_sites.difference(cleavage_sites_to_remove)

        if cleavage_sites: # we don't want samples with no cleavages at all - it's probably a mistake
            yield accession, seq, cleavage_sites, signal_peptide_end

def cleavage_sites_to_mask(seq_length, cleavage_sites):
    '''Build a '0'/'1' mask of length `seq_length` with '1' at every cleavage-site index.'''
    mask = ['0'] * seq_length

    for cleavage_site in cleavage_sites:
        mask[cleavage_site] = '1'

    return ''.join(mask)

def remove_xs(seq, mask):
    '''Drop unknown residues ('X'/'x') from the sequence, together with their mask labels.'''
    revised_seq = ''
    revised_mask = ''

    for aa, label in zip(seq, mask):
        if aa.lower() != 'x':
            revised_seq += aa
            revised_mask += label

    return revised_seq, revised_mask

def space_seq(seq, chunk_length = 10):
    '''Format a sequence as space-separated chunks of `chunk_length` characters.'''
    return ' '.join(util.split_to_chunks(seq, chunk_length))

def write_fasta_like_record(file, accession, seq, mask):
    '''Write one FASTA-like .lf record: header, spaced sequence, spaced mask, blank line.'''
    file.write('>' + accession + '\n')
    file.write(space_seq(seq) + '\n')
    file.write(space_seq(mask) + '\n')
    file.write('\n')

if __name__ == '__main__':

    # NOTE(review): 'wb' with str content is the Python-2 way of writing text.
    output_file = open(output_file_path, 'wb')

    try:
        for accession, seq, cleavage_sites, signal_peptide_end in get_proteins_with_cleavage_sites(project_paths.get_raw_data_xml_file_path()):
            # Trim the signal peptide off both the sequence and its mask.
            mask_to_write = cleavage_sites_to_mask(len(seq), cleavage_sites)[signal_peptide_end:]
            seq_to_write = seq[signal_peptide_end:]
            seq_to_write, mask_to_write = remove_xs(seq_to_write, mask_to_write)
            write_fasta_like_record(output_file, accession, seq_to_write, mask_to_write)
    finally:
        output_file.close()

    # Function-call form works on both Python 2 and 3 (was a py2-only print statement).
    print('Done.')
'''
A script to extract the window features for a given dataset.
Arguments:
    - dataset_name (string): The name of the dataset to extract the window features for.
    - advanced (boolean): Whether to use the extra tracks when extracting the windows, or extracting only the simple sequence-based features.
    - output_file_path (file path, optional): The path to write the output CSV to. If not provided, will update the project's relevant file.
'''

import sys

import asap

# NOTE(review): this file lives in the deeppred package but imports cleavepred —
# confirm this cross-package dependency is intentional.
from cleavepred import util
from cleavepred import project_paths
from cleavepred.common import window_extraction_params

### Parse arguments ###

project_paths.dataset_name = sys.argv[1]
advanced = util.parse_bool(sys.argv[2])

if len(sys.argv) > 3:
    output_file_path = sys.argv[3]
else:
    output_file_path = project_paths.get_window_features_file_path(advanced)

### Extract the windows ###

# Module-level file handles, populated by open_files() and released by close_files().
annotated_seqs_file = None
seqs_filtration_file = None
csv_output_file = None
extra_tracks_files = {}

def open_files():
    '''Open the input/output files needed for extraction (plus extra-track files in advanced mode).'''
    global annotated_seqs_file, seqs_filtration_file, csv_output_file, extra_tracks_files

    annotated_seqs_file = open(project_paths.get_annotated_seqs_file_path(), 'rb')
    seqs_filtration_file = open(project_paths.get_filtered_seqs_file_path(), 'rb')
    csv_output_file = open(output_file_path, 'wb')

    if advanced:
        for track_name, track_file_path in project_paths.get_track_file_paths().items():
            extra_tracks_files[track_name] = open(track_file_path, 'rb')

def close_files():
    '''Close every file opened by open_files() (None-tolerant via util.close_files).'''
    util.close_files([annotated_seqs_file, seqs_filtration_file, csv_output_file])
    util.close_files(extra_tracks_files.values())

def extract_windows():
    '''Delegate the actual window extraction and CSV writing to the ASAP library.'''
    asap.extract_windows_from_file(annotated_seqs_file, extract_annotations = True, seqs_filtration_file = seqs_filtration_file, \
            extra_tracks_files = extra_tracks_files, csv_output_file = csv_output_file, window_extraction_params = window_extraction_params)

if __name__ == '__main__':
    try:
        open_files()
        extract_windows()
    finally:
        close_files()
'''
Script to run disopred3 on a multifasta file, then parse and collate the output.
Steps:
1. Extract each fasta from a copy of original multifasta file. (Stored in a separate directory)
2. (Opt?) Remove original multifasta file.
3. Run disopred on each fasta.
4. i. gather the data/output for all the fastas. (output file name is the fasta's name).
4. ii. Clean the format (for each file), and save to file: "output_feat.diso", (in a format like lf/ss/acc)
4.iii. Save this output to the external features folder.
5. Import from the standard pipeline (config) / not here.
'''

import os
import subprocess
from subprocess import call
import sys
import csv
import glob
import pandas as pd
from Bio import SeqIO


# Location of the directory containing "run_disopred.pl".
DISOPRED_LOCATION = r'/cs/stud/danofer/Desktop/danofer/Software/DISOPRED'
DISO_PROG = 'run_disopred.pl'

# Input multifasta location (environment-specific paths).
FASTA_LOCATION = '/a/fr-05/vol/protein/danofer/imac/Desktop/DFTP/'
FASTA_TARGET = 'D3_TEST_50_FILT_mod.fasta'

SPLIT_FASTAS_DIR = 'splitfasta1'
split_fastas_dir = os.path.join(FASTA_LOCATION, SPLIT_FASTAS_DIR)

file_in = os.path.join(FASTA_LOCATION, FASTA_TARGET)

# Alternative output directory, used when splitting a filtered (resumed) fasta set.
ALT_DIR_OUTPUT = os.path.join(FASTA_LOCATION, 'altFiltered')


def parse_DISOPRED():
    '''
    Parse all .pbdat files (DISOPRED output) in the split-fastas directory and
    collate them into one track file "output_feat.DISO", formatted like the
    lf/ss/acc tracks: header line, sequence line, disorder-state line.
    '''
    os.chdir(split_fastas_dir)

    pbdat_files = glob.glob('*.pbdat')
    print('amount of .pbdat files:', len(pbdat_files))

    output_path = os.path.join(FASTA_LOCATION, 'output_feat.DISO')
    print('Joined results will be saved to ', output_path)

    with open(output_path, 'w') as output_file:
        for pbdat_path in pbdat_files:
            accession = '>' + os.path.splitext(os.path.basename(pbdat_path))[0]
            # BUGFIX: close each .pbdat file after reading (handles leaked before).
            with open(pbdat_path) as pbdat_file:
                lines = pbdat_file.readlines()
            seq = []
            diso_state = []
            for line in lines:
                parts = line.strip(' \n \t').split(' ')
                if parts[0] != '#':  # skip DISOPRED comment lines
                    seq += parts[1]
                    diso_state += parts[2]
            output_file.write(accession + '\n')
            output_file.write(''.join(seq) + '\n')
            output_file.write(''.join(diso_state) + '\n')
    print("Saved to output_feat.DISO")


def split_fasta(filter = False):
    '''
    Split a multifasta file into one file per fasta entry, saved in a new
    directory. When `filter` is True, entries that already have DISOPRED output
    are skipped and the split files go to ALT_DIR_OUTPUT instead.
    (Adapted from https://py4bio.wordpress.com/2009/07/22/split_fasta_file/)
    '''
    os.chdir(os.path.dirname(FASTA_LOCATION))
    print("Current working Directory:", os.getcwd())
    filter_fastas = []
    multifasta_path = os.path.join(FASTA_LOCATION, FASTA_TARGET)
    split_output_dir = SPLIT_FASTAS_DIR

    if filter == True:
        filter_fastas = filter_fasta_queries(fastas_dir = split_fastas_dir)
        split_output_dir = ALT_DIR_OUTPUT

    if not os.path.exists(split_output_dir):
        os.makedirs(split_output_dir)

    os.chdir(split_output_dir)

    written_count = 0
    read_counts = 0
    # BUGFIX: close the multifasta handle (it was leaked via an inline open()).
    with open(multifasta_path) as multifasta_handle:
        for record in SeqIO.parse(multifasta_handle, "fasta"):
            read_counts += 1
            if not (record.id in filter_fastas):
                f_out = (record.id + '.fasta')
                print('save to:', f_out)
                with open(f_out, "w") as handle:
                    SeqIO.write([record], handle, "fasta")
                written_count += 1

    print(read_counts, " = Fastas in the original multifasta-file")
    print(written_count, " = # Splitted Fasta files made")


def call_DISOPRED(split_fastas_list):
    '''Run run_disopred.pl on every fasta path in `split_fastas_list`.'''
    os.chdir(os.path.dirname(DISOPRED_LOCATION))
    print(os.getcwd())
    print("In DisoPred folder")
    for i, fasta in enumerate(split_fastas_list):
        print(i)
        print(fasta)
        # Argument list + shell=False (the default): no shell-injection risk.
        subprocess.call([DISOPRED_LOCATION + '/' + DISO_PROG, fasta])
        print()

def filter_fasta_queries(fastas_dir = split_fastas_dir):
    '''
    If Disopred job was interrupted in midway -
    this lets us continue (in a new dir) for only those sequences
    that do not have Disopred predictions (= *.pbdat files).
    Returns the IDs that were already processed.
    '''
    os.chdir(fastas_dir)
    pbdat_files = glob.glob('*.pbdat')
    print('# .pbdat files = fastas that were processed succesfully, previously:', len(pbdat_files))
    ids = [os.path.splitext(os.path.basename(f))[0] for f in pbdat_files]
    print('len(ids)', len(ids))
    return ids

def find_missing():
    '''
    Disopred seems to "miss" some fastas. This finds them, assuming the
    disopred output lives in the same dir as the split fasta candidates.
    Returns the IDs that have no .pbdat prediction yet.
    '''
    os.chdir(split_fastas_dir)

    fastas = [f for f in os.listdir('.') if f.endswith('.fasta')]
    ids = [os.path.splitext(os.path.basename(f))[0] for f in fastas]
    print('# Fastas present:', str(len(ids)))

    # BUGFIX: the original accidentally rebound `fastas` here ("preds = fastas = ...").
    preds = [f for f in os.listdir('.') if f.endswith('.pbdat')]
    # A set makes the membership test below O(1) instead of O(n).
    dis_ids = set(os.path.splitext(os.path.basename(f))[0] for f in preds)
    print('# Predictions present:', str(len(dis_ids)))

    missing = [a for a in ids if a not in dis_ids]
    print('Missing IDs:')
    print(missing)
    return missing


if __name__ == '__main__':

    SPLIT_F = False
    CALL_DISO = False
    USE_FILTERED = False

    RESUME_DISO_PARTIAL = True

    if SPLIT_F == True:
        split_fasta()

    split_fastas_list = glob.glob(os.path.join(split_fastas_dir, '*.fasta'))
    print('\n split_fastas_list in dir: ', len(split_fastas_list))

    if USE_FILTERED == True:
        filt_split_fastas_list = glob.glob(split_fastas_dir + '/*.fasta')
        print('\n filt_split_fastas_list ', len(filt_split_fastas_list))
        print(filt_split_fastas_list[0])
        print(filt_split_fastas_list[1])
        call_DISOPRED(filt_split_fastas_list)


    if CALL_DISO == True:
        call_DISOPRED(split_fastas_list)
        print("\n DISOPRED DONE! \n")

    parse_DISOPRED()

    missing_ID = find_missing()
    if RESUME_DISO_PARTIAL:
        print("\n Calling disopred on missing IDs")
        print("Missing: \n", missing_ID)
        # BUGFIX: the full fasta list used to be re-submitted here, despite the log
        # message above; now only the fastas whose predictions are missing are run.
        missing_fastas = [os.path.join(split_fastas_dir, missing_id + '.fasta') for missing_id in missing_ID]
        call_DISOPRED(missing_fastas)
        print("\n DISOPRED DONE! \n")
'''
A master script to produce all the required auto-generated files:
1. Uniprot's annotated seqs .lf file
2. CSVs of the windows with features
3. Pickle dump files of trained predictors.
Just execute this script as it is, with no arguments. Make sure to be in the py/ directory when running it.
'''

import sys
import logging

# For logger initialization
import asap

LOGGING_PREFIX = '********** '

logger = logging.getLogger('EXEC')

# Uniprot's .lf file
logger.info(LOGGING_PREFIX + 'Running extract_uniprot_annotated_seqs_from_xml.py')
# Each sub-script reads its arguments from sys.argv, so fake argv before execfile.
sys.argv = ['']
# NOTE(review): execfile is Python-2-only; also, these paths point at the
# cleavepred package although this script lives in deeppred — confirm intentional.
execfile('cleavepred/extract_uniprot_annotated_seqs_from_xml.py')

# Create CSVs (simple and advanced features for both datasets)
for dataset in ['neuropred', 'uniprot']:
    for advanced in ['false', 'true']:
        logger.info(LOGGING_PREFIX + 'Running extract_windows.py with dataset="%s" and advanced="%s"' % (dataset, advanced))
        sys.argv = ['', dataset, advanced]
        execfile('cleavepred/extract_windows.py')

# # Create dump files
# for advanced in ['false', 'true']:
#     logger.info(LOGGING_PREFIX + 'Running train_classifier.py with advanced="%s"' % advanced)
#     sys.argv = ['', advanced, 'auto']
#     execfile('cleavepred/train_classifier.py')

logger.info(LOGGING_PREFIX + 'Finished executing scripts. All auto-generated files should now be updated.')
import os

from .common import AVAILABLE_TRACKS

# A global variable to update whenever looking to work on another dataset.
dataset_name = None

# Repository root (three levels above this file), and the data directory under it.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# DATA_DIR = os.path.join(BASE_DIR, 'data/cleavage')
DATA_DIR = os.path.join(BASE_DIR, 'data/deep/cleavage')

def get_dataset_dir():
    '''Directory holding the files of the currently-selected dataset.'''
    return os.path.join(DATA_DIR, '%s_dataset' % dataset_name)

def get_peptide_predictor_dump_file_path(advanced):
    '''Path of the pickled PeptidePredictor (advanced or simple flavour).'''
    file_name = 'advanced_peptide_predictor.pkl' if advanced else 'simple_peptide_predictor.pkl'
    return os.path.join(DATA_DIR, file_name)

def get_window_features_file_path(advanced):
    '''Path of the per-window feature CSV for the current dataset.'''
    file_name = 'window_advanced_features.csv' if advanced else 'window_simple_features.csv'
    return os.path.join(get_dataset_dir(), file_name)

def get_raw_data_xml_file_path():
    '''Path of the raw UniProt XML dump (relevant only when dataset_name is 'uniprot').'''
    return os.path.join(get_dataset_dir(), 'raw_data.xml')

def get_annotated_seqs_file_path():
    '''Path of the annotated sequences (.lf) file of the current dataset.'''
    return os.path.join(get_dataset_dir(), 'annotated_seqs.lf')

def get_filtered_seqs_file_path():
    '''Path of the filtered FASTA file of the current dataset.'''
    return os.path.join(get_dataset_dir(), 'filtered_seqs.fasta')

def get_track_file_paths():
    '''Map every available extra track to its file under the dataset's extra_tracks/ directory.'''
    track_paths = {}
    for track in AVAILABLE_TRACKS:
        track_paths[track] = os.path.join(get_dataset_dir(), 'extra_tracks/seqs.%s' % track)
    return track_paths
'''
A script to test a classifier that was trained on NeuroPred's dataset against UniProt's dataset
Arguments:
    - advanced (boolean): Whether to test the advanced or simple classifier.
'''

import sys
import pickle
import logging

import pandas as pd

from cleavepred import util
from cleavepred import project_paths

### Parse arguments ###

advanced = util.parse_bool(sys.argv[1])

### Configuration ###

# We use UniProt's dataset for testing our predictors.
project_paths.dataset_name = 'uniprot'

logger = logging.getLogger('TEST')

### Test the classifier ###

# Module-level file handles, populated by open_files() and released by close_files().
predictor_dump_file = None
windows_file = None

def open_files():
    '''Open the pickled predictor and the window-features CSV for reading.'''
    global predictor_dump_file, windows_file
    predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(advanced), 'rb')
    windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')

def close_files():
    '''Close both files (None-tolerant via util.close_files).'''
    util.close_files([predictor_dump_file, windows_file])

def test_classifier():
    '''Load the trained predictor, evaluate it on the test windows and log the performance metrics.'''
    peptide_predictor = pickle.load(predictor_dump_file)
    windows_data_frame = pd.read_csv(windows_file)
    score, roc, sensitivity, precision, specificity, cm = peptide_predictor.window_classifier.test_performance(windows_data_frame, \
            drop_only_almost_positives = True)
    logger.info('score = %f, roc = %f, sensitivity = %f, precision = %f, specificity = %f' % (score, roc, sensitivity, precision, specificity))
    logger.info('Confusion matrix:' + '\n' + str(cm))

if __name__ == '__main__':
    try:
        open_files()
        test_classifier()
    finally:
        close_files()
'''
A script to train a classifier from NeuroPred's dataset. Will log performance to stdout.
Arguments:
    - advanced (boolean): Whether to use the advanced features (extracted with the extra tracks), or just the simple ones.
    - predictor_dump_path (file path, optional): The path to dump the trained PeptidePredictor. If not provided, will not dump it at all. If provided
      with the keyword "auto", will dump it to the project's relevant file.
'''

import sys
import pickle

import pandas as pd

from sklearn.feature_selection import VarianceThreshold, SelectFdr
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from mlxtend.classifier import EnsembleClassifier

from asap import train_window_classifier, PeptidePredictor, FeatureSelectionPipeline

from cleavepred import util
from cleavepred import project_paths
from cleavepred.common import window_extraction_params

### Parse arguments ###

advanced = util.parse_bool(sys.argv[1])

if len(sys.argv) > 2:
    if sys.argv[2].lower() == 'auto':
        predictor_dump_path = project_paths.get_peptide_predictor_dump_file_path(advanced)
    else:
        predictor_dump_path = sys.argv[2]
else:
    predictor_dump_path = None

### Configuration ###

# We use NeuroPred's dataset for training/validation of our predictors.
project_paths.dataset_name = 'neuropred'

# The three base models are combined by hard (majority) voting below.
# NOTE(review): class_weight = 'auto' is the deprecated spelling of 'balanced'
# in modern scikit-learn — confirm against the pinned sklearn version.
ensemble_classifiers = [
    LogisticRegressionCV(Cs = 16, n_jobs = -2, class_weight = 'auto'),
    RandomForestClassifier(n_estimators = 250, bootstrap = True, criterion = 'gini', n_jobs = -2, class_weight = 'auto'),
    SVC(kernel = 'rbf', C = 3.798, probability = True, cache_size = 2400, class_weight = 'auto'),
]
classifiers = [EnsembleClassifier(clfs = ensemble_classifiers, voting = 'hard')]

# Drop near-constant features, then keep those passing an FDR-controlled univariate test.
feature_selector = FeatureSelectionPipeline([
    VarianceThreshold(0.03),
    SelectFdr(alpha = 0.1),
])

### Train the classifier and dump the predictor ###

# Module-level file handles, populated by open_files() and released by close_files().
windows_file = None
predictor_dump_file = None

def open_files():
    '''Open the window-features CSV for reading and (optionally) the predictor dump file for writing.'''
    global windows_file, predictor_dump_file
    windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
    predictor_dump_file = util.open_file(predictor_dump_path, 'wb')

def close_files():
    '''Close both files (None-tolerant — the dump file may not have been requested).'''
    util.close_files([windows_file, predictor_dump_file])

def dump_predictor(predictor):
    '''Pickle the trained predictor, unless no dump path was requested.'''
    if predictor_dump_file is not None:
        pickle.dump(predictor, predictor_dump_file)

def train_classifier():
    '''Train the window classifier with 10-fold CV, wrap it in a PeptidePredictor and dump it.'''
    windows_data_frame = pd.read_csv(windows_file)
    # NOTE(review): classifier_performance is computed but not used here —
    # presumably the performance is logged inside ASAP; confirm.
    window_classifier, classifier_performance = train_window_classifier(windows_data_frame, classifiers = classifiers, \
            drop_only_almost_positives = True, feature_selector = feature_selector, n_folds = 10)
    peptide_predictor = PeptidePredictor(window_classifier, window_extraction_params = window_extraction_params)
    dump_predictor(peptide_predictor)

if __name__ == '__main__':
    try:
        open_files()
        train_classifier()
    finally:
        close_files()
def split_to_chunks(array, chunk_size):
    '''Yield consecutive slices of `array` of length `chunk_size` (the last one may be shorter).'''
    # range() behaves identically for iteration on Python 2 and 3
    # (the original used the Python-2-only xrange).
    for i in range(0, len(array), chunk_size):
        yield array[i:(i + chunk_size)]

def parse_bool(raw_value):
    '''Parse a textual boolean flag: "true"/"yes"/"1" or "false"/"no"/"0", case-insensitively.'''
    lowered = raw_value.lower()
    if lowered in ['true', 'yes', '1']:
        return True
    elif lowered in ['false', 'no', '0']:
        return False
    else:
        # ValueError describes a bad input value better than a bare Exception,
        # and any caller catching Exception still catches it.
        raise ValueError('Unrecognized boolean value: ' + str(raw_value))

def open_file(path, *args, **argv):
    '''Like open(), but returns None when `path` is None (see close_file for the counterpart).'''
    if path is None:
        return None
    else:
        return open(path, *args, **argv)

def read_file(path):
    '''Read and return the entire (binary-mode) content of the file at `path`.'''
    f = open(path, 'rb')
    try:
        return f.read()
    finally:
        f.close()

def close_file(file):
    '''Close `file`, tolerating None (as returned by open_file for a None path).'''
    if file is not None:
        file.close()

def close_files(files):
    '''Close every file in `files`, tolerating None entries.'''
    for file in files:
        close_file(file)
3 | """ 4 | 5 | import sys 6 | import os 7 | 8 | ### Local Environment-Dependent Settings ### 9 | 10 | SITE_URL = '' 11 | 12 | ### Preparations ### 13 | 14 | BASE_DIR = os.path.dirname(__file__) 15 | PROJECT_DIR = os.path.dirname(os.path.dirname(BASE_DIR)) 16 | PROJECT_PY_DIR = os.path.join(PROJECT_DIR, 'py') 17 | 18 | # In order to load 'cleavepred' module later on 19 | sys.path += [PROJECT_PY_DIR] 20 | 21 | ### Development vs. Production status ### 22 | 23 | DEBUG = True 24 | 25 | TEMPLATE_DEBUG = True 26 | 27 | ALLOWED_HOSTS = [] 28 | 29 | ### Security ### 30 | 31 | SECRET_KEY = 'ij*hpack1#brf--5b_1nd7$cz*8h*y=b!y#_bd48v5kcdg0*vd' 32 | 33 | ### Localization ### 34 | 35 | TIME_ZONE = 'UTC' 36 | LANGUAGE_CODE = 'en-us' 37 | 38 | ### URLs ### 39 | 40 | ROOT_URLCONF = 'urls' 41 | 42 | ### Context Processors, Middlewares & Apps ### 43 | 44 | TEMPLATE_CONTEXT_PROCESSORS = ( 45 | 'django.contrib.auth.context_processors.auth', 46 | 'django.core.context_processors.debug', 47 | 'django.core.context_processors.media', 48 | 'django.core.context_processors.static', 49 | 'django.contrib.messages.context_processors.messages', 50 | 'context_processors.settings_access' 51 | ) 52 | 53 | MIDDLEWARE_CLASSES = ( 54 | 'django.contrib.sessions.middleware.SessionMiddleware', 55 | 'django.middleware.csrf.CsrfViewMiddleware', 56 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 57 | 'django.middleware.common.CommonMiddleware', 58 | 'django.contrib.messages.middleware.MessageMiddleware', 59 | ) 60 | 61 | INSTALLED_APPS = ( 62 | 'django.contrib.staticfiles', 63 | 'django.contrib.sessions', 64 | 'django.contrib.contenttypes', 65 | 'django.contrib.auth', 66 | 'django.contrib.admin', 67 | 'django.contrib.humanize', 68 | ) 69 | 70 | ### Templates ### 71 | 72 | TEMPLATE_DIRS = ( 73 | os.path.join(BASE_DIR, 'templates'), 74 | ) 75 | 76 | ### Static Files ### 77 | 78 | STATIC_URL = '/static/' 79 | -------------------------------------------------------------------------------- 
/web/cleavage/templates/base.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | CleavePred - A Machine-Learning model for predicting cleavage products of neuropeptide precursors 4 | 5 | 6 |
7 |

CleavePred

8 |
A Machine-Learning model for predicting cleavage products of neuropeptide precursors
9 |
10 |
11 |
12 | {% block content %}{% endblock %} 13 |
14 |
15 |
16 |

CleavePred is powered by ASAP, a generic API for easily learning local protein annotations with minimal fine-tuning using powerful feature engineering combined with standard Machine-Learning models. Inside ASAP's GitHub project, you will also find the source code of CleavePred, which comes with a handy API as well. To learn more about either of ASAP's or CleavePred's APIs (which offer more options than this website), read the Wiki page in GitHub (in particular this tutorial). To learn more about the underlying algorithm, please read our paper "ASAP: A Machine-Learning Framework for Local Protein Properties". If you found our work to be useful for your research, please cite it.

17 |

For any issue/request, feel free to contact us: Nadav Brandes (nadav.brandes@mail.huji.ac.il) and Dan Ofer (ddofer@gmail.com).

18 |
19 | 20 | -------------------------------------------------------------------------------- /web/cleavage/templates/cleavage-prediction.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |
5 |

Following are the prediction results. Below each amino-acid is its predicted probability of being a cleavage site.

6 | {% for record_id, labeled_aa_chunks, cleavage_products in seqs_data %} 7 |
> {{ record_id }}
8 | 9 | {% for labeled_aa_chunk in labeled_aa_chunks %} 10 | 11 | {% for labeled_aa in labeled_aa_chunk %} 12 | 13 | {% endfor %} 14 | 15 | 16 | {% for labeled_aa in labeled_aa_chunk %} 17 | 18 | {% endfor %} 19 | 20 | {% endfor %} 21 |
{{ labeled_aa.aa }}
{{ labeled_aa.cleavage_probability|floatformat:2 }}
22 |
Predicted cleavage products:
23 | 28 |
Go back to home page
29 | {% endfor %} 30 |
31 | {% endblock %} -------------------------------------------------------------------------------- /web/cleavage/templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block content %} 4 |
5 |
Please fill below the amino-acid sequences to predict in FASTA format. You may either upload a FASTA file, or paste it as a free text.
6 |
7 | {% csrf_token %} 8 |
9 |
10 |
11 |
12 | {% endblock %} 13 | -------------------------------------------------------------------------------- /web/cleavage/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import patterns, url 2 | from django.views.generic import TemplateView 3 | 4 | import views 5 | 6 | urlpatterns = patterns('', 7 | 8 | url(r'^$', 9 | TemplateView.as_view(template_name = 'home.html'), 10 | ), 11 | 12 | url(r'^cleavage-prediction/$', 13 | views.cleavage_prediction, 14 | ), 15 | ) 16 | -------------------------------------------------------------------------------- /web/cleavage/views.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | import logging 3 | 4 | from Bio import SeqIO 5 | 6 | from django.template import RequestContext 7 | from django.shortcuts import render_to_response 8 | 9 | from cleavepred import simple_cleavage_predictor 10 | from cleavepred.util import split_to_chunks 11 | 12 | LOGGER = logging.getLogger('WEB') 13 | 14 | def cleavage_prediction(request): 15 | seqs_data = _get_seqs_data(_get_raw_seqs(request)) 16 | return render_to_response('cleavage-prediction.html', {'seqs_data': seqs_data}, context_instance = RequestContext(request)) 17 | 18 | def _get_raw_seqs(request): 19 | if 'seqs-file' in request.FILES: 20 | return request.FILES['seqs-file'].read() 21 | else: 22 | return request.POST.get('seqs-text') 23 | 24 | def _get_seqs_data(raw_seqs): 25 | 26 | raw_seqs = _fix_string_newlines(unicode(raw_seqs)) # This patch is required for some reason, because Django somehow corrupts uploaded files 27 | LOGGER.info('Received a %d bytes long FASTA' % len(raw_seqs)) 28 | seqs_fasta = StringIO(_fix_fasta_if_needed(raw_seqs)) 29 | records = list(SeqIO.parse(seqs_fasta, 'fasta')) 30 | LOGGER.info('About to process %d records' % len(records)) 31 | 32 | for record in records: 33 | 34 | id = record.id 35 | seq = str(record.seq) 36 | 
from StringIO import StringIO
import logging

from Bio import SeqIO

from django.template import RequestContext
from django.shortcuts import render_to_response

from cleavepred import simple_cleavage_predictor
from cleavepred.util import split_to_chunks

LOGGER = logging.getLogger('WEB')

def cleavage_prediction(request):
    '''Django view: run cleavage prediction on the submitted FASTA and render the results page.'''
    seqs_data = _get_seqs_data(_get_raw_seqs(request))
    return render_to_response('cleavage-prediction.html', {'seqs_data': seqs_data}, context_instance = RequestContext(request))

def _get_raw_seqs(request):
    '''Return the raw FASTA input, either from an uploaded file or from the pasted text field.'''
    if 'seqs-file' in request.FILES:
        return request.FILES['seqs-file'].read()
    else:
        return request.POST.get('seqs-text')

def _get_seqs_data(raw_seqs):
    '''
    Yield (record_id, labeled_aa_chunks, cleavage_products) per FASTA record,
    where labeled_aa_chunks are display rows of _LabeledAminoAcid objects.
    '''
    # This patch is required for some reason, because Django somehow corrupts uploaded files.
    # unicode() is Python-2-only, like the rest of this module.
    raw_seqs = _fix_string_newlines(unicode(raw_seqs))
    LOGGER.info('Received a %d bytes long FASTA' % len(raw_seqs))
    seqs_fasta = StringIO(_fix_fasta_if_needed(raw_seqs))
    records = list(SeqIO.parse(seqs_fasta, 'fasta'))
    LOGGER.info('About to process %d records' % len(records))

    for record in records:

        # Renamed from `id` (shadowed the builtin).
        record_id = record.id
        seq = str(record.seq)
        LOGGER.info('Processing record %s: %s' % (record_id, seq))

        cleavage_mask, cleavage_products = simple_cleavage_predictor.predict(seq, proba = True)
        labeled_aa_chunks = split_to_chunks([_LabeledAminoAcid(aa, label) for aa, label in zip(seq, cleavage_mask)], \
                _RESIDUES_TO_DISPLAY_PER_ROW)
        yield record_id, labeled_aa_chunks, cleavage_products

def _fix_fasta_if_needed(raw_fasta):
    '''Prepend a default FASTA header when the input is a bare sequence with no '>' header.'''
    if _has_fasta_headers(raw_fasta):
        return raw_fasta
    else:
        return _DEFAULT_FASTA_HEADER + '\n' + raw_fasta

def _has_fasta_headers(raw_fasta):
    '''Whether any line of the input starts with the FASTA header marker ('>').'''
    return any(line.startswith('>') for line in raw_fasta.splitlines())

def _fix_string_newlines(string):
    '''Normalize every lone carriage return (a \\r not followed by \\n) into a \\r\\n pair.'''
    # Build a list and join once: the original += concatenation was quadratic.
    fixed_chars = []

    for i, char in enumerate(string):
        if char == '\r' and i < len(string) - 1 and string[i + 1] != '\n':
            fixed_chars.append('\r\n')
        else:
            fixed_chars.append(char)

    return ''.join(fixed_chars)

class _LabeledAminoAcid(object):
    '''A single residue together with its predicted cleavage probability, plus display helpers.'''

    def __init__(self, aa, cleavage_probability):
        self.aa = aa
        self.cleavage_probability = cleavage_probability

    def is_cleavage(self):
        '''Whether the residue is predicted to be a cleavage site (probability >= 0.5).'''
        return self.cleavage_probability >= 0.5

    def background_color(self):
        '''Cell background color: red for predicted cleavage sites, white otherwise.'''
        if self.is_cleavage():
            return '#ff0000'
        else:
            return '#ffffff'

    def probability_color(self):
        '''Text color for the probability value, on a gray -> yellow -> orange -> red scale.'''
        if self.cleavage_probability <= 0.0:
            return '#aaaaaa'
        if self.cleavage_probability < 0.1:
            return '#666666'
        if self.cleavage_probability < 0.3:
            return '#aaaa00'
        if self.cleavage_probability < 0.5:
            return '#ffaa00'
        else:
            return '#ff0000'

    def __repr__(self):
        return '<%s>' % self.aa

_DEFAULT_FASTA_HEADER = '>input_seq'
# Fixed the misspelled name (_RESIDUES_TO_DISPAY_PER_ROW); module-private, used only above.
_RESIDUES_TO_DISPLAY_PER_ROW = 20
| """ 2 | WSGI config for cleavepred project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.6/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings") 12 | 13 | from django.core.wsgi import get_wsgi_application 14 | application = get_wsgi_application() 15 | --------------------------------------------------------------------------------