├── .gitattributes
├── .gitignore
├── README.md
├── data
    ├── cleavage
    │   ├── neuropred_dataset
    │   │   ├── annotated_seqs.lf
    │   │   ├── extra_tracks
    │   │   │   ├── seqs.acc
    │   │   │   ├── seqs.disorder
    │   │   │   ├── seqs.pssm
    │   │   │   └── seqs.ss
    │   │   └── filtered_seqs.fasta
    │   └── uniprot_dataset
    │   │   ├── extra_tracks
    │   │       ├── seqs.acc
    │   │       ├── seqs.disorder
    │   │       ├── seqs.pssm
    │   │       └── seqs.ss
    │   │   ├── filtered_seqs.fasta
    │   │   └── raw_data.xml
    └── phosphoserine
    │   ├── annotated_seqs.lf
    │   └── annotated_seqs_demo.lf
├── py
    ├── asap
    │   ├── __init__.py
    │   ├── classification.py
    │   ├── config.py
    │   ├── data.py
    │   ├── features.py
    │   ├── features_deps
    │   │   ├── AAScales.py
    │   │   ├── AAlphabets.py
    │   │   ├── Disorder.py
    │   │   └── __init__.py
    │   ├── parse.py
    │   ├── sklearn_extensions.py
    │   ├── util.py
    │   └── window_extraction.py
    ├── cleavepred
    │   ├── __init__.py
    │   ├── api.py
    │   ├── check_top_features.py
    │   ├── common.py
    │   ├── extract_uniprot_annotated_seqs_from_xml.py
    │   ├── extract_windows.py
    │   ├── get_disopred.py
    │   ├── produce_auto_files.py
    │   ├── project_paths.py
    │   ├── test_classifier.py
    │   ├── train_classifier.py
    │   └── util.py
    └── deeppred
    │   ├── __init__.py
    │   ├── api.py
    │   ├── check_top_features.py
    │   ├── common.py
    │   ├── extract_uniprot_annotated_seqs_from_xml.py
    │   ├── extract_windows.py
    │   ├── get_disopred.py
    │   ├── produce_auto_files.py
    │   ├── project_paths.py
    │   ├── test_classifier.py
    │   ├── train_classifier.py
    │   └── util.py
└── web
    └── cleavage
        ├── context_processors.py
        ├── manage.py
        ├── settings.py
        ├── templates
            ├── base.html
            ├── cleavage-prediction.html
            └── home.html
        ├── urls.py
        ├── views.py
        └── wsgi.py


/.gitattributes:
--------------------------------------------------------------------------------
 1 | # Auto detect text files and perform LF normalization
 2 | * text=auto
 3 | 
 4 | # Custom for Visual Studio
 5 | *.cs     diff=csharp
 6 | 
 7 | # Standard to msysgit
 8 | *.doc	 diff=astextplain
 9 | *.DOC	 diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot  diff=astextplain
13 | *.DOT  diff=astextplain
14 | *.pdf  diff=astextplain
15 | *.PDF	 diff=astextplain
16 | *.rtf	 diff=astextplain
17 | *.RTF	 diff=astextplain
18 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | 
47 | # Translations
48 | *.mo
49 | *.pot
50 | 
51 | # Django stuff:
52 | *.log
53 | 
54 | # Sphinx documentation
55 | docs/_build/
56 | 
57 | # PyBuilder
58 | target/
59 | 
60 | # cleavepred auto-generated files
61 | data/cleavage/uniprot_dataset/annotated_seqs.lf
62 | data/cleavage/neuropred_dataset/window_simple_features.csv
63 | data/cleavage/neuropred_dataset/window_advanced_features.csv
64 | data/cleavage/uniprot_dataset/window_simple_features.csv
65 | data/cleavage/uniprot_dataset/window_advanced_features.csv
66 | data/cleavage/simple_peptide_predictor.pkl
67 | data/cleavage/advanced_peptide_predictor.pkl
68 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # asap
 2 | Amino-acid Sequence Annotation Predictor
 3 | 
 4 | Please take a look at our Wiki for a quick tutorial: https://github.com/ddofer/asap/wiki/Getting-Started:-A-Basic-Tutorial
 5 | 
 6 | Feel free to add or improve!
 7 | 
 8 | If you use us (or our code), please cite us:
 9 | 
10 | > Brandes, N., Ofer, D., & Linial, M. (2016). ASAP: a machine learning framework for local protein properties. Database : the journal of biological databases and curation, 2016, baw133. https://doi.org/10.1093/database/baw133
11 | 


--------------------------------------------------------------------------------
/data/cleavage/uniprot_dataset/extra_tracks/seqs.acc:
--------------------------------------------------------------------------------
  1 | >P0DMI8
  2 | eeee--eee-e-eeeeeeeeeeee-ee--e--ee--e--eee-eeeee-eeee
  3 | >Q8AYR6
  4 | eeeeeeee--e--ee--ee-ee--e-ee-ee--eeeeeeeee-eee-eeee-ee-ee-eeeeee---e--ee--eeeeeeeeeeeeee-ee-----e-ee--e-eeeee
  5 | >A7WNV3
  6 | eeeeeeeeeeeeeeeeee-e--ee--e------ee--ee---e--eee
  7 | >B3VZU0
  8 | eeeeeeeeeeeeee-eeeeee--ee--e---ee-ee------eee
  9 | >E7EKE0
 10 | eeeeeeeeeeee--eee-eee--ee-ee---e-eee--ee--ee-e-e-eeee
 11 | >Q2UXW0
 12 | eeeeeeeeeeeeeeeee-e-e-eee--ee------ee-ee---e-eeee
 13 | >P0CAQ4
 14 | eeeeeeeeeeeee-eeeeee--e-eee-e---ee-eee-e-ee
 15 | >P05222
 16 | eeeee-e---e---e--ee--e---e--ee-eeeeeeeeeee-eeeeee-eee--e-------e--ee--e-------e-eeeeeeeeeee-eeeeee-eeee-e-------e--ee--e---ee-eeeeeeeeeeeeee-eeeeee-e------eeeeeee
 17 | >B6D434
 18 | eeeee-e-ee--ee--e---eeeeee------e-e-eeeeeeeeeeeee-e-e-ee-e-eeeeeeeeee-e-eeeee-ee------eeeeee--e---eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-ee--ee-ee--ee-eee-ee--ee--ee-ee--eeeee
 19 | >Q01301
 20 | eeeeeeeee-eee--eeeee---e--e--e---e--eeeee-eee-eeeee--ee-ee--ee--e------ee-ee--ee-e-eeee-eeeeeeeeeeeeeeeeeeeeee--eee-ee--eeeeeeee----e-----eeeeeeeeeee--e-e-e---eeee-eee-eeeeeeeee--eeee------eee-e---e--ee-eeeee---eeee-ee-eee-eee-e-------eeee--eeeeeeee----eeee-ee-ee--e---e--ee--ee-eeee
 21 | >P0C8W0
 22 | eeeeeeee-eeeee-ee-e-ee--ee-eeee-eeeee-eee--eeeeee---eee-e-e-eeeeeee-e-eeeeee-ee
 23 | >D6C4I0
 24 | eeeeeee-ee-eee--eee--eee--ee-eee-eee-e--ee---e-e-eeeee-ee-eee
 25 | >C7DQB7
 26 | ee--eee----ee----------e-ee--e-e-eeeeeeee
 27 | >C7DQB9
 28 | eee-eeee---e------ee-e---e-eeeee-eeee--ee-ee--eeeee-ee--ee
 29 | >D6C4J2
 30 | eeeee-eee-eeee-eeeee
 31 | >D6C4I9
 32 | ee-e-eeeee---e-eee-ee--ee--eeeee-eee-eeee-eeeee
 33 | >Q9BPJ1
 34 | eeeeeeeeeeee-ee-eee-eeeeee--ee-ee--ee---e-ee-ee
 35 | >Q9BPI6
 36 | eeeeeeeeee-ee-eeee-eeeeeee-ee-ee--ee------e--ee
 37 | >P0C1N7
 38 | eeeeeeeeeee--ee-eeeeeee-eeeee--e-e-eee-e----e
 39 | >C1J5M7
 40 | eeeeeeeeeeeee-eee-eeeeee--eee--ee--eeeeeeee--e--eee-e--eee
 41 | >P0C7I1
 42 | eeeeeeeeeeee-ee-eee-eeeeee--eee-ee--eeeee--eee-eee---ee
 43 | >Q5K0B9
 44 | eeeeee-ee-ee-eeeeeeeee-ee-eee----ee-e--------e---------e-e
 45 | >Q3YEF1
 46 | eeeeeeee---eeeee--e-eee-ee--ee-eeee--e-e---------e--------------eee
 47 | >Q9XZK9
 48 | eeeeee--ee---e--ee-eeeeeee-eee-eeeeee-eeeee--------e-e
 49 | >P0C8V5
 50 | eeeeee-ee--ee-eee-eeee-ee-eeeee--ee------eee----e---e------ee
 51 | >P0CB09
 52 | eeeeee--e--ee-eee-eeee-ee-eee--eeeee--e--eee--ee-----e-e
 53 | >P58913
 54 | eeeeee-ee--ee-eee-eeee-ee-eeee---ee------ee-----e---e---eee
 55 | >P69762
 56 | eeeeee-ee--ee-eee-eeeeeee-eee--e--ee-----ee--ee--ee-e-e
 57 | >Q9UA72
 58 | eeeeeeeeeeee-e-eeeeee--ee-eeeee---eeee--e-e-eeee-e
 59 | >Q3YEF4
 60 | eeeeeeeee--e--eeeeeeeeeeeee-ee---e-eee-e--e-e-ee--eeee
 61 | >Q3YEF9
 62 | eeeeeeeeeeeee-e--eeeee-ee--eee-ee-------e-e--eee-ee---eee
 63 | >G1AS83
 64 | eeeeeeeeee-ee-eeeeeeeeee-ee--eeeeee-ee--e----e-eeee-ee
 65 | >P0CY65
 66 | eeeeeeeeeeee-eeeee-eeeeeee-eeeeee---ee-e---e-eeeeee-ee
 67 | >D2Y4A1
 68 | eeeeeeee--eeeee-e-ee------ee----e-e
 69 | >B2KPN7
 70 | eeeeeee-eee-eeeeee-eeeeee-ee--eeee-e-eeeeeee---eee
 71 | >Q17AN4
 72 | ee-e-eee-eeeeeeeeeeeeeeee--ee-e-e-eeeeee-ee--ee--eee-e-e--e---ee-ee--ee--ee-ee-------e-e--ee-eeee-e-eeeeeeee--ee--eee
 73 | >A0NDK8
 74 | ee-e-eee-eeeeeee-eeeeeeeeeeee-ee-e-ee--ee--eeee-ee-eeeeee--ee--eee-e-e-----ee-eeeee--e--eee-ee-eee------------eeeee------e-eeeee-eee-e-eeee-ee-eeeeeeee
 75 | >Q9BH75
 76 | eeeeeee-e----e--ee-ee--e---e-ee--eee---eee
 77 | >Q9BPD6
 78 | eeeeee-eee--eeeeeeee-eeeee--------e---e--eeeee
 79 | >Q9BH86
 80 | eeeeeee-e--e-ee--ee--ee-eee-e--ee-e--eee
 81 | >Q3YEH6
 82 | eeeeeee-e-ee-eee-ee--e--eeeee--eeeee-ee
 83 | >Q1A3R0
 84 | eeeeeee-e-ee-eee-ee--ee--eeee--eee----ee
 85 | >P0C641
 86 | eeeeeeeeeeee-ee-eee--eee-eee--eee------ee
 87 | >Q3YEH2
 88 | eeeeeee-eeee--ee-ee--e--eeeee--eee---eeee
 89 | >P69766
 90 | eeeeeeee-e----eee-ee--ee-eeee----eee---e
 91 | >Q9BPH2
 92 | eeeeee-ee-eeeeee-ee----eee----ee---eee
 93 | >O17512
 94 | eeeeee----eee-ee--ee--ee-ee-ee---e--ee--ee-eeeee
 95 | >P0DJC4
 96 | eeeeeee-eeee-eee-eee-e-ee-ee---ee----eee
 97 | >B5A9S9
 98 | eeeeeeeeeeee-eeeeeee-eee--e-e-----eee
 99 | >A4H222
100 | eeeeeee-eeeeee-eee-eeee--ee--ee------eeeeeeeeeeeeee-e-ee-eee--------e---ee-e-eeeeeeeee-eee-eee-eeee-eeee
101 | >Q8N687
102 | eee-ee--eee-ee-eee--eeee-e---eee-e---e-eeeee-ee-eeee-ee-e---e-ee-ee--e----e-e-e---eeeeeeeeeeeeee-ee-eee--eeeeee-ee-eeee-e-ee-eeeeee--eee
103 | >Q9BYW3
104 | ee--ee--ee-e--eee-eeee-e-ee---e--eeee---e-eeeee------eee-eee---e-e--ee----ee---ee-eeeeeeeee
105 | >Q9BEE3
106 | ee--ee--ee-e--eee-eeee-e-ee------eeee---e-eeeee------eeeeeee-------------------e---eee---e--e--e-e-eeee
107 | >Q9H1M4
108 | eee--ee--e-e-ee--e-ee-ee---ee-e----e-ee-ee-ee-eeeeeee-ee---e-eeeee--ee-ee-eeeee
109 | >P0C8A2
110 | eeeeee-ee--eeeee-eeeeee-------e--eeeeeeeeeee-----e-e-ee-eeeeee-eeee-e-e---eeee
111 | >P0C8A1
112 | eeeeeeeeeee-eeee-ee-ee---------eeeeeeeeeee-ee---e-e---e--ee-eee-eeeeee-e---eeee
113 | >Q17UZ0
114 | eeeeeeeeeeeeeeeeeeeeeee--e---e-ee--ee--ee-ee-eeeeeeeeeeeee
115 | >O93222
116 | eeeeeeeeeeeeeeeeeeeeeee--------------e---e--e-eeeee
117 | >O93224
118 | eeeeeeeeeeeeeeeeeeeeee---e--e---e---e---e--ee-eeee
119 | >O93454
120 | eeeeeeeeeeeeeeeeeeeeeeee---e--ee------e-----eeeee
121 | >O93453
122 | eeeeeeeeeeeeeeeeeeeeeee--ee-ee-eee-eeeeeeeeeee--ee--eeeeee
123 | >Q6XMH8
124 | eeeeeeeeeeeeeeeeee-e-eee-ee-eee-ee---eeeeeeee
125 | >Q19165
126 | eeeee-eee-ee-ee---e--eeeeeee-e-ee-ee---e--ee--eeeeee--eeee-e-ee-ee---e--eeeeeeeee-e-ee-e-------eeeeeeeeeeeeeeeeee-e--e------e--eeeeeeeee-e-eee-----e-ee
127 | >Q9XVX1
128 | eeeeeeeee-e-e--eeeee-e-e--eeeee--e--eeeeee-ee-eee-eee-e-eee-e--ee-eee
129 | >O44185
130 | eeeee-e---ee-eeeee--e--eeeeeeeeeeeeeeee-----ee-eeeee-----eeeeeee-----ee--eeee-----e-ee-ee-----ee-eeeee-----e-eeeee-----eeee---------eeee--e-eee
131 | >O17058
132 | ee-eee-ee-eee---e--e-ee-eeee-eeeee---e--eeee
133 | >P80398
134 | eeeeeeeeeeeee-eeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeee-ee-eeeeee
135 | >P85070
136 | eeeeeeeeeeeeeee-ee-ee-ee-------eeeee-----eeee
137 | >P61516
138 | eee-ee-eeeeeeeeee-eeeee-eeeeeeeee-----eeeeeee-e--e---e--eeee-e---ee
139 | >Q801Y3
140 | eeeee-eeeeeee-eeeeeeeeeeeeee-e-eee-e-eeee--e--e---e--eeee-----ee
141 | >A1Z0M0
142 | eeeeeeee-eee----eee-e-ee-e-e---eeeeeeeeee-e---e--eeeee-----ee
143 | >Q9VT52
144 | eee---ee-ee--ee---e---ee-eee-eeeeeee-ee-ee-e---e--eee-e---e--ee-e---ee--ee--e-ee--e--eeeeee
145 | >Q7KUD5
146 | eee-ee--ee--e--e---ee-------ee------e-ee--ee-eeeeeee-ee-ee-eee-ee--ee--eee-e-ee-eee-ee
147 | >Q9W4Z4
148 | eeeeeee-eeee---eee-ee--ee---e-e-e-ee--eeeeeeeeeee-ee-ee---eeee-e-ee--ee-ee
149 | >K7ZGS2
150 | eeeeeeeeeeeeeeee-e-e--ee--e---ee------eee
151 | >E4Z7G0
152 | eeeeeeeeeeeeeeeeeeeeeeeeeeeee--ee--ee-e-eee
153 | >C5J893
154 | eeeeeeee-e-eee-ee--e---eeeee-ee-eee-ee
155 | >Q16998
156 | eeeeeeee-ee-eeeee--e-ee-eeeee-ee-ee-ee-ee-ee-ee-ee-ee-eeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-eeeeeee-ee-ee-ee-ee-ee-ee-ee-ee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
157 | >Q58T45
158 | eeeeeeeee-eeee-eeee--ee-ee-ee---e--ee--ee--ee--e--eeeeeeeeeee--ee-ee--ee-ee-e-eee-eeeeeee-eeee--e--eee-ee----------e--ee--eeee
159 | >C5J897
160 | eee---e--ee--ee--eeeeeeeeeeeeeee-ee--ee-eee-eee-ee--eeee
161 | >Q718F4
162 | ee----ee---------eeeeeee-eee-eeeee-eeeeee-ee--ee-eee
163 | >P0DJO3
164 | eee----ee-ee---ee--ee-ee-ee-e-ee--eee-ee---e--ee--e
165 | >P0DJ02
166 | e-e---ee-ee--eeee-ee-ee-eee-eee-eee--e--ee-ee
167 | >L0GCV8
168 | eee-e--ee-ee--eeee-ee-ee--e--eee-ee-ee--eee-ee-eee--ee-e
169 | >I0DEB3
170 | eee---e--ee--eeee-ee-ee-eee-eee-eee--e--ee-ee
171 | >P0C8W1
172 | e-e---e--ee--eeee-ee-e-ee-ee--eeee-eee--e--ee-ee
173 | >P0DJ03
174 | e-e---e--ee--eeee-ee-e-ee-ee--eee-eee--e--ee-ee
175 | >F1CJ89
176 | eeee--ee--eeee-ee-eeee
177 | >Q9U662
178 | eeeeeee--e---e-eee-eeeeeee-eee--ee--e-eeeee--ee----e-e
179 | >Q9U657
180 | eeeee---e--ee-eee-eeeeeee-eeeee-eee-e--e-e----e---e---------e
181 | >Q9BP77
182 | eeeeeeee-eeeee-eeeee-e--ee-eee-e--eeeeee----e--eeeee------eeee
183 | >Q9BPC6
184 | eeeeeeeee-eee-e---eeeee-ee--e-e-ee--e--eeeee--ee--e-e-eee
185 | >Q9BPC3
186 | eeeeeeeeeeeee-e--eeee-eeeeee-e---------eeeeee---e---e---ee
187 | >Q9BP62
188 | eeeeeeeeeeeee--e-eeeee-eee--eee-ee--eeeeee---e-eeee-e-----ee
189 | >Q9BP65
190 | eeeeeeeeeeee-e-eeeee-eee-eee-eee-eeeeee--eeeeeee-eee-ee
191 | >B6DT16
192 | ee------e--eeee-ee-eeee-ee-eeeeee-ee-eeee-ee-eee--ee-eee--e--eeeeeeee
193 | >P0DKU2
194 | e---e--e---eeee-ee-ee--eee--e--ee-ee-ee-e-eeee-e-e-ee-------e--ee--e--ee-eeeeeeeeeee--e----------e-e--e-e-eee-eee-ee-eee-ee-eee-ee
195 | >Q16N80
196 | eeeeeeeeee-ee--eeeee-ee-eee--------e--ee---ee--ee--ee-eee-----eee-ee--ee--ee----------e-eeeee------e-ee-eeeeeeeeeeeeee-e--ee-eee-e--ee-eee-e-e-eee-ee
197 | >Q0VZ39
198 | eeeeeeeeeeeeeeeeeeeeeeee--e--ee--ee--e--eeee
199 | >P85882
200 | eeeeeeeeeeeeeeeeeeeeeeee--e--ee--e--ee--eeee
201 | >A8B5P7
202 | eeeeeeeeeeeee-eeee-ee---e--eeee-ee--ee---e-eeee
203 | >P31394
204 | eeeeee----eeee-ee--eeee-ee--ee-eeee-eee-eeee-e-e---e--eeeee-ee---e-eeeee-eeeee--e-eee-eeeee-eeeeee-e-e-eee-eeee-e-e-e-ee--eeeee-----e-eeeeee-e--ee-e--eeeee-eeee--e--e-e-eee-e-ee-e---e-e-eeeee-----e-ee-ee-e---------eeeee-------ee-------e--eeeee---------eeee-ee-ee-e-eee-e-ee--eee----------eee-eeee--------ee---ee--eee-e-e-----eeeeeeeeeeeeeee-----e----eeee-eee-eee--e------eeeee----e---------e-ee----------e-ee-eee---------e--e--eeeeeeeeeeeeeeee
205 | >D3UA80
206 | eee-e-eeee-eee--ee-ee--ee--ee--ee-e-e-eee
207 | >P83719
208 | eeeeeeeeeeeee--eee-ee---e---eeeeeeee-eee
209 | >P08950
210 | e-ee--ee-ee-eeeee-eee-ee----e---eee-eeeeee-e---ee--eeee
211 | >Q99109
212 | eee-e-ee-eeeee-ee--eee-ee-ee-e---e--eeee-ee-ee--ee-eee-e----eeee--e-ee-e-e----eeeeeeeee-ee--e---e--eeeeee-------eeee-e--eee-e-eeeeee-ee-e-eeeeeeeeeeeeeee-e--e--ee-ee--ee-ee------e-e-eeeeee-----e---e--e--eeeeeee----eeeee-e-eeee------e------e-eeeeee----e-eee-e-eeee--ee-ee---e----ee-e-e-ee-eee-ee-ee-eee-e-e--eee---e-ee-eee-ee-eee-ee-e--ee-ee-eeeeeeee-e-eeeee--ee-eee-ee-eeeeeeee-eeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeee-eeee-----e--e-eeeee--e-eeeeee---e----e-e-e
213 | >A6MWS8
214 | ee-----eeeeeeeeeeeeeee--ee-e-ee----ee-eeee-eeeeeeeeee
215 | >Q9SE35
216 | ee-ee--eee-eeeeeeeeee-eeeeeeeeeeee-ee-eeeeee-eeee-ee-e-eeee-ee-e-eeee-ee-eeeeeeeeeee-e--eeeeeee----eeeeeee-ee-eeeee--ee--eeeeeee-ee-eeeeeeeee--eeeeeee-eeeeeeeee--eeeeeee-eeeeeeeeee-eeeeeee-eeeeeeeee--eeeeeee-eeeeeeeeee-eeeeeee-eeeeeeeee--eee-eeee
217 | >Q9YGH3
218 | eee-eeeeeeeeeeeee-ee-ee---e------e--ee-e-eee--e-e-eeeee-eee-eeee--ee-ee-eeeeeeee-ee---ee-eee
219 | >Q9YGH4
220 | eeeeee-e-e-e-ee--ee-ee-ee--ee--ee--ee--ee-e-ee-e-eee-ee--ee-ee-e-e-eee-eeeee-eeeeeeee-ee---ee-eee
221 | >Q91194
222 | e--eeeeeeee-e---ee--ee-e---e-eee--e---ee--ee-eeeeeeeeeee-eeeee---e--e--eeeee-eeeeeeee-ee---ee-eee
223 | >A7WNV7
224 | eeeeeeeeeeeeeeeee-e-e--ee--e---e--ee-eee
225 | >P79875
226 | eeeeeeeeeeeee-eeeeee-eee--e---e--e--eee
227 | >P00735
228 | ee--eeeee-ee--e--ee--ee--ee-eeee--ee--eee-eeee-ee--eeeeeeeeeeeeee--ee-eeeee-eee-ee-e---ee-e--e-eeee---e-e--e-eeee--e-e-eeeeeee-e-ee------ee-ee------eeeeeee-----e--eeeee--eeeee-eee---e--eeeee-eee-ee-e-ee-e-eeeee--e-e-ee-ee--e--eeeee-eeeee-----ee-ee-------ee-------e-e-eeeeee-e-ee-eee--eeeeeeeeeeeeee--e--eee---eeeee--------eeeee-e--e--eee-eeee--eeee-ee----------eeeeee---------ee-----------eeee-e-e-ee----------eee-eee-ee-e-ee--e-ee--eee-----------eee-eeeee-------eee--ee--eee----------eeeeeeeeeee-e-e----------eeee-ee--eee--e-------eeeeeee----e-----------eeee---------------eeee--------e--e--ee--eeeee
229 | >A5A3H1
230 | eeee-eee-eee-ee-ee-eeeee-eeeee-eeee-eeeeeeeeeeeee-e
231 | >D5KXG5
232 | eeeeeeeee--ee-ee--e--eeee-e-eeeeeeeee-------ee---e-eeeeeee
233 | >E2S064
234 | eee-ee-ee-eeee-eeeeeee-e-eeee-eee--ee-eeeeeee-eee-eee
235 | >P07198
236 | eeeeeeeeeeee-ee--e---ee-eee-ee--e-e-ee--eeeeee--ee-eeeeeeeeee
237 | >Q8AYR5
238 | eeeeeeeeee--ee---ee--------eeeee--e--eeeeee-eeee-ee-ee-eeee--e---e--ee-ee-eeeeee---e-e-----e-ee----eeeee
239 | >Q805D4
240 | eeeee-e-eeeeeee-ee-ee--eee-ee-eeeeeeeeee-eeeeee-eee-eeeeeeeeeeeeeeeeeeeeeee--eeee-eeee-e-eeee---e--ee--ee-ee-eeeeeeee-------e-ee--e-eeeee
241 | >Q805D3
242 | eeeeeeeee-eee-e-ee--e--eeeeeeeeeeeeee--e--ee-eee-e------e--eeeeeeeeeee-eeeeeeeee-------ee---e--eee
243 | >Q5SC60
244 | eeeee-e-ee-eeeeeeeeee-ee-e-eeee-e-eeeeeee-----eeee-e--------ee-------ee-e---------ee-ee-ee------eeeeeee-ee-eee-e-ee-ee-e-ee-e---ee-eee-eeee-----e-eeeeeeeeeeee---------ee----ee-e
245 | >P80111
246 | eeeee-eeeeeeeeeeee-ee-eeee--ee-eee-eee--ee--ee--ee-ee-eee-ee-eeee-----e--eeeee-eee
247 | >Q0MWV8
248 | eeeee-eee-eeee-e--eeeeeeeeee-eeee-eeeeee-eeeeee-ee-eee-ee--eee
249 | >P0C7P5
250 | eeeeeeeeeee-eeee-e-ee--eeeeeeee-ee-e-ee--eeeeeee-eeee-eeee-eee--e---ee-eee-eeee-e--eee-eeeee--e-----e--ee-eee-ee--ee-ee---------e----------e--ee--ee---e-----e-ee--eeeeeee
251 | >P68515
252 | eeeee-eeee-eeeeee-ee-e-ee--eeeeeeeeee-ee-e-ee-eee-eeeeeee-e--e--eeeeeeeeeeee-e-ee--eeeeeeee-eeee-ee--eeeeeee--eeee-eee--e---ee-eee-e--e-e----eeeeeeeee-e--eeeeeeee--e------e--ee-eee-ee--ee-ee----e------e----------e-eee-eeee-e-----e-ee--e-eeeee
253 | >B0VXV8
254 | eeeee-eeeeeeee-ee-e--e--eeeeeee-ee-eeee-eeeeeeeeee-eee------ee-eee-e--eeee--eeee-eeeee-ee-eee-eeee---------e--ee--ee-ee--ee-e-----e-----------------e-eee-eeee-e-----e-ee--eee-eee
255 | >P0CB12
256 | eeeeee-eeeeeeee-ee--e-eeeeee--e-ee-eeeeeee-e
257 | >E3PQQ8
258 | eeeeee--ee-e-eeeeeee-eeee-eee------e----eeeeeeee-eeeeee--e--eeee--ee----eeeee-eee-ee-eeeeeeeeeee-ee--e--ee--eeeeee-ee--eeeeeee-eeeeeeeeee--ee---------eeeee----e--ee-ee--eeeeeeee--ee-eeee
259 | >P50145
260 | eeeeee-eeee-e-eeeeeeeeeeeeeee-ee-eeeee-eeeeeeeeee--e---e--ee-eeeeee-e--e-ee-ee-e-eee-ee----------ee--ee-e-ee
261 | >E2E4E4
262 | eee-e-eeeee--ee-eeeeeeeeeeeee-ee-ee-eeeeeeeee--e---e--ee-ee-eeeee-eee-ee----------ee--ee-e-ee
263 | >D6C4K5
264 | eeeeeeeeeeeee-eee-eeeeeee-eee-eee-e---e----e----------e-ee
265 | >C7DQC2
266 | eeeeee--e-e-e----e--e-e-eee-e-eeeeee-ee--eee
267 | >C7DQX6
268 | eeeee-e-eeeee--ee----e--eee-eeeeeeee-ee--eeee
269 | >C7DQB8
270 | ee--eeee---ee-e--ee--------ee--eeeeeeeeeee
271 | >Q9BPJ8
272 | eeeeeeeeeeeeeee-eee--eeeeee-ee-ee--e-e-e-e--e-ee
273 | >Q9BPH5
274 | eee-ee-eeeeee-eeeee-ee--e--ee---------eee----e
275 | >Q5EHP2
276 | eee-ee-eeeeee-eeeeee----e-eeee--e-e--ee--e--ee
277 | >P60207
278 | eeeeeeee--eeeee-eee--eeeee--eee
279 | >D5L5Q7
280 | eeeeeee-ee-ee-eeeee--e-e-eeeeee-ee--e---eee
281 | >Q3YEG3
282 | eeeeeeee-eeeeeeeeeee-e--e--ee-eeeeee-ee-eee--e--e-eeeeee--ee
283 | >Q9U654
284 | eeeee-ee--ee-eee-eeee-ee-eee--eeeee-ee-eee--e-e----e-e
285 | >Q9UA95
286 | eeeeeeeeeeee-eeeeeeee-eee-eeeeee-eee-e---e--ee-eee-eeeeee
287 | >G1AS75
288 | eeeeeeeeee-e--eeeeeeeeee--e-e------e-eee-e--eee--ee--eeee
289 | >G1AS80
290 | eeeeeeeeeeeee-e--eeeeee-ee--eee--e--ee-eeeee--eee-eeee-eee
291 | >P10000
292 | e-eeeee-ee-eeeee--e--e--eee-eee-ee-eeeeeeeeeeeeeee------eeeeeeeeeeeeeeeeeeee--------eeee-eeeee-e---ee-eeeeeeeee-e-e-eee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-e-e----ee-eeeee-e---ee-eeeeeee--e--ee--eeeee
293 | >Q9XYR5
294 | ee-ee--ee--eee-e-e-------eeeeeeeeeeeeeeee---e--eeeeeee
295 | >Q9NDA7
296 | eeeeeeeeee-eeeeeeee--ee--e--eeee-eeeee-e
297 | >P0C907
298 | eeeeeee-e--e-eee--ee--eeee--ee-ee--e
299 | >P0C906
300 | eeeeeee-e-ee-eee-e-e-ee--eee--ee-----eeeee
301 | >Q3YEH1
302 | eeeeeeeeee-e-ee-eee-ee--eee-e-ee--eee---e--eeeee
303 | >P69767
304 | eeeeee-e-ee-eeeeee--ee--ee-e--eeeee--e
305 | >C0KYC3
306 | eeeeeee-e--e-eeeeeeeeeee--e-------e-e
307 | >Q1A3Q5
308 | eeee-e--eeeeeeeeee-e---e--eeeee--eeee-e--ee
309 | >Q9BPH1
310 | eeeeeee-eeee-eeeeeeeeeeeeee----ee-ee
311 | >D2Y493
312 | eeeeeeeeeeeeeeee-e--e-eeee-------e-e-ee-eeeee---ee--eeeee
313 | >D6C4L9
314 | eeeeeeeeeeeeee---------eeee-eeeee--eeee-e----ee-ee-ee-e
315 | >B3FIA6
316 | eeeeeeee--ee--ee-eeee-e-ee--ee-eeeee--eee-e-eeee-e--eeee
317 | >O77256
318 | eeeeeeeeeeeee-eee-eeeeeee-eeeeeeee
319 | >P0C8A9
320 | eeeeeeee-eee-eeeeeeeeee--ee-e--e-eeee--eeeee------ee-eeeeee-ee---e-eee--------ee--eeeeeee
321 | >O93456
322 | eeeeeeeeeeeeeeeeeeee-ee-ee-----eeeeeee-eeeeeeeeeeeeeeeeeeee-ee---e-eeeeeee-eeeeeeeeeeeeeeeeeeee-ee---e-eeeeeee-eeeeeeeeeeeeeeeeeeee-ee---e-eeee-ee-eeeeeeeeeeeeeeee--e------eee-eee
323 | >B3IUE0
324 | ee--e--ee-eeee------ee-e-e-----------e-eeeeee--e-eeeeeeeeeeee-ee---ee--e--eee-e-ee--ee-ee-eee--e--ee--ee-eeeeee-ee---e-ee
325 | >O93451
326 | eeeeeeeeeeeeeeeeeeeeeee-ee--ee-ee---e--e---ee-ee--eeeee
327 | >O93223
328 | eeeeeeeeeeeeeeeeeeee-eee--ee-ee--ee--ee--ee-ee--eeeee
329 | >P81490
330 | eeeeeeeeee-eeeeeeeeee-ee--ee--ee--ee--ee-ee--eeeee
331 | >Q9XWV7
332 | eeeeeeee-e-e-eeeeeeee--e---eeeeeeee---e--e--e--ee--e---e--ee--eeee---eee-eeeeee-eeeeee--e-eeeeee--e--eeeee--e-eeeeee--e-eee
333 | >Q9N4V0
334 | eeee--e-eeee-eee--e--e--eeeeee------ee-e--------eeee--eeeeeeeeeee-----e--e-e-eeeeee--------eee-eeeeee--------eeeeeeeeee--------e-eeeeeeeeeeee--------eee--------e-ee-eee--e--eeee-e---e-eee
335 | >Q8MPY9
336 | ee---e----eee-eee-eeee--eeee-eee-e-ee---e--eeeeeeee-e--ee----eeee
337 | >A8WU84
338 | eeeeeeee---eeee-ee--e---e-eeee-e-e-e--eeeeeeee----------ee-e-e--ee
339 | >Q90W78
340 | eeeeeeeeeeeeeeeeeeeeee-eee-ee-eeeee-ee--eee-eeeeee
341 | >Q9VT50
342 | eee-e-e-ee--ee-e-e---ee-ee--e-----------eeeeeeeeeeeeeee-eee-eee--e-ee-e-ee--------e-e---------ee-eeeee---ee--ee--eeee--e--eee
343 | >Q9VT51
344 | ee--ee-ee--ee--ee--e--eeeee-e--eee-ee-e-----eeeeeeeee-ee--e----e-e-eee--e------eeeeeeee--ee--eee-e-ee-ee--ee-ee
345 | >Q9VT53
346 | eee---e--ee--e-----e-eee-eeeeeeeeee-ee--e---e--ee-e-e-eeee-eeeeeeeeeeeeeeeeeee-ee-ee--ee--ee--eee-e-ee--ee-e
347 | >G3ETQ2
348 | eeeeeeeeeeee--eee-eeeee---e--ee--ee-e-e-eeee
349 | >P0DJ35
350 | eeee-eeeeeee-eeeee---e---eeeeeeee-eee-ee
351 | >P08947
352 | eeeeee-ee-eeee--ee---e----eeeee-eeeeeeeeee-e-----eeee-e-eeee
353 | >Q17093
354 | eeeeee---ee-eeeee----eeee-----ee-eeee-----eeee-----ee-e-ee-----eeee-----eeee-ee-----eeeeeeeeeee-----ee-e-ee-eeeee-eeeee-eeeeee
355 | >Q25060
356 | eeeeeeeeeeeeee-eeeeeeeee-eee-ee-eeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeeee-ee-eeee-eeeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeee-eeeee-eee--eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeee-eeeeeeeeeeeeee-eeeeeee--eeeeeeee-eeeeeee-eeeeeeeee-eeeeeeeeeeee-eeee-eeeeeeeee-ee--eee--eeeeeeee-eeeeeeeeeeee
357 | >Q09180
358 | eeeee--e--ee-ee--e-----ee-eeeeeee-eeeeeeeeeeee--e--e---e--e-eeeeeee-eeeeeeeeeeee--e--e---e--e-e-eeeee-eeeeeeeeeeee--e--e------e-eeeeeee-eee-eeeeeeeeeeee--e---------eeeee-ee-e-eee-ee
359 | >C9X4J0
360 | ee-ee--ee--eee--ee-eee-ee--ee--eee-eeeee-eee-eeeeeeeeee
361 | >R4JNJ5
362 | ee-ee--ee-ee---eee-eeeee-ee--eee-eee--e--ee--e
363 | >I0DEB5
364 | eeee--ee--e--eeee-ee-ee-ee--eee-eee--e--ee-ee
365 | >D9U2B8
366 | ee---e------e--eeeeeeeeeee-eeeeee--eee-e-ee--ee-eee
367 | >G1FE62
368 | ee----ee--e--ee--eeeeeeee-eee-e--ee-eeeeee-ee--ee-eee
369 | >P0CI89
370 | eee---e------e--eeeee-e-eee-eeeeee-eeeeee-ee--ee-eee
371 | >B8XH50
372 | ee-e---e---ee--ee-e--e-----e-eeee-ee-ee--eee
373 | >L0GCJ6
374 | eeeee-e-eeeeeee--eeeeeeeee-e-eeeeeeeeeee--eeee-eee-------eeeeeeee
375 | >P43443
376 | ee-eee-ee-ee-eee------e--eeee-eee-eeeeee-eeee-eee--e--e-e-eeee-ee-eeeeeee-ee--ee--ee--eeeee
377 | >Q9BPA3
378 | eeeee--e---ee-eee-eee-eee-ee-eee------eeeee---------e-e
379 | >Q9BP87
380 | eeeeeeee-e--e-ee--ee-e-eee-ee-eee-e-e---ee-e-ee-e-ee-eee
381 | >Q9BPA0
382 | eee-eeeeeee--ee--ee-eeeeeeee-eeeeeee-eeeee---e---e--eee---e-e
383 | >Q9BP90
384 | eeeee-ee--eee-ee---e-eeeeee--ee------eeee--ee-------ee
385 | >Q9BPA7
386 | eeeeee-eee-ee-eeeeeeeeeeeeee-eee-ee-eeeee--e---eee-e
387 | >Q9BP70
388 | eeeeeeee-eee-e-e-e-eee-eeee-eee-ee--eeeeee--eeeeeee---e--ee
389 | >Q7PTL2
390 | ee-eeeeee-ee-eeeeeee-ee-e-eeeeeeeeeee-ee-eee--------e--ee---ee--ee--ee-eee-----eee-ee--ee--ee----------eeeeeee-----e-ee-eeeeeeeeeeee-e--ee-eee-e--ee-ee--eee-e---e-ee
391 | >P85799
392 | eeee-eee-ee-eee-eeeeee-ee-----eeeeeeeeeeeeee--ee-ee----e-e-ee-ee--eeeeeeeeeee-ee-----ee-e-ee--e-eee--e--e--e------eeeeeeeeeee--eeeeeee-e-eeeee-eeeeeeee-eeeeeee-ee-eeee-eee-e-eee---------eeeeeeee-ee--eee-ee-eeeee--eeeeeee-----e-ee--ee--e-ee--ee------e--ee--ee-eee-eee-eeee-eeee-e-e---e---eeeee-----eeeeee-ee-eee-----eeeeee-e-e---ee
393 | >P86040
394 | eeeeeeeeeeeeeeeeeee-eeee-ee-ee--ee-ee-ee-e-e------ee--eeee
395 | >B3VZU5
396 | eeeeeeeeeeeeeee-eee-ee---e--e---e--ee
397 | >Q99N14
398 | eeeeeeee---eeeeee------ee-e-ee-e-e-eeeeeee-ee------ee----e--e---e---ee---ee-e-eee-eeeeeeee-eeeee-eeeeeeeeeeeeeee
399 | >M9P2C1
400 | ee-e-eee-e---ee-eee-ee-ee--ee--eeee-ee------eeeeeee---eeeeeeee--ee-eeeee
401 | >A5A3H2
402 | eeeeee-eeeee-eee-eeeeee-eeeee-eeeee-eeee-eeeeeeeeeeeee-e
403 | >P0DKP4
404 | eeee-ee-ee-e-ee--e--eee--e--eeee-ee-ee-ee-e-ee--e--eeeeee-eeeee-ee-ee-ee-e-ee--e--eeeeeee-eeee-------eeeeee
405 | >A5LHG2
406 | eeeeeeeeeeeeeeeeee--e--e-eeee-ee--------eeeeeeeeeeeeeeeeee--eeeeeeeeee-eeeeeeeeeeeeee-eeee
407 | >Q868F8
408 | eeeeeeeee-ee-----eeee-e-e-e-e----e--ee---e-e-eee-ee-eee-ee-eeee---e--eee--ee-eeeeee-e-ee-e-ee-e-eee
409 | >P20968
410 | eeeee-ee-ee--eee-eee-eeee-eeee-eee--ee-eee-e-eeeeeeeeeeeeeeee--ee--ee--ee-ee-eeeeeeee-------e-ee--e-eeeee
411 | >Q805D8
412 | ee--e--eeee-ee-ee--ee-eee-eeeeeeeeeeee-eeeeeeeeeeeeee-eeeeeeeeee-eeeeeeeeeeeee-eee-ee--e-eeeee-e---e--ee--ee-----eeeee
413 | >Q25461
414 | eeeeeeeeeee-eeeeeeeee--e--eeee---e---eeee-ee-eeeeee--e-ee-eeeeeeeeeeeee--eeeee-eeeeeeee---e--ee--ee-e--eee-eeee---e--eeeeeeeeee---e--ee--ee-eee--ee-ee------e--ee-ee--ee-eeeee
415 | >P01021
416 | eeeee--eeeeeeee-eeeeeeee-ee-ee-ee-ee-e-eee-ee-eeeee-ee-eeeee-eeeeeeee-ee-eeeee-eee-eeee-ee-ee-eeeeeeee--e--eeee-eee------ee--ee-e--------e-e-eeeeee-ee--ee-eeeee-e----------e--eee-ee--ee-ee----e--------------e--e-eee-eeee-e-----e-ee--eee-eee
417 | >P21591
418 | ee--ee-eeeee-eeeeeeee---------eee-eeee-ee-ee--ee--ee--e--eeeeeeeeeeee-ee-----ee--ee--eeeee
419 | >A0AEI5
420 | eeeeeeeeeeeee--ee-ee---ee--ee--ee--ee--ee-e-e-eeee
421 | >P50983
422 | eeeee--eeee---e-e
423 | >P05226
424 | eeeeee-e---e---e--ee--e------ee-ee-ee-eeee---eeeee--------eeeeeee-eee--e-------e--ee--e-------e-eeeeeeeeeee--eeeee---------eeee-e--------eeeeeee-eee--e-------e--ee--e---ee-eeeeeeeeeeeeee-eeeeee-e------eeeeeee
425 | >U5KJZ2
426 | eee-e-ee--ee--ee--eeeeee------e-eeeeeeeeeeeeeee-e-e-ee-e-eeeeeeeeee-e-eeeee-ee-e-e-eeeeeee--e---eeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeee-ee-eee-ee-ee--ee-eee-eee-ee-eee-e-----eee
427 | >Q8JHB9
428 | eee-eeeeeee-eee-eeeeeeeeeeee-eeeeee-eeeeeeeee-ee-ee---e--ee-ee-eeee----e-ee-e-e-e-ee-e--------ee--ee-e-ee
429 | >D6C4H9
430 | eeeeeee--ee-e-ee-eee-ee-eee-ee--ee-eeeee--eee--ee---ee-ee-e
431 | >C7DQX9
432 | e-eeee---e---e--eee--eee-ee--eeeeeeeeee---e--eee
433 | >P16240
434 | eeee--ee-eeee-eeeee-eeee-eee-e--e---e--ee-eee--ee-eeeeeeeee-e--e---e-ee-eeeeeee--ee--e------e--ee--ee-eeee
435 | >Q9BPJ4
436 | eeeeeeeeee-ee-eee-eeeeee--ee-eeeeeee--e-ee-e-eeeeeee---ee---ee-ee-ee
437 | >Q5EHP4
438 | eee-ee-eeeeee-eeeee-e---e--eeee----ee-eee-e--e
439 | >Q9BPH3
440 | eee-ee--eeeee-eeeeeee---e--ee--ee-eeee-e-ee
441 | >Q3YEG4
442 | ee-eeeeee-e---eeeee-e-eee-eeeee--eeeee--ee--ee-eeeee
443 | >P0C833
444 | eeeeeeeeeeee-eeeeee-e--ee-eeeeee-eee-e--ee--ee-ee---e------eeeee
445 | >P0CH24
446 | eeeeeeeeeeeeeeeeeee-ee-ee-ee--ee-ee----eeeeeeeeee
447 | >Q2I2P1
448 | ee-eee-ee---e--eee--ee--eeeeee-e-eeee-e-e----e--eeeeeeeee-eeeee
449 | >Q9NDA6
450 | eeeeeeeeee-eeeeeeeeeeee--e--eeee----ee-e
451 | >Q9GU58
452 | ee-eee-eeee-eeeeee-eee-ee------eee-eee-eeeee-eee-----e--eeeee
453 | >Q800R2
454 | eeeeeeeeeeeeeeeeeeeeeeeeeee---e---e--ee--ee--e---eeee
455 | >Q9BPE7
456 | eeeeeee-e-ee-eee-ee--ee--eeee--eeee---eee
457 | >P0C667
458 | eeeeeee--ee-eee-ee--ee--e-ee--eeeee--e--eeee
459 | >P0C8A4
460 | ee-eee-eeeeee-e-eeeeee-e---eeee-ee-e-e--e-eeee-eeee---ee--e-eeeeeee-eeeee---ee
461 | >P0C8A5
462 | eeeee--e-eee-eee-e-eee-ee--eeeee---ee---eee
463 | >B3IUD8
464 | ee--e--ee-eeee------ee-e-e-----------e-eeeeeeeeeeeeee-ee---e---e---ee-eeeeeee-eeee-e--ee--ee-eee--e--eeee-eeee
465 | >A7WNV5
466 | eeeeeeeeeeeeeee--ee------ee--e---e---ee-ee-e-e----e-eeee
467 | >C6EVG1
468 | eeeeee-eeeeeeeeee--ee-eeeeeeeeeeeeeeeeeeeeeeeeee-eeee-eeeeeeeeee
469 | >Q23212
470 | eeeee-e---e--e----eeee-eeeee-------eeeeeeeeeeeeeeeee----------eee----------eeeeee--eeeeeeee-eeee--------e-eeeeee-------eeeee---------e--eeeee-----e-eeeeee---e-ee
471 | >Q7YX32
472 | ee-eee------ee-ee-eeeee--eeeeeeee-e---e-eee-e---e-eee-e---e--eeeeeee
473 | >Q21156
474 | eee-eeeeee-ee---e--e-eee-ee---e--eeeeeeeee-eeeee-eeeeeeeee---e--eeee-ee-ee--e--ee-eeeeee
475 | >Q8ML70
476 | eeeee-eeee------eeee------eee-e-e-ee-e-e-ee-eee-e---e-ee------eeeeeeeee----------e-eeee--ee-e--ee-eeeee---e-eee-ee-ee-e-----eeeeee-e-e-ee---e--eee-----eeeeee-e-e-ee-----eeee------eeeeee-e-eeee--ee-ee----eee----e-eeeee---eee-e-----eee--ee--e-ee--ee-e
477 | >E6ZBE2
478 | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeee-ee-----------eee
479 | >R4JQZ0
480 | ee----e---e--eeee-ee-ee-ee--eee-eee--e--ee-ee
481 | >P0DMF9
482 | ee-e---ee-eee-eeeeeeee--e--eee--ee-ee--eee
483 | >Q9BP85
484 | eeeeeeeee-e--e-ee--ee-eeeeee-eeeeee-eeee--ee-----eeee---eee
485 | >Q9U660
486 | eeeeee-ee--eeeeee-eeee-ee-eee--eeeee-e---ee--eee------e-ee
487 | >Q9NL82
488 | ee-e--eee-eee-ee-e---ee-e-ee-----------e---e--eeee-e-ee---ee-e-ee-e-ee---ee-e-e----ee---ee-eeee-e-ee---ee-e-ee---ee---ee-e-ee---ee---ee-e-ee---ee---ee-e-eee--ee---ee-eeee-e-eeee--eeeeee-ee---ee-eeee---ee---ee-eeee-e-ee-e-ee-eee---ee-eee----eeee-e
489 | >G3F828
490 | eeeeeeeeeeee--ee--eee--ee-ee--ee---e--ee-e-e-eeee
491 | >Q91826
492 | eeeeeeeeeeeeeee--e---ee--e--ee--e---ee-eeeee
493 | >P01170
494 | eeeeeeeeeee-e-e-e-ee--ee-ee-ee--ee--ee--ee--ee-e-ee-e-eee-ee--e-eeee-e-eee-eeeee-eeeeeeee-ee---ee-eee
495 | >P87385
496 | eeeeee-eeeeeee-ee--ee---e--ee-eeeeeee-ee-eee-eeeeeee-eee-e-ee-e-eeeeee-ee---ee-eee
497 | >P0DJK0
498 | eeeeee--ee-eeeeeeeeeeeee--e--e-eeeeee---e-eeeee--eeeeeeeeeeee-e-ee-eeee--e---ee---e-eeeeeeeeeee
499 | >A8YPR9
500 | eeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeee-eeeeeeeeee-eeeeeeee-e--eeeeeeeee-eeeeeeee-e-eeeeeeeeee-eeeeeeeeee-eeeeeeeeee--eeeeeee-e-eeeeeeeeee-eeeeeeeeee-eeeeeeeeee--eeeeeeeee-eeeeeeeeee-eeeeeeeeee-eeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeee-ee-eeeee
501 | >E7EKD0
502 | eeeeeeeeeeeeeeee-eee-eee---e---e--ee-eee
503 | >B3VZU3
504 | eeeeeeeeeeeeeeeeeeee-eee--e---e--ee-eee
505 | >S0F1M9
506 | ee-e-eee-----eeeee--eeee-eeeee--ee-eeeeeeeeeee-eeeee
507 | >Q6ECK6
508 | eeeeee---eee-e-------ee-e-ee-e-e-ee-eeee-ee------ee-ee-ee--ee--eee---ee-eeeeeeeee
509 | >P83580
510 | eee-ee-ee---ee-eeeeeeee-eeeeee-eeeee-eeeee-eeee-eeeeeeeeeeeee-e
511 | >Q7ZZY8
512 | eeeeee-e-e-ee-e--eeee--e-eeee-eee-----e-----------ee-eeee-e-eee--ee-eee---eeee-ee-----ee-eeeeee-e---e--e
513 | >Q09GK2
514 | eeeeeee-ee-eeee-e---ee-eee-e--ee--eeee-e-eee-ee-ee-eee-eeeeee--e-e-------e---ee-ee--ee-ee---ee-e-eeeeeee-ee----e-eeeeee-e-e-----e-ee--eee-eee
515 | >Q800I8
516 | eeeee-ee-ee--eee-eeeeeeeeeeeeeeeeeee-eeeee-eeeeee---e--ee--ee-eeeeeeeeeee-------e-ee--e-eeeee
517 | >Q805D5
518 | eeeeeeeeee--ee---ee---------eeee--ee-eeeee-eeeeeee-ee-e---eeeee--e---e--ee-ee-eeeeee---e-e-----e-ee----eeeee
519 | >Q76KW6
520 | eee-eeeeeeee--e---e---e--ee-eeeeeeeeeeee-eee-eee-ee--eeeee---eee-eeeeeeeeeeeee--ee--eeee--eee-ee-eeee-----e-ee--e-eeeee
521 | >P86093
522 | eeeeeeeeeeeeeeee-ee-ee-ee-eee-ee-eee
523 | >P20481
524 | eeeeeeeeee-e-e--ee--eeeeeee-e-eee-eeeee--e-e--ee-eeeee-e-e-eee-eeeee-e-e-eee-eeeee-e-ee-e--eeee-e-e-eee-eeeee-e-e-eee-eeeee-e-e-eee-eeeee-e-eeeee-eeeee-eeeeeee-eeeeeee-e--ee-eeeeeee-e-eee-eeeee-e-e-eee-eeeeee--e--e--eeeeee-e--eeeeeee-eee-eee-eeeee-e-e-ee--ee--e--e-eee-eeee-e-e-eeeeeeeeeee-e-eeeeeeeee-e-e-eee-ee-eeee-e-ee-e-eeeeee-e-eee-eeeee-e-e-ee-eeeeee-e-e-ee-eeeeee-e-e-ee-eeeeee-eee-eee-ee-ee-e-e-ee-eeeeee-e-e-ee-eeeeee-e-e-eeeeeeeee-eee-eeeeeeeee-eee-ee-eeeeeeee-eeeeeeee
525 | >O93464
526 | ee-eee-eeeee-e-e--eeeeeeeeee-eee--ee-ee-eeeee-eee--e---e--eeeeeeee-ee-e-ee-eee-e-ee-e--------ee--ee-e-ee
527 | >C7DQX7
528 | eeee------e-e----e--------e-eeeeeeee--ee-eee
529 | >D2DGD7
530 | eeeeeeeeeee-ee-eee--eee-ee-eeeeee-eee-e----------e--e------eee
531 | >C7DQC1
532 | eeee-ee--------------ee-eee-ee--ee-eeeee
533 | >D6C4J3
534 | ee-eeee-eee-ee--eee-eee-eee-eeeee-eee-ee
535 | >Q9BPH6
536 | eee-ee-eeeeee-eeeeeee---e--eeee--eee--e----ee-e
537 | >P01523
538 | eeeeeeeeeeee-ee-eee-eeee-e--eeeee--eeeee-eeee-eeee--eee
539 | >P0C1N6
540 | eee-ee-eeeeee-eeeee-e---e--ee---eee--e-ee-e--ee
541 | >P56529
542 | eeeeeeeeeeeeeee-e-e-eee-ee--eeeeee-----e-ee-ee-eeee--eee
543 | >P69769
544 | eeeeeeee-ee--ee-eee-eeeeee--eee-ee--e-e-ee-eeee-eeee--ee
545 | >D6C4L5
546 | ee--ee-eeeeee-eeee-eeeee--e-ee----eee---e-e
547 | >Q2I2P8
548 | eeeeeeeeeee-eeeeee--e--e--ee--eeeeeeee--ee--ee---e---eee-eeeeee
549 | >Q3YEG7
550 | eeeeeeeeeeee-eeeeee-e--ee-eeee-e-eee-e--ee--eeeee---eeeeee
551 | >Q86RA3
552 | eeeeeeeeeeee-eeeeee-e--ee-eeeeee-eeeee--eee-eee--e-e
553 | >Q9UA91
554 | eeeeeeeeee-e-eeeeeeee-eee-eeee-----------ee-eeeeeeee
555 | >Q3YEE1
556 | eeeeeeeeeee-eeeee-eeeee-e--ee--eee--------e---------e-ee----e-e
557 | >Q5K0C4
558 | eeeeeeee-e--e-eee-eee--ee--ee-eeeeee-eeeee--eee-eee---e--e
559 | >G1AS74
560 | eeeeeeeeee-e--eeee-e-ee--eeee-e-e-e-e-e-e--eee-ee---eee
561 | >Q3YEF8
562 | eeeeeeeee--ee-eee-eee-eeeeee-e-eee--e-e-eee--eeee
563 | >P0C1M8
564 | eeeee--eeee------e-eeeee--eee-eee--eee
565 | >P0CY72
566 | eeeeeeeee-ee-eeeeeeeeeeee-ee--ee-eeeee--eeeee--eee-eeeeeee-------ee--eee
567 | >D2Y495
568 | eeeeeeeeeee-eeeeeeeeeeeeee------ee-ee-eee-eeeeee--eee-e-e-eeeee--eee----e-e
569 | >Q9BH21
570 | eeeeeee-e--e-eee-ee--e--eeee-----eee-ee
571 | >Q3YEH0
572 | eeeeeee-e-ee-eee-ee--e--eeee--e-eeee--eee
573 | >Q9U6Z8
574 | eeeeeee-e-ee-eee-ee--ee-eee---eeeee--eee
575 | >Q3YEH3
576 | eeeeeee-e-ee-eee-ee--e---eeee--eee----ee
577 | >Q1A3Q7
578 | ee-eee-eee-eeeeee-eeeee-ee-ee-------eee---eeeeee
579 | >Q3YEH5
580 | eeeeeee-e-ee-eee-ee--ee--ee----eee-e--e
581 | >Q6PTD0
582 | eeeee-eeee-e--e--ee-eee-e-ee-eee--eee-e--ee
583 | >Q17313
584 | eeeeeee---eee-e---ee-ee--e---eeeee--e---eeee
585 | >P0C1D0
586 | eeeeeeee--e---ee--e---eeee-e-e-e
587 | >P0DJC6
588 | eeeeee-eeee---e-----e--e--eee-eeee-ee-eeeee-eeeee----e-e-e----e-------ee
589 | >P0DJC7
590 | eeeeee-eeeeeeeeeeee-eeeee-eeee-----eeeeee--e-----e-e
591 | >P0C8A7
592 | eeee-e--eeeeee-e-e-e-eeee-ee-e--------eeeeee
593 | >O93225
594 | eeeeeeeeeeeeeeeeeeeeeeee---e--e---e--ee--ee--eeeee
595 | >O93455
596 | eeeeeeeeeeeeeeeeeeee--ee--ee-ee--ee-eee--eee
597 | >C6EVG2
598 | ee-eee-eeeeeeeeee--e-eee-ee--eee--e--ee---ee--ee--eeeeeeeeeee
599 | >P42565
600 | ee-e-eeeee-----ee-e-----ee-e------e--------e--------e-------ee-------ee--------e--------e--------e--------e--------e--------e--------e--------e--------e--------e--------e-------------------ee--------e-----------------------e----------------------------------e------------------ee--------eeeeeeee-------e-e--------e-eee---eeee
601 | >Q18502
602 | eeeee-ee--e--eeeeeee--ee--e--eee-eeeeeee-e-eeeeee--e--eeee-----eeeeee-eeeeee--e-eee
603 | >Q1MX22
604 | eeeeeeee-eeeeeee-ee---e--e-e-eeeeeeeee--ee-eeeee-e---ee--e--ee-eee-eee-eee--e--e--ee-e--e--e--eeeee--eee-ee-eeeeee-ee---e--eeeee-eeeeeeeee-eeeeeeeeeeee-ee-e
605 | >A6P3B2
606 | eee-eee-eeee-ee--ee-e-eeeeeeee-ee-eee-ee-eeeee-eeeeee-e-eeeeee-e-e---ee-eeeeeeee------eeeeeeeeeeeee--e--ee-eee--e--eeeeeeee-ee--eeeeee-ee-ee-e-ee--eeeeeee-----eeeeeeeeeeee--e--eeeeeee-----eeeeee--e-eeeeeee--e-eeeeeee----eeeeeeee--e-eeeeeeee--e-eeeeeeee--e--eeeeeee--e-eeeeeeee--e--eeeeeee-----eeeeeee--e--eeeeeee----eeeee--e--eeeeeeee--e--eeeeeeee--e--ee-eeeeeeeee-ee
607 | >Q8AUU1
608 | eee--eeeeeeeeeeeee--eee-ee-eee-eeeeee---e---e-----eeee-ee--ee--e--ee--eeeeeee
609 | >Q8JIM3
610 | e-----eeee-ee-eeeee-ee--eeeee-eeeee-e-ee-ee-eeee--e--e-ee-e-e-eee-eeee--e---e---------eee-eee-eeeee-eee-ee--e------e-e-e-ee-ee--ee-ee-ee-e---e--ee--ee-eeeee-ee--ee-e---ee-eeeeeeee-eee-e---eeeeeeee
611 | >Q76IQ4
612 | eee--eeeeeeeeeeeeeeeee---e--ee--eeee-e-eeeeee---e---e-----eeee-ee--e---e--ee--eeeeeee
613 | >Q9PUR1
614 | eeeeeeeeeee-ee-eeeee--ee-------e--eee--ee--e---eeeeeeeeeeee--e--------e--eee--ee--e--eeeeeeeee-eee--eee-eee--ee--e--ee---ee--e---eee-eeeee
615 | >O12956
616 | eeeee-eeeeee-e-eeeeeeeeeee-eeeee--ee---e---e--eee--ee--e---eeeeeeeee-eeeeeee-eeeeeeee--eeeee-e-eeeeee-eeeeeeeeeeeeeeeeee-eeeeeeee-eee-ee-ee-ee---e-e--ee--e--ee---ee--e--eeee-eeee-eee-e
617 | >Q9GQV7
618 | eeeee-eee-eeeeeeeeeee---ee--eeeeeeeeee-eee-eeeee-e--eeeeeeeeee-eee-eee-eeeeeeee-eeeeeeeee--ee-eeeeeeeeeeee
619 | >G0LWV9
620 | eeeeeeeeeeeeeeeeeeeeeeeeee--ee--ee-ee--ee--e-eee
621 | >Q16992
622 | eeeeeeeeeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-eeeeeee--e-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-eee-ee-ee-ee-ee-ee-ee--eeeeeee-eeeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee--e-ee-ee-ee-ee-ee-ee-ee-eeeeee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee--e-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee-ee--e-ee-ee-ee-ee-ee-ee-e-eeeeeeee-ee-ee-ee-ee-ee-eeee
623 | >H2CYR5
624 | eee---e---e--eee--ee-ee-ee--eee-eee--e--ee-ee
625 | >E4VP50
626 | eeee--e--eee-ee--eeeeeee--eee-ee--ee-eee-e-ee--ee-eee
627 | >Q09982
628 | eeee-e-e--e-e-----eee-eeee-ee-ee---------------------------------------------ee-eee
629 | >P86442
630 | eeeeeee-ee--e--e--ee-ee---e--eee-eeeee-eeeeeee--eeeee-ee--ee--eee
631 | >Q9BH84
632 | eeeeee-ee---e--ee-eee-ee-eee---ee-e----ee-e-e--e-----e-e
633 | >Q9BPA4
634 | eeeee--ee--ee--ee-eee-ee-eeeee-eee-e-----e-eee--------e-e
635 | >P21259
636 | eeee-ee--eee-eeeeeeeeeeeeeeeeeee-eeee--eeeeee-eee-eeeeeee--eee--eeeeeee-eee--eeeeee--eee--eeeeeeeeeee--eeeeeee-eee--eeeeee--eee--eeeeeeeeeee--eeeeeee-eee--eeeeee-eeee--eeeeee-eeee--eeeeeeeeeee--eeeeee-eeee--eeeeeee-eee-eeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeee-eeee
637 | >P82003
638 | eeeeeeeeeeeeeeeeee-eeeeeeeeeee-ee-eeeeee-e-ee-eeeeee-e-eeeeeeeeeee--e-eeeeeeee-ee-eeeeee-e-ee-eeeeeeeeeeeeeeeee-ee-eeeeee-e-ee-eeeeee-e-eeeeeeeeeee--e-eeeeeeee--e-eeeeee-e-ee-ee-eee-ee-e-ee-eeeeee-e-eeeeeeeee-eeeeeeeeeeeeeeeeeeeeeee-ee-eeeeeeeeeeeeeeeeeee
639 | >C5J8E3
640 | eeeeeeeeeeeeeeeee-ee-eee-ee----eeeeeee
641 | >Q2V2G5
642 | eeeee-eeee-e-eee--ee-eee-----eeeeeeeeeee-e----e--e---------e--ee-ee--eeeeeee-------eee
643 | >A0SIF1
644 | eeeeeeee--ee--ee--eee-e-e-e-ee-e-eee-eeeee-e-e--eeeeee-e-e-eeee--eeeeee-ee---e--ee-ee-ee-eee---eee-eee-ee--ee--e-e-e-e-eee-eeee---ee--ee-e-e-e--eee-e-e-eee--e-e-eeeeeeee-eeeeeeeee-e--eeeee-eeeeeeee--eeeeeeeeeeeee
645 | >A0SIX6
646 | eeeeeeee--ee--e----ee---e-e-ee-e-eee-eeeee-e-e--eeeeee-e-eee-eee-eeeeee--e-------e-ee-ee-eee--eeeee-eeee-eeeeee-e---eeee--ee--ee-e---e--eee-e-ee--eee-eee---e--eeeeeeeeeeee--ee-eee-e-eeeeeeeeeee
647 | >A8YPR6
648 | eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eee-eeeeeeeeeee-eeeeeee-eeeeeeeeeee-eeeeeeeeeee-eeeeeeeeeeeeeee-eeeeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee-eeeeeeeeeee-eeeeeeee-ee--eeeeee-ee-eee--eeee-ee-eee----e
649 | >E7EKD4
650 | eeeeeeeeeeeee-e-e-eeee--e---e--ee-eee
651 | >Q86UU9
652 | eeeee-e--ee-e-------ee-e-ee-e-e-ee-eeee-ee------ee-eee-e--eeeee--e-ee---e---ee---eeeeeeeeeeee
653 | >P04560
654 | eeeeee-e-e-ee-e--eeee----eeee-eee-e---e----e------eee-e-eee--ee--ee---e-eee-eee-e-----ee-eeeeee-e---e--e
655 | 


--------------------------------------------------------------------------------
/py/asap/__init__.py:
--------------------------------------------------------------------------------
1 | from .features import FEATURE_KEY_OPTIONS, DEFAULT_FEATURE_KEYS
2 | from .parse import convert_lf_to_fasta
3 | from .window_extraction import META_WINDOW_HEADERS, WindowExtractionParams, extract_windows_from_file, extract_windows_from_seq
4 | from .classification import WindowClassifier, PeptidePredictor, train_window_classifier, get_top_features, get_windows_data
5 | from .sklearn_extensions import FeatureSelectionPipeline


--------------------------------------------------------------------------------
/py/asap/classification.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import logging
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | 
  7 | from sklearn.utils import shuffle
  8 | from sklearn.pipeline import Pipeline
  9 | from sklearn.preprocessing import StandardScaler
 10 | from sklearn.feature_selection import VarianceThreshold, SelectFdr, RFECV
 11 | from sklearn.cross_validation import StratifiedKFold
 12 | from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score
 13 | from sklearn.ensemble import RandomForestClassifier
 14 | from sklearn.svm import SVC
 15 | 
 16 | from . import util
 17 | from . import window_extraction
 18 | from . import sklearn_extensions
 19 | 
 20 | LOGGER = logging.getLogger('ML')
 21 | 
 22 | DEFAULT_CLASSIFIERS = [
 23 |     RandomForestClassifier(n_estimators = 180, n_jobs = -2, class_weight = 'auto'),
 24 |     SVC(kernel = 'rbf', class_weight = 'auto', probability = True, cache_size = 1500),
 25 | ]
 26 | 
 27 | DEFAULT_TRANSFORMER = StandardScaler(copy = False)
 28 | 
 29 | DEFAULT_FEATURE_SELECTOR = sklearn_extensions.FeatureSelectionPipeline([
 30 |     VarianceThreshold(0.01),
 31 |     SelectFdr(alpha = 0.1),
 32 | ])
 33 | 
 34 | RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER = sklearn_extensions.RandomForestClassifierWithCoef(n_estimators = 400, n_jobs = -2, class_weight = 'auto')
 35 | 
 36 | # Use a constant seed
 37 | SEED = 1812
 38 | np.random.seed(SEED)
 39 | 
 40 | # Silence annoying pandas warnings
 41 | pd.options.mode.chained_assignment = None
 42 | 
 43 | class WindowClassifier(object):
 44 | 
 45 |     '''
 46 |     Classifies windows extracted with their features.
 47 |     '''
 48 | 
 49 |     def __init__(self, raw_classifier, used_features, transformer = DEFAULT_TRANSFORMER):
 50 |         '''
 51 |         @param raw_classifier (sklearn classifier): A raw classifier trained against a dataset of windows.
 52 |         @param used_features (list of strings): The names of the features used while training the raw classifier (after feature selection
 53 |         has been applied).
 54 |         @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler): The exact same transformer used
 55 |         when training the raw classifier (providing any other transformer is expected to give very poor results).
 56 |         '''
 57 |         self.raw_classifier = raw_classifier
 58 |         self.used_features = used_features
 59 |         self.transformer = transformer
 60 | 
 61 |     def classify_windows(self, windows_data_frame, proba = False):
 62 | 
 63 |         '''
 64 |         Classifies windows extracted with their features, given in a CSV format. Obviously, these windows don't need to have annotations/labels.
 65 |         Even if labels are given, they will be ignored.
 66 |         @param windows_data_frame (pandas.DataFrame):
 67 |             A data frame of the windows' CSV.
 68 |         @param proba (default False):
 69 |             Whether to return predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s).
 70 |         @return:
 71 |             A numpy array of the predicted labels for the given windows. The length of the returned array will correspond to the number of
 72 |             windows in the given data frame.
 73 |         '''
 74 | 
 75 |         if len(windows_data_frame) == 0:
 76 |             return np.empty(shape = 0)
 77 |         else:
 78 | 
 79 |             X = windows_data_frame[self.used_features].values
 80 |             X = self._transform(X)
 81 | 
 82 |             if proba:
 83 |                 return self.raw_classifier.predict_proba(X)[:,1]
 84 |             else:
 85 |                 return self.raw_classifier.predict(X)
 86 | 
 87 |     def test_performance(self, windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, scoring_method = f1_score):
 88 |         '''
 89 |         Tests the performance of this trained classifier, that was originally trained on a certain dataset, on a new dataset. The given dataset
 90 |         should be windows extracted with their features and annotations, given in a CSV format.
 91 |         The documentation of this method is partial and lacks some important details, as it's very similar to train_window_classifier, which
 92 |         already has a detailed documentation. Therefore, make sure to read the documentation of the other method in order to understand the full
 93 |         meaning of all the parameters.
 94 |         @param windows_data_frame (pandas.DataFrame):
 95 |             A data frame of the windows' CSV.
 96 |         @param drop_only_almost_positives (boolean, default False):
 97 |             Whether to drop only almost positive windows in the dataset before evaluating the performance of this classifier against it.
 98 |         @param drop_duplicates (boolean, default True):
 99 |             Whether to drop duplicating windows in the dataset, based on their neighbourhood property, before evaluating the performance
100 |             of this classifier against it.
101 |         @param scoring_method (function, default sklearn.metrics.f1_score):
102 |             A scoring method to evaluate the classifiers by, just like in train_window_classifier.
103 |         @return:
104 |             A tuple of scores measuring the performance of this classifier against the given dataset in the format (score, roc, sensitivity,
105 |             precision, specificity, cm), just like in train_window_classifier.
106 |         '''
107 |         LOGGER.info('Testing ' + str(type(self.raw_classifier)))
108 |         features, X, y = get_windows_data(windows_data_frame, drop_only_almost_positives, drop_duplicates, self.transformer, \
109 |                 features = self.used_features)
110 |         LOGGER.info('Predicting %d records...' % len(X))
111 |         y_pred = self.raw_classifier.predict(X)
112 |         return _get_prediction_scores(y, y_pred, scoring_method)
113 | 
114 |     def _transform(self, X):
115 |         if self.transformer is None:
116 |             return X
117 |         else:
118 |             return self.transformer.fit_transform(X)
119 | 
120 | class PeptidePredictor(object):
121 | 
122 |     '''
123 |     Uses a trained window classifier to predicts annotations for new peptide.
124 |     '''
125 | 
126 |     def __init__(self, window_classifier, window_extraction_params = window_extraction.WindowExtractionParams()):
127 |         '''
128 |         @param window_classifier (WindowClassifier):
129 |             The trained window classifier to use.
130 |         @param window_extraction_params (WindowExtractionParams, default params by default):
131 |             The exact same parameters that have been used to extract the windows on which the window classifier has been trained (providing
132 |             any other set of parameters is expected to result very unpleasant errors).
133 |         '''
134 |         self.window_classifier = window_classifier
135 |         self.window_extraction_params = window_extraction_params
136 | 
137 |     def predict_annotations(self, seq, extra_tracks_data = {}, proba = False):
138 | 
139 |         '''
140 |         Predicts the annotations of a peptide.
141 |         @param seq (string):
142 |             The amino-acid sequence of the peptide to predict the annotations for, given in a 20 amino-acid alphabet.
143 |         @param extra_tracks_data (dict, empty by default):
144 |             A dictionary for providing extra tracks of the given peptide. Must receive the data for all the tracks that have been used to
145 |             extract the windows for training this classifier. Specifically, if this predictor relies on a feature that relies on a certain
146 |             track, then this track must be provided here. The given dictionary should map from track names to their sequence.
147 |         @param proba (default False):
148 |             Whether to return predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s).
149 |         @return:
150 |             If proba = False, will return a binary string (of 0's and 1's) representing the predicted annotations for the given peptide. If
151 |             proba = True, will return a list of floats (between 0 to 1), representing the predicted probabilities. Either way, the length of
152 |             the returned string/list will correspond to the length of the provided peptide sequence.
153 |         '''
154 | 
155 |         length = len(seq)
156 |         windows_csv = window_extraction.extract_windows_from_seq(seq, extra_tracks_data = extra_tracks_data, \
157 |                 window_extraction_params = self.window_extraction_params)
158 |         windows_data_frame = pd.read_csv(windows_csv)
159 |         window_indices = windows_data_frame['window_hot_index'].values
160 | 
161 |         labels = self.window_classifier.classify_windows(windows_data_frame, proba = proba)
162 |         annotation_mask = [0] * length
163 | 
164 |         for window_index, label in zip(window_indices, labels):
165 |             if window_index >= 0 and window_index < length:
166 |                 annotation_mask[window_index] = label
167 | 
168 |         if proba:
169 |             return map(float, annotation_mask)
170 |         else:
171 |             return ''.join(map(str, annotation_mask))
172 | 
173 | def train_window_classifier(windows_data_frame, classifiers = DEFAULT_CLASSIFIERS, drop_only_almost_positives = False, \
174 |         drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, feature_selector = DEFAULT_FEATURE_SELECTOR, n_folds = 5, \
175 |         scoring_method = f1_score, select_best = True):
176 | 
177 |     '''
178 |     Trains a window classifier using a CSV of windows with extracted features and annotations/labels (obtained by either
179 |     window_extraction.extract_windows_from_file with extract_annotations = True or window_extraction.extract_windows_from_seq with a given
180 |     annotation_mask). The evaluation of the classifiers will be based on the kfold procedure, during which various metrics will be calculated.
181 |     The final training of the classifier will be based on the entire data set.
182 |     @param windows_data_frame (pandas.DataFrame):
183 |         A data frame of the windows' CSV.
184 |     @param classifiers (list of sklearn classifiers, default Gaussian-kernel SVM and random forest):
185 |          A list of classifiers to try training independently, from which the best classifier can be chosen.
186 |     @param drop_only_almost_positives (boolean, default False):
187 |         Whether to drop "only almost positive" windows in the dataset. An only almost positive window is a window with a false label in its
188 |         hot index, but with a true label in either of the flanking indices. In some learning scenarios, the labeling of the residues (i.e.
189 |         annotations) isn't so important in a strict manner, and it only matters whether larger regions contain a positive label. It's especially
190 |         important in cases that the actual used dataset is only accurate up to +/-1 shifts of the labels. In such scenarios, using this parameter
191 |         might enhance performance.
192 |     @param drop_duplicates (boolean, default True):
193 |         Whether to drop duplicating windows in the dataset, based on their neighbourhood property.
194 |     @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler):
195 |         A preprocessing transformer to use for the data before starting the kfold evaluation and final training of the classifiers. If None, will
196 |         not perform any preprocessing  transformation.
197 |     @param feature_selector (sklearn feature selector, optional, default a pipeline of VarianceThreshold and SelectFdr):
198 |         A feature selection procedure to apply during both the kfold evaluation and final training of each classifier. If None, will not perform
199 |         feature selection (i.e. will use all features). Note that the given feature selector must implement the get_support method (hence sklearn's
200 |         builtin Pipeline object cannot be used; if you want to pipeline then use FeatureSelectionPipeline of this project).
201 |     @param n_folds (int, default 5):
202 |         The number of folds to use during the kfold evaluation procedure.
203 |     @param scoring_method (function, default sklearn.metrics.f1_score):
204 |         A scoring method to evaluate the classifiers by. Expecting a method that receives two parameters (y_true and y_pred) and returns a float
205 |         score. This score will be calculated for all classifiers, in addition to other metrics. Also, if select_best is set to True, this score
206 |         will be used in order to choose the best classifier.
207 |     @param select_best (boolean, default True):
208 |         Whether to return only the best evaluated classifier or all of them.
209 |     @return:
210 |         For each classifier, will return a tuple of the trained WindowClassifier object and its metrics, as evaluated during the kfold procedure.
211 |         The metrics are also a tuple of floats in the format (score, roc, sensitivity, precision, specificity, cm), where: score is the score
212 |         calculated by scoring_method; roc is Area Under the Curve (AUC); cm stands for the 2X2 confusion matrix of the results. If select_best is
213 |         set to True, will return only the tuple of the best classifier (based on the score). Otherwise, will return a list of tuples for all the
214 |         classifiers, sorted by their score in a descending order.
215 |     '''
216 | 
217 |     features, X, y = get_windows_data(windows_data_frame, drop_only_almost_positives, drop_duplicates, transformer)
218 |     window_classifiers_and_results = []
219 | 
220 |     for classifier in classifiers:
221 |         kfold_results = _get_classifier_kfold_results(classifier, X, y, n_folds, feature_selector, scoring_method)
222 |         window_classifier = _get_trained_window_classifier(classifier, features, X, y, feature_selector, transformer)
223 |         window_classifiers_and_results += [(window_classifier, kfold_results)]
224 | 
225 |     window_classifiers_and_results.sort(key = lambda window_classifier_and_results: window_classifier_and_results[1], reverse = True)
226 | 
227 |     if select_best:
228 |         best_classifier, best_results = window_classifiers_and_results[0]
229 |         LOGGER.info('The best classifier is %s with score %f.' % (str(type(best_classifier.raw_classifier)), best_results[0]))
230 |         return best_classifier, best_results
231 |     else:
232 |         return window_classifiers_and_results
233 | 
234 | def get_top_features(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, \
235 |         classifier = RFECV_FEATURE_SELECTION_DEFAULT_CLASSIFIER, n_folds = 3, step = 0.05, scoring = 'f1'):
236 | 
237 |     '''
238 |     Using sklearn.feature_selection.RFECV model in order to find the top features of given windows with features, given in a CSV format.
239 |     @param windows_data_frame (pandas.DataFrame):
240 |         A data frame of the windows' CSV.
241 |     @param drop_only_almost_positives (boolean, default False):
242 |         Same as in train_window_classifier.
243 |     @param drop_duplicates (boolean, default True):
244 |         Whether to drop duplicating windows in the dataset, based on their neighbourhood property, prior to RFECV.
245 |     @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler):
246 |         A preprocessing transformer to use for the data before applying RFECV. If None, will not perform any preprocessing transformation.
247 |     @param classifier (sklearn classifier, default a special version of random forest suitable for RFECV):
248 |         The classifier to use as the estimator of RFECV.
249 |     @param n_folds (int, default 2):
250 |         The n_folds to use in the kfold cross-validation as part of the RFECV process.
251 |     @param step (default 0.05):
252 |         See sklearn.feature_selection.RFECV
253 |     @param scoring (default 'f1'):
254 |         See sklearn.feature_selection.RFECV
255 |     @return:
256 |         A list of the top features, each represented as a string.
257 |     '''
258 | 
259 |     features, X, y = get_windows_data(windows_data_frame, drop_only_almost_positives, drop_duplicates, transformer)
260 |     kfold = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED)
261 |     rfecv = RFECV(estimator = classifier, cv = kfold, step = step, scoring = scoring)
262 |     rfecv.fit(X, y)
263 |     return util.apply_mask(features, rfecv.support_)
264 | 
265 | def get_windows_data(windows_data_frame, drop_only_almost_positives = False, drop_duplicates = True, transformer = DEFAULT_TRANSFORMER, \
266 |         features = None):
267 | 
268 |     '''
269 |     Extracts numeric vectorial data in numpy format, suitable for applying standard sklearn models on, from a CSV of windows with features.
270 |     @param windows_data_frame (pandas.DataFrame):
271 |         A data frame of the windows' CSV.
272 |     @param drop_only_almost_positives (boolean, default False):
273 |         Same as in train_window_classifier.
274 |     @param drop_duplicates (boolean, default True):
275 |         Whether to drop duplicating windows in the dataset, based on their neighbourhood property.
276 |     @param transformer (sklearn transformer, optional, default sklearn.preprocessing.StandardScaler):
277 |         A transformer to apply on the data (X). If None, will not perform any preprocessing transformation.
278 |     @param features (list of strings, optional):
279 |         The names of the features to extract from each window. If None, will extract all the features that appear in the given CSV.
280 |     @return:
281 |         A tuple comprised of:
282 |         1. features - A list of strings corresponding to the names of the features extracted from the data.
283 |         2. X - A numpy matrix of the extracted data points. Each row in the matrix represents a window, and each column a feature.
284 |         3. y - A numpy array of binary integer values (0s and 1s), corresponding to the label of the extracted data points (windows). The
285 |         length of y is equal to the number of rows in X.
286 |     '''
287 | 
288 |     LOGGER.info('Given a data frame of %d records X %d columns.' % windows_data_frame.shape)
289 | 
290 |     if drop_only_almost_positives:
291 |         windows_data_frame = windows_data_frame[windows_data_frame['window_only_almost_positive'] == 0]
292 |         LOGGER.info('Dropped only almost positives. %d records remained.' % len(windows_data_frame))
293 | 
294 |     if drop_duplicates:
295 |         # When we remove duplicates, we want to give priority to positives
296 |         windows_data_frame.sort(columns = 'window_label', ascending = False, inplace = True)
297 |         windows_data_frame.drop_duplicates(subset = 'window_neighbourhood', inplace = True)
298 |         LOGGER.info('Dropped duplicates. %d records remained.' % len(windows_data_frame))
299 | 
300 |     if features is None:
301 |         features = [header for header in windows_data_frame.columns if header not in window_extraction.META_WINDOW_HEADERS]
302 | 
303 |     LOGGER.info('%d features to process.' % len(features))
304 |     X = windows_data_frame[features].values
305 |     y = windows_data_frame['window_label'].values
306 |     X, y = shuffle(X, y, random_state = SEED)
307 | 
308 |     if transformer is not None:
309 |         X = transformer.fit_transform(X)
310 |         LOGGER.info('Transformed the data.')
311 | 
312 |     LOGGER.info('Final data: samples = %d, features = %d' % X.shape)
313 |     return features, X, y
314 | 
315 | def _get_classifier_kfold_results(classifier, X, y, n_folds, feature_selector, scoring_method):
316 | 
317 |     LOGGER.info('Estimating ' + str(type(classifier)))
318 |     time_before = datetime.datetime.now()
319 | 
320 |     y_pred = _predict_using_kfold(X, y, classifier, n_folds, feature_selector)
321 |     score, roc, sensitivity, precision, specificity, cm = _get_prediction_scores(y, y_pred, scoring_method)
322 | 
323 |     time_diff = datetime.datetime.now() - time_before
324 |     LOGGER.info('Finished estimating. Took %d seconds' % int(time_diff.total_seconds()))
325 | 
326 |     LOGGER.info('score = %f, roc = %f, sensitivity = %f, precision = %f, specificity = %f' % (score, roc, sensitivity, precision, specificity))
327 |     LOGGER.info('Confusion matrix:' + '\n' + str(cm))
328 |     return score, roc, sensitivity, precision, specificity, cm
329 | 
330 | def _get_trained_window_classifier(classifier, features, X, y, feature_selector, transformer):
331 |     LOGGER.info('Training ' + str(type(classifier)))
332 |     X_reduced = feature_selector.fit_transform(X, y)
333 |     used_features = util.apply_mask(features, feature_selector.get_support())
334 |     classifier.fit(X_reduced, y)
335 |     return WindowClassifier(classifier, used_features, transformer)
336 | 
337 | def _predict_using_kfold(X, y, classifier, n_folds, feature_selector):
338 | 
339 |     kfold = StratifiedKFold(y, n_folds = n_folds, shuffle = True, random_state = SEED)
340 |     y_pred = np.zeros(len(y))
341 | 
342 |     for i, fold in enumerate(kfold):
343 | 
344 |         LOGGER.info('Running fold %d/%d...' % (i + 1, n_folds))
345 | 
346 |         train_indices, test_indices = fold
347 |         X_train = X[train_indices]
348 |         X_test = X[test_indices]
349 |         y_train = y[train_indices]
350 | 
351 |         if feature_selector is not None:
352 |             feature_selector.fit(X_train, y_train)
353 |             X_train = feature_selector.transform(X_train)
354 |             X_test = feature_selector.transform(X_test)
355 |             LOGGER.info('Selected features. Remained with %d features.' % X_train.shape[1])
356 | 
357 |         classifier.fit(X_train, y_train)
358 |         y_pred[test_indices] = classifier.predict(X_test)
359 | 
360 |     return y_pred
361 | 
362 | def _get_prediction_scores(y_true, y_pred, scoring_method):
363 | 
364 |     cm = confusion_matrix(y_true, y_pred, labels = [0, 1])
365 |     roc = roc_auc_score(y_true, y_pred)
366 |     score = scoring_method(y_true, y_pred)
367 | 
368 |     tn = float(cm[0][0])
369 |     tp = float(cm[1][1])
370 |     fp = float(cm[0][1])
371 |     fn = float(cm[1][0])
372 |     n = tn + fp
373 |     p = tp + fn
374 | 
375 |     sensitivity = tp / p
376 |     specificity = tn / n
377 |     precision = tp / (tp + fp)
378 | 
379 |     return score, roc, sensitivity, precision, specificity, cm
380 | 


--------------------------------------------------------------------------------
/py/asap/config.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | AMINO_ACIDS = '_ACDEFGHIKLMNPQRSTVWY'
 4 | REDUCED_AMINO_ACIDS = 'ACDGFIHKMNQPSRW'
 5 | POSITIVE_AMINO_ACIDS = 'KR'
 6 | NEGATIVE_AMINO_ACIDS = 'DE'
 7 | 
 8 | SS_OPTIONS = '_HCE' # 3 state 2D structure
 9 | ACC_OPTIONS = '_-e' # Binary
10 | DISORDER_OPTIONS = '.-^'
11 | PSSM_AMINO_ACIDS = 'ARNDCQEGHILKMFPSTWYV' # The order is super-important here!
12 | 
13 | # Init logger
14 | LOG_FORMAT = '%(asctime)s [%(name)s:%(levelname)s] %(message)s'
15 | logging.basicConfig(format = LOG_FORMAT, level = 'INFO')
16 | 


--------------------------------------------------------------------------------
/py/asap/data.py:
--------------------------------------------------------------------------------
  1 | from Bio.Seq import Seq
  2 | from Bio.SeqRecord import SeqRecord
  3 | from Bio.Alphabet import IUPAC
  4 | 
  5 | from . import features
  6 | 
  7 | class SequenceTrack(object):
  8 | 
  9 |     def __init__(self, name, seq, padding_value = '_'):
 10 |         '''
 11 |         @param name:
 12 |             The name of the track (string).
 13 |         @param seq:
 14 |             The actual sequence (string).
 15 |         '''
 16 |         self.name = name
 17 |         self.seq = seq
 18 |         self.padding_value = padding_value
 19 | 
 20 |     def length(self):
 21 |         return len(self.seq)
 22 | 
 23 |     def get_subsequence(self, start, length):
 24 |         return SequenceTrack(self.name, self.seq[start:(start + length)])
 25 | 
 26 |     def pad(self, prefix_length, suffix_length):
 27 |         prefix = prefix_length * self.padding_value
 28 |         suffix = suffix_length * self.padding_value
 29 |         self.seq = prefix + self.seq + suffix
 30 | 
 31 |     def __repr__(self):
 32 |         return '%s: %s' % (self.name, self.seq)
 33 | 
 34 | class SequenceTracks(object):
 35 | 
 36 |     def __init__(self):
 37 |         self.tracks = {}
 38 | 
 39 |     def add_track(self, sequence_track):
 40 |         self.tracks[sequence_track.name] = sequence_track
 41 | 
 42 |     def get_track(self, name):
 43 |         return self.tracks[name]
 44 | 
 45 |     def get_subsequence(self, start, length):
 46 | 
 47 |         subsequence = SequenceTracks()
 48 | 
 49 |         for track in self.tracks.values():
 50 |             subsequence.add_track(track.get_subsequence(start, length))
 51 | 
 52 |         return subsequence
 53 | 
 54 |     def pad(self, prefix_length, suffix_length):
 55 |         for track in self.tracks.values():
 56 |             track.pad(prefix_length, suffix_length)
 57 | 
 58 |     def length(self):
 59 | 
 60 |         used_track_name = None
 61 |         length = None
 62 | 
 63 |         for track in self.tracks.values():
 64 |             if length is None:
 65 |                 used_track_name = track.name
 66 |                 length = track.length()
 67 |             elif length != track.length():
 68 |                 raise Exception('Track lengths don\'t match (%s: %d, %s: %d)' % (used_track_name, length, track.name, track.length()))
 69 | 
 70 |         return length
 71 | 
 72 | class DataRecord(object):
 73 | 
 74 |     def __init__(self, sequence_tracks):
 75 |         '''
 76 |         @param sequence_tracks (SequenceTracks):
 77 |             All the sequence tracks used for this data record.
 78 |         '''
 79 |         self.sequence_tracks = sequence_tracks
 80 |         self.padding_prefix_length = 0
 81 |         self.padding_suffix_length = 0
 82 |         
 83 |     def length(self):
 84 |         return self.sequence_tracks.length()
 85 | 
 86 |     def pad(self, prefix_length, suffix_length):
 87 |         self.padding_prefix_length += prefix_length
 88 |         self.padding_suffix_length += suffix_length
 89 |         self.sequence_tracks.pad(prefix_length, suffix_length)
 90 | 
 91 |     def get_track_seq(self, name):
 92 |         return self.sequence_tracks.get_track(name).seq
 93 | 
 94 |     def get_aa_seq(self):
 95 |         return self.get_track_seq('aa')
 96 | 
 97 |     def get_annotation_mask(self):
 98 |         return self.get_track_seq('annotation')
 99 | 
100 |     def get_available_tracks(self):
101 |         return self.sequence_tracks.tracks.keys()
102 | 
103 |     def has_annotation_mask(self):
104 |         return 'annotation' in self.get_available_tracks()
105 | 
106 | class FullDataRecord(DataRecord):
107 | 
108 |     def __init__(self, id, name, description, sequence_tracks):
109 |         '''
110 |         @see DataRecord
111 |         @param id (string):
112 |             The record's ID from FASTA
113 |         @param name (string):
114 |             The record's name from FASTA
115 |         @param description (string):
116 |             The record's description from FASTA
117 |         '''
118 |         DataRecord.__init__(self, sequence_tracks)
119 |         self.id = id
120 |         self.name = name
121 |         self.description = description
122 | 
123 |     def to_fasta_record(self):
124 |         return SeqRecord(Seq(self.get_aa_seq(), IUPAC.protein),
125 |                          id = self.id,
126 |                          name = self.name,
127 |                          description = self.description)
128 | 
129 |     def get_windows(self, window_size):
130 |         for i in range(self.length() - window_size + 1):
131 |             yield Window(self, i, i - self.padding_prefix_length, self.sequence_tracks.get_subsequence(i, window_size))
132 | 
133 |     def __repr__(self):
134 |         return 'Record %s' % self.id
135 | 
136 | class Window(DataRecord):
137 | 
138 |     def __init__(self, full_record, offset, original_index, sequence_tracks):
139 |         DataRecord.__init__(self, sequence_tracks)
140 |         self.full_record = full_record
141 |         self.offset = offset
142 |         self.original_index = original_index
143 | 
144 |     def get_left_context_track_seq(self, name):
145 |         return self.full_record.get_track_seq(name)[:self.offset]
146 | 
147 |     def get_right_context_track_seq(self, name):
148 |         return self.full_record.get_track_seq(name)[(self.offset + self.length()):]
149 | 
150 |     def get_neighbourhood(self, hot_index, neighbourhood_prefix, neighbourhood_suffix):
151 |         return self.get_aa_seq()[(hot_index - neighbourhood_prefix):(hot_index + neighbourhood_suffix + 1)]
152 | 
153 |     def get_label(self, hot_index):
154 |         return self.get_annotation_mask()[hot_index] == '1'
155 | 
156 |     def is_only_almost_positive(self, hot_index):
157 |         '''
158 |         @return:
159 |             whether the hot index is negative, but one of the flanking indices is positive. If so, this window shouldn't
160 |             be considered during the learning process (we treat it neither positive nor negative), assuming that default
161 |             configuration is used.
162 |         '''
163 |         mask = self.get_annotation_mask()
164 |         return mask[hot_index] == '0' and (mask[hot_index - 1] == '1' or mask[hot_index + 1] == '1')
165 | 
166 |     def get_features(self, hot_index, feature_keys = features.DEFAULT_FEATURE_KEYS):
167 |         return features.get_features(self, hot_index, feature_keys)
168 | 
169 |     def __repr__(self):
170 |         return 'Window %d of %s' % (self.offset, self.full_record.id)
171 | 


--------------------------------------------------------------------------------
/py/asap/features_deps/AAScales.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | AA Propensity Scales.
  3 | 
  4 | TODO: (Add/Note "combined" metrics: Georgiev scales. Kidera factors.. )
  5 | Some from BioPython.
  6 | (BioPython stored them as dictionaries, e.g: Bio.SeqUtils.ProtParamData.kd).
  7 | 
  8 | May need to be SCALED to 0-1 range ?!?
  9 | 
 10 | Data initially acquired from BioPython:
 11 | https://github.com/biopython/biopython/blob/master/Bio/SeqUtils/ProtParamData.py
 12 |     Bio.SeqUtils.ProtParamData
 13 | 
 14 |     Some more descriptors:
 15 |     https://github.com/ddofer/Protein-Descriptors/blob/master/src/csdsML/Descriptors.py
 16 | '''
 17 | 
 18 | import numpy as np
 19 | 
 20 | # Kyte & Doolittle {kd} index of hydrophobicity
 21 | hp = {'A': 1.8, 'R':-4.5, 'N':-3.5, 'D':-3.5, 'C': 2.5,
 22 |       'Q':-3.5, 'E':-3.5, 'G':-0.4, 'H':-3.2, 'I': 4.5,
 23 |       'L': 3.8, 'K':-3.9, 'M': 1.9, 'F': 2.8, 'P':-1.6,
 24 |       'S':-0.8, 'T':-0.7, 'W':-0.9, 'Y':-1.3, 'V': 4.2 }
 25 | 
 26 | # Flexibility
 27 | # Normalized flexibility parameters (B-values), average (Vihinen et al., 1994)
 28 | Flex= {'A': 0.984, 'C': 0.906, 'E': 1.094, 'D': 1.068,
 29 |        'G': 1.031, 'F': 0.915, 'I': 0.927, 'H': 0.950,
 30 |        'K': 1.102, 'M': 0.952, 'L': 0.935, 'N': 1.048,
 31 |        'Q': 1.037, 'P': 1.049, 'S': 1.046, 'R': 1.008,
 32 |        'T': 0.997, 'W': 0.904, 'V': 0.931, 'Y': 0.929}
 33 | 
 34 | # Hydrophilicity
 35 | # 1 Hopp & Wood
 36 | # Proc. Natl. Acad. Sci. U.S.A. 78:3824-3828(1981).
 37 | hw = {'A':-0.5, 'R': 3.0, 'N': 0.2, 'D': 3.0, 'C':-1.0,
 38 |       'Q': 0.2, 'E': 3.0, 'G': 0.0, 'H':-0.5, 'I':-1.8,
 39 |       'L':-1.8, 'K': 3.0, 'M':-1.3, 'F':-2.5, 'P': 0.0,
 40 |       'S': 0.3, 'T':-0.4, 'W':-3.4, 'Y':-2.3, 'V':-1.5 }
 41 | 
 42 | # Surface accessibility {"em"}
 43 | # 1 Emini Surface fractional probability
 44 | sa = {'A': 0.815, 'R': 1.475, 'N': 1.296, 'D': 1.283, 'C': 0.394,
 45 |       'Q': 1.348, 'E': 1.445, 'G': 0.714, 'H': 1.180, 'I': 0.603,
 46 |       'L': 0.603, 'K': 1.545, 'M': 0.714, 'F': 0.695, 'P': 1.236,
 47 |       'S': 1.115, 'T': 1.184, 'W': 0.808, 'Y': 1.089, 'V': 0.606 }
 48 | 
 49 | # 2 Janin Interior to surface transfer energy scale
 50 | ja = {'A': 0.28, 'R':-1.14, 'N':-0.55, 'D':-0.52, 'C': 0.97,
 51 |       'Q':-0.69, 'E':-1.01, 'G': 0.43, 'H':-0.31, 'I': 0.60,
 52 |       'L': 0.60, 'K':-1.62, 'M': 0.43, 'F': 0.46, 'P':-0.42,
 53 |       'S':-0.19, 'T':-0.32, 'W': 0.29, 'Y':-0.15, 'V': 0.60 }
 54 | 
 55 | # Disorder Propensity scale
 56 | #"TOP-IDP-Scale: A New Amino Acid Scale Measuring Propensity for Intrinsic Disorder"
 57 | #Campen, Uversky, Dunker et al. Protein Pept Lett. 2008
 58 | 
 59 | # Positive  values indicate protein (or windows) are likely to be ordered, etc' /
 60 | TOP_IDP ={'A':0.06, 'R':0.180, 'N':0.007, 'D':0.192, 'C': 0.02,
 61 |       'Q':0.318, 'E':0.736, 'G': 0.166, 'H':0.303, 'I': -0.486,
 62 |       'L': -0.326, 'K':0.586, 'M': -0.397, 'F': -0.697, 'P':0.987,
 63 |       'S':0.341, 'T':0.059, 'W': -0.884, 'Y':-0.510, 'V': -0.121 }
 64 | 
 65 | # https://github.com/ddofer/Protein-Descriptors/blob/master/src/csdsML/Descriptors.py
 66 | polarizability= {'A':0.046,'R':0.291,'N':0.134,'D':0.105,'C': 0.128,'Q':0.180,
 67 |                                          'E':0.151,'G':0.000,'H':0.230,'I':0.186,'L':0.186,'K':0.219,'M':0.221,
 68 |                                           'F':0.290,'P':0.131,'S':0.062,'T':0.108,'W':0.409,'Y':0.298,'V':0.140}
 69 | 
 70 | ASAInTripeptide = {'A':115,'R':225,'N':160,'D':150,'C':135,'Q':180,
 71 |                                          'E':190,'G':75,'H':195,'I':175,'L':170,'K':200,'M':185,
 72 |                                           'F':210,'P':145,'S':115,'T':140,'W':255,'Y':230,'V':155}
 73 | Volume = {'A':52.6,'R':109.1,'N':75.7,'D':68.4,'C':68.3,'Q':89.7,
 74 |                                          'E':84.7,'G':36.3,'H':91.9,'I':102.0,'L':102.0,'K':105.1,'M':97.7,
 75 |                                           'F':113.9,'P':73.6,'S':54.9,'T':71.2,'W':135.4,'Y':116.2,'V':85.1}
 76 | 
 77 | StericParam = {'A':0.52,'R':0.68,'N':0.76,'D':0.76,'C':0.62,'Q':0.68,
 78 |                                          'E':0.68,'G':0.00,'H':0.70,'I':1.02,'L':0.98,'K':0.68,'M':0.78,
 79 |                                           'F':0.70,'P':0.36,'S':0.53,'T':0.50,'W':0.70,'Y':0.70,'V':0.76}
 80 | Mutability = {'A':100,'R':65,'N':134,'D':106,'C':20,'Q':93,
 81 |                                          'E':102,'G':49,'H':66,'I':96,'L':40,'K':56,'M':94,
 82 |                                           'F':41,'P':56,'S':120,'T':97,'W':18,'Y':41,'V':74}
 83 | 
 84 | # Hydrophobicity_kd_TMD = Bio.SeqUtils.ProtParamData.kd
 85 | # Hydrophilicity = Bio.SeqUtils.ProtParamData.hw
 86 | # Surface_access  = Bio.SeqUtils.ProtParamData.em
 87 | # Ja_transfer_energy = Bio.SeqUtils.ProtParamData.ja
 88 | # flexibility = Bio.SeqUtils.ProtParamData.Flex
 89 | 
 90 | 
 91 | 'GeorgievScales:'
 92 | #Acquired from georgiev's paper of AAscales using helper script "GetTextData.py". + RegEx cleaning
 93 | gg_1 = {'Q': -2.54, 'L': 2.72, 'T': -0.65, 'C': 2.66, 'I': 3.1, 'G': 0.15, 'V': 2.64, 'K': -3.89, 'M': 1.89, 'F': 3.12, 'N': -2.02, 'R': -2.8, 'H': -0.39, 'E': -3.08, 'W': 1.89, 'A': 0.57, 'D': -2.46, 'Y': 0.79, 'S': -1.1, 'P': -0.58}
 94 | gg_2 = {'Q': 1.82, 'L': 1.88, 'T': -1.6, 'C': -1.52, 'I': 0.37, 'G': -3.49, 'V': 0.03, 'K': 1.47, 'M': 3.88, 'F': 0.68, 'N': -1.92, 'R': 0.31, 'H': 1, 'E': 3.45, 'W': -0.09, 'A': 3.37, 'D': -0.66, 'Y': -2.62, 'S': -2.05, 'P': -4.33}
 95 | gg_3 = {'Q': -0.82, 'L': 1.92, 'T': -1.39, 'C': -3.29, 'I': 0.26, 'G': -2.97, 'V': -0.67, 'K': 1.95, 'M': -1.57, 'F': 2.4, 'N': 0.04, 'R': 2.84, 'H': -0.63, 'E': 0.05, 'W': 4.21, 'A': -3.66, 'D': -0.57, 'Y': 4.11, 'S': -2.19, 'P': -0.02}
 96 | gg_4 = {'Q': -1.85, 'L': 5.33, 'T': 0.63, 'C': -3.77, 'I': 1.04, 'G': 2.06, 'V': 2.34, 'K': 1.17, 'M': -3.58, 'F': -0.35, 'N': -0.65, 'R': 0.25, 'H': -3.49, 'E': 0.62, 'W': -2.77, 'A': 2.34, 'D': 0.14, 'Y': -0.63, 'S': 1.36, 'P': -0.21}
 97 | gg_5 = {'Q': 0.09, 'L': 0.08, 'T': 1.35, 'C': 2.96, 'I': -0.05, 'G': 0.7, 'V': 0.64, 'K': 0.53, 'M': -2.55, 'F': -0.88, 'N': 1.61, 'R': 0.2, 'H': 0.05, 'E': -0.49, 'W': 0.72, 'A': -1.07, 'D': 0.75, 'Y': 1.89, 'S': 1.78, 'P': -8.31}
 98 | gg_6 = {'Q': 0.6, 'L': 0.09, 'T': -2.45, 'C': -2.23, 'I': -1.18, 'G': 7.47, 'V': -2.01, 'K': 0.1, 'M': 2.07, 'F': 1.62, 'N': 2.08, 'R': -0.37, 'H': 0.41, 'E': 0, 'W': 0.86, 'A': -0.4, 'D': 0.24, 'Y': -0.53, 'S': -3.36, 'P': -1.82}
 99 | gg_7 = {'Q': 0.25, 'L': 0.27, 'T': -0.65, 'C': 0.44, 'I': -0.21, 'G': 0.41, 'V': -0.33, 'K': 4.01, 'M': 0.84, 'F': -0.15, 'N': 0.4, 'R': 3.81, 'H': 1.61, 'E': -5.66, 'W': -1.07, 'A': 1.23, 'D': -5.15, 'Y': -1.3, 'S': 1.39, 'P': -0.12}
100 | gg_8 = {'Q': 2.11, 'L': -4.06, 'T': 3.43, 'C': -3.49, 'I': 3.45, 'G': 1.62, 'V': 3.93, 'K': -0.01, 'M': 1.85, 'F': -0.41, 'N': -2.47, 'R': 0.98, 'H': -0.6, 'E': -0.11, 'W': -1.66, 'A': -2.32, 'D': -1.17, 'Y': 1.31, 'S': -1.21, 'P': -1.18}
101 | gg_9 = {'Q': -1.92, 'L': 0.43, 'T': 0.34, 'C': 2.22, 'I': 0.86, 'G': -0.47, 'V': -0.21, 'K': -0.26, 'M': -2.05, 'F': 4.2, 'N': -0.07, 'R': 2.43, 'H': 3.55, 'E': 1.49, 'W': -5.87, 'A': -2.01, 'D': 0.73, 'Y': -0.56, 'S': -2.83, 'P': 0}
102 | gg_10 = {'Q': -1.67, 'L': -1.2, 'T': 0.24, 'C': -3.78, 'I': 1.98, 'G': -2.9, 'V': 1.27, 'K': -1.66, 'M': 0.78, 'F': 0.73, 'N': 7.02, 'R': -0.99, 'H': 1.52, 'E': -2.26, 'W': -0.66, 'A': 1.31, 'D': 1.5, 'Y': -0.95, 'S': 0.39, 'P': -0.66}
103 | gg_11 = {'Q': 0.7, 'L': 0.67, 'T': -0.53, 'C': 1.98, 'I': 0.89, 'G': -0.98, 'V': 0.43, 'K': 5.86, 'M': 1.53, 'F': -0.56, 'N': 1.32, 'R': -4.9, 'H': -2.28, 'E': -1.62, 'W': -2.49, 'A': -1.14, 'D': 1.51, 'Y': 1.91, 'S': -2.92, 'P': 0.64}
104 | gg_12 = {'Q': -0.27, 'L': -0.29, 'T': 1.91, 'C': -0.43, 'I': -1.67, 'G': -0.62, 'V': -1.71, 'K': -0.06, 'M': 2.44, 'F': 3.54, 'N': -2.44, 'R': 2.09, 'H': -3.12, 'E': -3.97, 'W': -0.3, 'A': 0.19, 'D': 5.61, 'Y': -1.26, 'S': 1.27, 'P': -0.92}
105 | gg_13 = {'Q': -0.99, 'L': -2.47, 'T': 2.66, 'C': -1.03, 'I': -1.02, 'G': -0.11, 'V': -2.93, 'K': 1.38, 'M': -0.26, 'F': 5.25, 'N': 0.37, 'R': -3.08, 'H': -1.45, 'E': 2.3, 'W': -0.5, 'A': 1.66, 'D': -3.85, 'Y': 1.57, 'S': 2.86, 'P': -0.37}
106 | gg_14 = {'Q': -1.56, 'L': -4.79, 'T': -3.07, 'C': 0.93, 'I': -1.21, 'G': 0.15, 'V': 4.22, 'K': 1.78, 'M': -3.09, 'F': 1.73, 'N': -0.89, 'R': 0.82, 'H': -0.77, 'E': -0.06, 'W': 1.64, 'A': 4.39, 'D': 1.28, 'Y': 0.2, 'S': -1.88, 'P': 0.17}
107 | gg_15 = {'Q': 6.22, 'L': 0.8, 'T': 0.2, 'C': 1.43, 'I': -1.78, 'G': -0.53, 'V': 1.06, 'K': -2.71, 'M': -1.39, 'F': 2.14, 'N': 3.13, 'R': 1.32, 'H': -4.18, 'E': -0.35, 'W': -0.72, 'A': 0.18, 'D': -1.98, 'Y': -0.76, 'S': -2.42, 'P': 0.36}
108 | gg_16 = {'Q': -0.18, 'L': -1.43, 'T': -2.2, 'C': 1.45, 'I': 5.71, 'G': 0.35, 'V': -1.31, 'K': 1.62, 'M': -1.02, 'F': 1.1, 'N': 0.79, 'R': 0.69, 'H': -2.91, 'E': 1.51, 'W': 1.75, 'A': -2.6, 'D': 0.05, 'Y': -5.19, 'S': 1.75, 'P': 0.08}
109 | gg_17 = {'Q': 2.72, 'L': 0.63, 'T': 3.73, 'C': -1.15, 'I': 1.54, 'G': 0.3, 'V': -1.97, 'K': 0.96, 'M': -4.32, 'F': 0.68, 'N': -1.54, 'R': -2.62, 'H': 3.37, 'E': -2.29, 'W': 2.73, 'A': 1.49, 'D': 0.9, 'Y': -2.56, 'S': -2.77, 'P': 0.16}
110 | gg_18 = {'Q': 4.35, 'L': -0.24, 'T': -5.46, 'C': -1.64, 'I': 2.11, 'G': 0.32, 'V': -1.21, 'K': -1.09, 'M': -1.34, 'F': 1.46, 'N': -1.71, 'R': -1.49, 'H': 1.87, 'E': -1.47, 'W': -2.2, 'A': 0.46, 'D': 1.38, 'Y': 2.87, 'S': 3.36, 'P': -0.34}
111 | gg_19 = {'Q': 0.92, 'L': 1.01, 'T': -0.73, 'C': -1.05, 'I': -4.18, 'G': 0.05, 'V': 4.77, 'K': 1.36, 'M': 0.09, 'F': 2.33, 'N': -0.25, 'R': -2.57, 'H': 2.17, 'E': 0.15, 'W': 0.9, 'A': -4.22, 'D': -0.03, 'Y': -3.43, 'S': 2.67, 'P': 0.04}
112 | 
113 | Atch_1 = {'A': 0.591, 'C': 1.343, 'E': 1.357, 'D': 1.05, 'G': 0.384, 'F': 1.006, 'I': 1.239, 'H': 0.336, 'K': 1.831, 'M': 0.663, 'L': 1.019, 'N': 0.945, 'Q': 0.931, 'P': 0.189, 'S': 0.228, 'R': 1.538, 'T': 0.032, 'W': 0.595, 'V': 1.337, 'Y': 0.26}
114 | Atch_2 = {'A': 1.302, 'C': 0.465, 'E': 1.453, 'D': 0.302, 'G': 1.652, 'F': 0.59, 'I': 0.547, 'H': 0.417, 'K': 0.561, 'M': 1.524, 'L': 0.987, 'N': 0.828, 'Q': 0.179, 'P': 2.081, 'S': 1.399, 'R': 0.055, 'T': 0.326, 'W': 0.009, 'V': 0.279, 'Y': 0.83}
115 | Atch_3 = {'A': 0.733, 'C': 0.862, 'E': 1.477, 'D': 3.656, 'G': 1.33, 'F': 1.891, 'I': 2.131, 'H': 1.673, 'K': 0.533, 'M': 2.219, 'L': 1.505, 'N': 1.299, 'Q': 3.005, 'P': 1.628, 'S': 4.76, 'R': 1.502, 'T': 2.213, 'W': 0.672, 'V': 0.544, 'Y': 3.097}
116 | Atch_4 = {'A': 1.57, 'C': 1.02, 'E': 0.113, 'D': 0.259, 'G': 1.045, 'F': 0.397, 'I': 0.393, 'H': 1.474, 'K': 0.277, 'M': 1.005, 'L': 1.266, 'N': 0.169, 'Q': 0.503, 'P': 0.421, 'S': 0.67, 'R': 0.44, 'T': 0.908, 'W': 2.128, 'V': 1.242, 'Y': 0.838}
117 | Atch_5 = {'A': 0.146, 'C': 0.255, 'E': 0.837, 'D': 3.242, 'G': 2.064, 'F': 0.412, 'I': 0.816, 'H': 0.078, 'K': 1.648, 'M': 1.212, 'L': 0.912, 'N': 0.933, 'Q': 1.853, 'P': 1.392, 'S': 2.647, 'R': 2.897, 'T': 1.313, 'W': 0.184, 'V': 1.262, 'Y': 1.512}
118 | 
119 | MinScales_Dict = {'hp':hp, 'hw':hw,
120 |         'sa':sa, 'TOP_IDP':TOP_IDP,
121 |                   'Atch_1':Atch_1,'Atch_2':Atch_2,
122 |                   'Atch_3':Atch_3,'Atch_4':Atch_4,
123 |                   'Atch_5':Atch_5}
124 | #Some scales removed from "full" scales dict, due ot redundnacy, partic if minScales dict is used on subsegments of sequence.
125 | #If MinScales dict is NOT used, then it's HGIHLY recomended to re-add these features!
126 | Scales_Dict = {'hp':hp, 'ja':ja,
127 | 'polarizability':polarizability,'Mutability':Mutability,'Volume':Volume,
128 | 'ASAInTripeptide':ASAInTripeptide,
129 |         'gg_1' : gg_1,'gg_2' : gg_2,'gg_3' : gg_3,'gg_4' : gg_4,'gg_5' : gg_5,
130 | 'gg_6' : gg_6,'gg_7' : gg_7,'gg_8' : gg_8,'gg_9' : gg_9,'gg_10' : gg_10,'gg_11' : gg_11}
131 | #,'gg_12' : gg_12
132 | #,'gg_13' : gg_13,'Atch_1':Atch_1,'Atch_2':Atch_2,'Atch_3':Atch_3,'Atch_4':Atch_4,'Atch_5':Atch_5,
133 | #,'gg_14' : gg_14,'gg_15' : gg_15,
134 | #,'gg_16' : gg_16,'gg_17' : gg_17,'gg_18' : gg_18,'gg_19' : gg_19}
135 | 
136 | # Idea: ICA or PCA of ALL the above scales. Get 7-11 scales from them..
137 | PTMScales_Dict = {
138 | #    'hp': hp,
139 |     'hw': hw,
140 |     'sa': sa,
141 |     'TOP_IDP': TOP_IDP,
142 |     'Atch_1': Atch_1,
143 |     'Atch_2': Atch_2,
144 |     'Atch_3': Atch_3,
145 |     'Atch_4': Atch_4,
146 |     'Atch_5': Atch_5,
147 |     'polarizability': polarizability,
148 |     "ASAInTripeptide": ASAInTripeptide,
149 | }
150 | 
151 | # PTMScales_Avg = {scale: np.median(PTMScales_Dict[scale].values()) for scale in PTMScales_Dict} #ORIG - Py 2.7
152 | PTMScales_Avg = {str(scale): np.median(list(PTMScales_Dict[scale].values())) for scale in PTMScales_Dict}
153 | 
154 | 
155 | ########################################################################################
156 | '''
157 | From PyPro,
158 | Authors: Dongsheng Cao and Yizeng Liang.
159 | :
160 | '''
161 | # def _mean(listvalue):
162 | #     """
163 | #     ########################################################################################
164 | #     The mean value of the list data.
165 | 
166 | #     Usage:
167 | 
168 | #     result=_mean(listvalue)
169 | #     ########################################################################################
170 | #     """
171 | #     return sum(listvalue)/len(listvalue)
172 | # ##############################################################################################
173 | # def _std(listvalue,ddof=1):
174 | #     """
175 | #     ########################################################################################
176 | #     The standard deviation of the list data.
177 | 
178 | #     Usage:
179 | 
180 | #     result=_std(listvalue)
181 | #     ########################################################################################
182 | #     """
183 | #     mean=_mean(listvalue)
184 | #     temp=[math.pow(i-mean,2) for i in listvalue]
185 | #     res=math.sqrt(sum(temp)/(len(listvalue)-ddof))
186 | #     return res
187 | # ##############################################################################################
188 | "TODO: Fix to use proper way of normalizing, AND scaling. (Maybe sci-kit learn's preprocessor?"
189 | def NormalizeAAP(AAP):
190 |     """
191 |     ########################################################################################
192 |     Centralize and normalize amino acid indices (Scales) before calculations.
193 | 
194 |     Usage:
195 | 
196 |     result=NormalizeEachAAP(AAP)
197 | 
198 |     Input: AAP is a dict containing the properties of 20 amino acids.
199 | 
200 |     Output: result is the a dict form containing the normalized properties.
201 |     ########################################################################################
202 |     """
203 |     if len(AAP.values())!=20:
204 |         print ('Some Amino Acids are missing')
205 |     else:
206 |         Result={}
207 |         for i,j in AAP.items():
208 |             Result[i]=(j-_mean(AAP.values()))/_std(AAP.values(),ddof=0)
209 | 
210 |     return Result
211 | ########################################################################################
212 | '''GetAAindex1 Requires the GetAAIndex.py from PyPro: '''
213 |     # def GetAAindex1(self,name,path='.'):
214 |     #     """
215 |     #     Get the amino acid property values from aaindex1
216 | 
217 |     #     Usage:
218 | 
219 |     #     result=GetAAIndex1(name)
220 | 
221 |     #     Input: name is the name of amino acid property (e.g., KRIW790103)
222 | 
223 |     #     Output: result is a dict form containing the properties of 20 amino acids
224 |     #     """
225 | 
226 |     #     return GetAAIndex1(name,path=path)
227 | 
228 | 


--------------------------------------------------------------------------------
/py/asap/features_deps/AAlphabets.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Check to make  alphabets, dicts, strings
  3 |  are persistant  and not recalculated each time this method called!!
  4 | 
  5 | Amino acid groupings from
  6 | 'Reduced amino acid alphabets improve the sensitivity...' by
  7 | Peterson, Kondev, et al.
  8 | http://www.rpgroup.caltech.edu/publications/Peterson2008.pdf
  9 | 
 10 | Other alphabets from
 11 | http://bio.math-inf.uni-greifswald.de/viscose/html/alphabets.html
 12 | 
 13 | """
 14 | 
 15 | 'TODO:'
 16 | 'Add AA Propensities (From BioPython, articles, comp.profiler, etc) - eg, AAindex, http://bioinf.icm.uu.se/kbib/project13/convertAAstoProperties/'
 17 | 
 18 | 
 19 | from collections import defaultdict
 20 | 
 21 | # ambiguous amina acids: [ 'aspartic acid or asparagine', 'leucine or isoleucine',
 22 | #  'glutamic acid[E] or glutamine[Q]'] :
 23 | ambiguous_aa = 'BJZX'
 24 | # special amino acids - 'selenocysteine', 'pyrralysine'
 25 | aa_special_alph = 'UO'
 26 | UNKNOWN_AA = "Z" #'unknown amino acid',
 27 | 
 28 | '''
 29 | ILLEGALS = [c for c in ambiguous_aa+aa_special_alph+'Z']
 30 | '''
 31 | ILLEGALS = ['B', 'J', 'Z', 'X', 'U', 'O', 'Z']
 32 | # print(ILLEGALS)
 33 | 
 34 | def TransDict_from_list(groups):
 35 |     '''
 36 |     Given a list of letter groups, returns a dict mapping each group to a
 37 |     single letter from the group - for use in translation.
 38 |     >>> alex6=["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"]
 39 |     >>> trans_a6 = TransDict_from_list(alex6)
 40 |     >>> print(trans_a6)
 41 |     {'V': 'A', 'W': 'F', 'T': 'D', 'R': 'D', 'S': 'D', 'P': 'P',
 42 |      'Q': 'D', 'Y': 'F', 'F': 'F',
 43 |      'G': 'G', 'D': 'D', 'E': 'D', 'C': 'C', 'A': 'A',
 44 |       'N': 'D', 'L': 'A', 'M': 'A', 'K': 'D', 'H': 'D', 'I': 'A'}
 45 |     '''
 46 |     transDict = dict()
 47 | 
 48 |     result = {}
 49 |     for group in groups:
 50 |         g_members = sorted(group) #Alphabetically sorted list
 51 |         for c in g_members:
 52 |             # print('c' + str(c))
 53 |             # print('g_members[0]' + str(g_members[0]))
 54 |             result[c] = str(g_members[0]) #K:V map, use group's first letter as represent.
 55 |     # print(result)
 56 |     return result
 57 | 
 58 | def translate_sequence (seq, TranslationDict):
 59 |     '''
 60 |     Given (seq) - a string/sequence to translate,
 61 |     Translates into a reduced alphabet, using a translation dict provided
 62 |     by the TransDict_from_list() method.
 63 |     Returns the string/sequence in the new, reduced alphabet.
 64 |     Remember - in Python string are immutable..
 65 | 
 66 |     '''
 67 |     from_list = []
 68 |     to_list = []
 69 |     for k,v in TranslationDict.items():
 70 |         from_list.append(k)
 71 |         to_list.append(v)
 72 |     # TRANS_seq = seq.translate(str.maketrans(zip(from_list,to_list)))
 73 |     TRANS_seq = seq.translate(str.maketrans(TranslationDict))
 74 |     return TRANS_seq
 75 | 
 76 | def Get_Letters (TranslationDict):
 77 |     '''
 78 |     Given a TranslationDict,
 79 |     return, as string,  the letters retained after translation
 80 |     by that dict.
 81 |     '''
 82 |     e = set(TranslationDict.values())
 83 |     res = sorted (e)
 84 |     return ("".join(res))
 85 | 
 86 | 
 87 | AA20 = 'ACDEFGHIKLMNPQRSTVWY'  #"Standard alphabet"
 88 | 
 89 | 'Invented, based roughly on Ofer8, for Dibasic cleavage prediction'
 90 | OferKR = TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "R","K","H", "DE", "STNQ"])
 91 | 
 92 | ofer14=TransDict_from_list(["A", "D", "KR","E", "N", "TS","Q",
 93 |  "YF", "LIVM", "C", "W", "H", "G", "P"])
 94 | ofer13=TransDict_from_list(["A", "DE", "KR", "N", "TS","Q",
 95 |  "YF", "LIVM", "C", "W", "H", "G", "P"])
 96 | 
 97 | "modifed from wang-wang, Clustering of the Protein Design Alphabets by Using Hierarchical SOM "
 98 | ofer_w8 = TransDict_from_list(["FIL", "CY", "MVW", "HAT", "GP", "RK", "QSN", "DE"])
 99 | # ofer14=TransDict_from_list(['LIVM', 'D', 'G', 'A', 'C', 'N', 'H', 'KE','R', 'W', 'P', 'TSQ', 'YF'])
100 | 
101 | # Ofer7=TransDict_from_list(["C", "G", "P", "FYW", "AVILM","KR", "STNQHDE"])
102 | 
103 | 'Look at: http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1594927&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D1594927'
104 | ofer_tail =TransDict_from_list(["FAILV","TS","C","G","P","KR","DE","MWY","NQH"])
105 | 
106 | ofer8=TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "RKH", "DE", "STNQ"])
107 | 
108 | ofer_gbm5 = TransDict_from_list(["ANTSQ", "YFLIVMCWH","DKER" "G", "P"])
109 | gbm4 = TransDict_from_list(["ADKERNTSQ", "YFLIVMCWH", "G", "P"])
110 | sdm12 =TransDict_from_list(
111 |     ["A", "D", "KER", "N",  "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"] )
112 | 
113 | hsdm17 =TransDict_from_list(
114 |   ["A", "D", "KE", "R", "N", "T", "S", "Q", "Y", "F", "LIV",
115 |   "M", "C", "W", "H", "G", "P"])
116 | 
117 | alex6=TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"])
118 | 
119 | shen7 =TransDict_from_list(["AGV","ILFP","YMTS","HNQW","RK","DE","C"])
120 | '''
121 | "Shen 7" From: "Predicting protein-protein interactions based only on sequences information.",
122 | Shen J,Jiang H. et al. PNAS.  2007.
123 |     Suggested ese as trimers, and/or with RNA 4-mers for predicting Protein-interaction,
124 | (Protein-RNA idea, from: "Predicting RNA-Protein Interactions Using Only Sequence Information",
125 |     BMC Bioinformatics. 2011; Dobbs et al)
126 | '''
127 | 
128 | 
129 | #hydrophilic vs. hydrophobic
130 | hp2 =TransDict_from_list(["AGTSNQDEHRKP", "CMFILVWY"])
131 | #Hydrophilic, Hydrophobic, Charged. (Custom Ofer)
132 | hp3 = TransDict_from_list(["AGTSNQP", "CMFILVWY", "RKHED"])
133 | #Hydrophilic, Hydrophobic, Positively Charged. (Custom Ofer)
134 | hp3_Plus = TransDict_from_list(["AGTSNQPHED", "CMFILVWY", "RK"])
135 | 
136 | murphy10 =TransDict_from_list(  ["LVIM", "C", "A", "G", "ST",
137 |  "P", "FYW", "EDNQ", "KR", "H"])
138 | 
139 | aromatic2 =TransDict_from_list(["FHWY", "ADKERNTSQLIVMCGP"])
140 | 
141 | hp_aroma_4 =TransDict_from_list(["H", "CMILV", "FWY", "ADKERNTSQGP"])
142 | 
143 | # 'ofer13KR transdict:'
144 | ofer13KR = {'V': 'I', 'E': 'D', 'G': 'G', 'D': 'D', 'N': 'N', 'L': 'I', 'F': 'F', 'S': 'S', 'P': 'P', 'R': 'R', 'K': 'K', 'Y': 'F', 'T': 'S', 'C': 'C', 'A': 'A', 'Q': 'Q', 'M': 'I', 'H': 'H', 'W': 'W', 'I': 'I'}
145 | 
146 | # https://github.com/biopython/biopython/blob/master/Bio/Alphabet/Reduced.py
147 | murphy15 = {"L": "L",             "V": "L",             "I": "L",
148 |              "M": "L",             "C": "C",             "A": "A",
149 |              "G": "G",             "S": "S",             "T": "T",
150 |              "P": "P",             "F": "F",             "Y": "F",
151 |              "W": "W",             "E": "E",
152 |              "D": "D",             "N": "N",             "Q": "Q",
153 |              "K": "K",             "R": "K",             "H": "H"}
154 | 
155 | murphy_8 = {"L": "L",             "V": "L",             "I": "L",
156 |             "M": "L",
157 |              "C": "L",             "A": "A",             "G": "A",
158 |              "S": "S",             "T": "S",
159 |              "P": "P",             "F": "F",             "Y": "F",
160 |              "W": "F",             "E": "E",
161 |              "D": "E",             "N": "E",             "Q": "E",
162 |              "K": "K",             "R": "K",             "H": "H"}
163 | pc5 = {"I": "A", # Aliphatic
164 |          "V": "A",         "L": "A",
165 |          "F": "R", # Aromatic
166 |          "Y": "R",         "W": "R",         "H": "R",
167 |          "K": "C", # Charged
168 |          "R": "C",         "D": "C",         "E": "C",
169 |          "G": "T", # Tiny
170 |          "A": "T",         "C": "T",         "S": "T",
171 |          "T": "D", # Diverse
172 |          "M": "D",         "Q": "D",         "N": "D",
173 |          "P": "D"}
174 | 
175 | 
176 | 
177 | ### ProFEAT propensity based scales: ####
178 | # modified from ProFEAT + CTD.  (Intended for letter: number use there)
179 | Disorder_3=TransDict_from_list(['ARSQEGKP','ILNCFYVW', 'DHMT'])
180 | Hydrophobicity_3 = TransDict_from_list(['RKEDQN','GASTPHY','CLVIMFW'])
181 | # #'1'stand for Polar; '2'stand for Neutral, '3' stand for Hydrophobicity
182 | Polarity_3 = TransDict_from_list(['LIFWCMVY','PATGS','HQRKNED']) #ProFeat based
183 | # #'1'stand for (4.9-6.2); '2'stand for (8.0-9.2), '3' stand for (10.4-13.0)
184 | Polarizability_3 = TransDict_from_list(['GASDT','CPNVEQIL','KMHFRYW'])
185 | # #'1'stand for (0-0.108); '2'stand for (0.128-0.186), '3' stand for (0.219-0.409)
186 | Charge_3 = TransDict_from_list(['KR','ANCQGHILMFPSTWYV','DE'])
187 | # #'1'stand for Positive; '2'stand for Neutral, '3' stand for Negative
188 | SecondaryStr_3 = TransDict_from_list(['EALMQKRH','VIYCWFT','GNPSD']) #Orig
189 | # #1'stand for Helix; '2'stand for Strand, '3' stand for coil
190 | NormVDWV_3 = TransDict_from_list(['GASTPDC','NVEQIL','MHKFRYW'])
191 | # #1'stand for (0-2.78); '2'stand for (2.95-4.0), '3' stand for (4.03-8.08)
192 | SolventA_3 = TransDict_from_list(['ALFCGIVW','RKQEND','MPSTHY'])
193 | # #1'stand for Buried; '2'stand for Exposed, '3' stand for Intermediate
194 | SurfaceTension_3 = TransDict_from_list(['GQDNAHR','KTSEC','ILMFPWYV'])
195 | # Hierarchical Classification of Protein Folds Using a Novel Ensemble Classifier. PLoS ONE
196 | 
197 | THREE_LETTER_ALPH_NAMES = ['Disorder_3','Hydrophobicity_3',
198 | 'Polarity_3','Polarizability_3','Charge_3','SecondaryStr_3',
199 | 'NormVDWV_3','SolventA_3','SurfaceTension_3']
200 | 
201 | 'Call alphabet by name from this dict, then feed value into translator func:'
202 | REDUCED_ALPHABETS_TRANSDICTS = {
203 | 'ofer14':(ofer14),
204 | 'ofer_w8':ofer_w8,
205 | 'ofer13':(ofer13),
206 | 'ofer8':ofer8,
207 | 'ofer_tail':ofer_tail,
208 | 'gbm4':(gbm4),
209 | 'murphy10':(murphy10),
210 | 'hp_aroma_4':(hp_aroma_4),
211 | 'hp2':(hp2),
212 | 'hp3':(hp3),
213 | 'alex6':(alex6),
214 | 'sdm12':(sdm12),
215 | 'hsdm17':(hsdm17),
216 | 'murphy15':murphy15,
217 | 'pc5':pc5,
218 | 'Disorder_3':Disorder_3,
219 | 'Hydrophobicity_3':Hydrophobicity_3,
220 | 'Polarity_3':Polarity_3,
221 | 'Polarizability_3':Polarizability_3,
222 | 'Charge_3':Charge_3,
223 | 'SecondaryStr_3':SecondaryStr_3,
224 | 'NormVDWV_3':NormVDWV_3,
225 | 'SolventA_3':SolventA_3,
226 | 'hp3_Plus':hp3_Plus,
227 | 'ofer_gbm5':ofer_gbm5,
228 | 'shen7':shen7
229 | }
230 | 
231 | 
232 | def Get_Alph_Letters(REDUCED_ALPHABETS_TRANSDICTS):
233 |     REDUCED_ALPHABETS_LETTERS = defaultdict(str)
234 |     for k,v in REDUCED_ALPHABETS_TRANSDICTS.items():
235 |         REDUCED_ALPHABETS_LETTERS[k]=Get_Letters(v)
236 |     REDUCED_ALPHABETS_LETTERS['AA20'] = 'ACDEFGHIKLMNPQRSTVWY' #Include full, nonreduced alphabet.
237 |     return REDUCED_ALPHABETS_LETTERS
238 | 
239 | 'Make this run once! Not every time method is called! (Potentially)'
240 | REDUCED_ALPHABETS_LETTERS = Get_Alph_Letters(REDUCED_ALPHABETS_TRANSDICTS)
241 | 
242 | ##############################################################################
243 | 
244 | if __name__=="__main__":
245 |     print("ofer13KR transdict:")
246 |     print(ofer13KR)
247 |     print()
248 |     '''Check this all works..'''
249 |     # print(Reduced_Alphabets)
250 |     protein="MQNEEDACLEAGYCLGTTLSSWRLHFMEEQSQSTMLMGIGIGALLTLAFVGIFFFVYRRVRRLRRAEDQQGTDDESDYQTEYEEELPAIPKETYADFQSTGIELDSDSEYEPSMLQGPPSLTSPEQSQDSFPWLPNQDDQGPRLEHPS"
251 |     print(REDUCED_ALPHABETS_TRANSDICTS['gbm4'])
252 |     print(translate_sequence(protein,REDUCED_ALPHABETS_TRANSDICTS['gbm4']))
253 |     print(REDUCED_ALPHABETS_LETTERS)
254 |     print(REDUCED_ALPHABETS_LETTERS['ofer14'])
255 |     # for k,v in REDUCED_ALPHABETS_LETTERS.items():
256 |     #     print (str(k), str(len(set(v))))
257 |     print(translate_sequence(protein,REDUCED_ALPHABETS_TRANSDICTS['Charge_3']))
258 | 
259 | 
260 | 
261 |     '''
262 |     #Internet:
263 |     import string
264 |     s='abracadabra'
265 |     from_list='abcdr'
266 |     to_list='?*!@|'
267 |     print s.translate(string.maketrans(from_list,to_list)),
268 |     # ?*|?!?@?*|?
269 |     '''
270 | 


--------------------------------------------------------------------------------
/py/asap/features_deps/Disorder.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'DanaLab'
  2 | '''
  3 | Look at using:
  4 | Get_ParamScales()  from protfeat - and import scales from AAScales.py?
  5 | (Also, calc. normalized KD scale, and save it (modify file) to AAScales.py and import from there
  6 | = performance.
  7 | Also, AAScales should hold (import) the TDP-IDP scale.  - Dan. )
  8 | 
  9 | netCharge; calculateAminoAcidCharge - why not use built in ones from main ProtFeat?
 10 |  (also - names of methods here/there are SAME!
 11 |     => asking for bugs when importing, calling methods..) | Change method names.
 12 | 
 13 | Look at using different PHs for netcharge calcing. (This would be a Different feature of
 14 | course; i.e diff key name in res-dict)
 15 | '''
 16 | from collections import Counter
 17 | # from ProtFeat import pKa
 18 | 
 19 | pKa     = {'D':3.9, 'E':4.3, 'H':6.1, 'C':8.3, 'Y':10.1, 'K':10.5, 'R':12, 'N-term':8, 'C-term':3.1}
 20 | charges = {'D':-1,  'E':-1,  'H':+1,  'C':-1,  'Y':-1,   'K':1,    'R':1,  'N-term':1, 'C-term':-1}
 21 | 
 22 | 
 23 | # @staticmethod
 24 | def netCharge(seq,pH = 7.2): #maybe ReName, to "subseq_ .." , to avoid confusion with "calculateProteinCharge", get_netCharge, From ProtFeat.py ? (OR use them directly) - D
 25 |     """
 26 | 
 27 |     :param seq:
 28 |     :return:
 29 |     """
 30 |     aa_counts = Counter(seq)
 31 |     # pH = 7.2
 32 |     res = 0.0
 33 | 
 34 |     def calculateAminoAcidCharge(amino_acid, pH):
 35 |         ratio = 1 / (1 + 10 ** (pH - pKa[amino_acid]))
 36 |         if charges[amino_acid] == 1:
 37 |             return ratio
 38 |         else:
 39 |             return ratio - 1
 40 | 
 41 |     for amino_acid in pKa:
 42 |         res += aa_counts[amino_acid] * calculateAminoAcidCharge(amino_acid, pH)
 43 |     return res
 44 | 
 45 | 
 46 | # @staticmethod
 47 | def hydrophobicity(seq):
 48 |     hydropathy = {'A': 1.8, 'C': 2.5, 'D': -3.5, 'E': -3.5, 'F': 2.8, 'G': -0.4, 'H': -3.2, 'I': 4.5, 'K': -3.9, 'L': 3.8,
 49 |           'M': 1.9, 'N': -3.5, 'P': -1.6, 'Q': -3.5, 'R': -4.5, 'S': -0.8, 'T': -0.7, 'U': 0.0, 'V': 4.2, 'W': -0.9,
 50 |           'Y': -1.3, 'B': -3.5, 'X': -0.49, 'Z': -3.5}
 51 |     WINDOW_SIZE = 5
 52 |     NORME = True  # NORME = ?
 53 | 
 54 |     def normalizeHydropathy():
 55 |         """
 56 | 
 57 | 
 58 |         """
 59 |         minimum = min(hydropathy.values())
 60 |         maximum = max(hydropathy.values())
 61 |         for key in hydropathy.keys():
 62 |             oldVal = hydropathy[key]
 63 |             hydropathy[key] = (oldVal - minimum) / (maximum - minimum)
 64 | 
 65 |     def kyteDoolittle(seq, windowSize, normaliz):
 66 |         """
 67 | 
 68 |         :param seq:
 69 |         """
 70 |         seq = seq.strip()
 71 |         if normaliz:
 72 |             normalizeHydropathy()
 73 |         maxJump = int((windowSize - 1) / 2) #MOD
 74 |         result = 0
 75 |         subResultsArr = []
 76 |         for i in range(maxJump):
 77 |             subResultsArr.append(0)
 78 |         for i in range(maxJump, len(seq) - maxJump):
 79 |             summ = 0
 80 |             for j in range(-maxJump, maxJump + 1):
 81 |                 key = seq[i + j]
 82 |                 if key in hydropathy.keys():
 83 |                     summ += hydropathy[key]
 84 |                 else:
 85 |                     #print(key)
 86 |                     pass
 87 |             subResultsArr.append(summ / windowSize)
 88 |             result += summ / windowSize
 89 | 
 90 |         result /= len(seq)
 91 |         return result, subResultsArr
 92 | 
 93 |     return kyteDoolittle(seq, WINDOW_SIZE, NORME)[0]
 94 | 
 95 | 
 96 | def uversky(seq): #As implemented, this gets the foldindex for WHOLE seq; vs segments/window.
 97 |     '''
 98 |     FoldIndex method prediction of disorder.
 99 |     '''
100 |     #Why use sep. Seq? Use seq=self.seq for consistancy/less confusion, ne?
101 |     R = netCharge(seq) #Maybe have this for different PHs.
102 |     H = hydrophobicity(seq)
103 |     uScore = (2.785 * float(H) - 1.151 - float(R))
104 |     return uScore
105 | 
106 | def getDisordered(seq,segments=5):
107 |     '''
108 |     Get predicted disorder for protein, divided into segments (default=5).,
109 |     predicted individually for entirety of each segment; using:
110 |     A) FoldIndex (Uversky) method.
111 | 
112 |     #I'd add other method(s) for getting predicted disordered here also. (EG, TDP-IDP scale, whether seperate or "joint"feature) - D
113 |     '''
114 |     
115 |     # seq = self.seq
116 |     
117 |     length = len(seq)
118 |     window_size = int(length / segments)  # window size 20% of the protein length
119 |     pos = 0
120 |     scores = [0 for _ in range(segments)]
121 | 
122 |     for i in range(segments-1):
123 |         scores[i] = uversky(seq[pos:pos + window_size])
124 |         pos += window_size
125 |     scores[-1] = uversky(seq[pos:])
126 |     
127 |     res = {}
128 |     key = "Disordered window "
129 |     
130 |     for i, uscore in enumerate(scores):
131 |         res[key + str(i)] = 1 if uscore < 0 else 0 #ORIG
132 |         # res[key + str(i)] = 1 if uscore != 0 else 0
133 |         'Binary feature - presence of ANY disordered window:'
134 |         if uscore < (-0.1):
135 |             res['AnyDISORDER_'+str(segments)]=1
136 |         else:
137 |             res['AnyDISORDER_'+str(segments)]=0
138 |     
139 |     return res


--------------------------------------------------------------------------------
/py/asap/features_deps/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ddofer/asap/7f592a660a422e24a6f816021cc95459f896f7a0/py/asap/features_deps/__init__.py


--------------------------------------------------------------------------------
/py/asap/parse.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | from Bio import SeqIO
  4 | 
  5 | from . import util
  6 | from . import config
  7 | from . import data
  8 | 
  9 | LOGGER = logging.getLogger('PARSE')
 10 | 
 11 | def convert_lf_to_fasta(source, output_file):
 12 | 
 13 |     '''
 14 |     Converts a .lf file, which also contains annotations, to a .fasta file, which contains only the amino-acid sequences
 15 |     of the records.
 16 |     @param source (file handle):
 17 |         The source .lf file to read.
 18 |     @param output_file (file handle):
 19 |         A file handlw with writing permissions to write the output FASTA into.
 20 |     '''
 21 | 
 22 |     full_records = parse_records_from_file(source, extract_annotations = True)
 23 |     fasta_records = [full_record.to_fasta_record() for full_record in full_records]
 24 |     SeqIO.write(fasta_records, output_file, 'fasta')
 25 | 
 26 | def parse_records_from_file(source, extract_annotations, relevant_ids = None, extra_tracks = {}):
 27 | 
 28 |     '''
 29 |     Parses full data records from a file (either .lf format, which also includes annotation masks,
 30 |         or a simple fasta )
 31 |     @param source (file):
 32 |         A file handle to parse the records from.
 33 |     @param extract_annotations (bool):
 34 |         Whether to expect a .lf format which also contains annotations, or a simple .fasta format.
 35 |     @param relevant_ids (collection, optional):
 36 |         An optional list of ids. If None, will do nothing. If provided with a collection,
 37 |     will return only records with the given ids.
 38 |     @param extra_tracks (dict, empty by default):
 39 |         Extra tracks to give the records, given in the following format:
 40 |         {
 41 |             track_name : {
 42 |                 record_id: (seq, padding_value),
 43 |                 ...
 44 |             }
 45 |             ...
 46 |         }
 47 |     @return:
 48 |         A generator for the parsed records (each of type FullDataRecord).
 49 |     '''
 50 | 
 51 |     seqs = list(SeqIO.parse(source, 'fasta'))
 52 |     LOGGER.info('Parsing %d sequencess...' % len(seqs))
 53 | 
 54 |     for seq in seqs:
 55 |         if relevant_ids is None or _format_id(seq.id) in relevant_ids:
 56 |             yield _parse_fasta_record(seq, extra_tracks, extract_annotations)
 57 | 
 58 | def get_record_from_seq(seq, annotation_mask = None, extra_tracks = {}):
 59 | 
 60 |     '''
 61 |     Creates a full data record from sequences.
 62 |     @param seq (string):
 63 |         The amino-acid sequence of the record
 64 |     @param annotation_mask (string, optional):
 65 |         A binary mask (made of 0's and 1's) in the same size of the given sequence to use
 66 |         as an annotation mask. If not provided, the record will not have an annotation mask.
 67 |     @param extra_tracks (dict, empty by default):
 68 |     Extra tracks to give the record, given in the following format:
 69 |         {
 70 |             track_name: (seq, padding_value),
 71 |             ...
 72 |         }
 73 |     @return:
 74 |         A FullDataRecord created from the provided data.
 75 |     '''
 76 | 
 77 |     sequence_tracks = data.SequenceTracks()
 78 |     sequence_tracks.add_track(data.SequenceTrack('aa', seq))
 79 | 
 80 |     if annotation_mask is not None:
 81 |         sequence_tracks.add_track(data.SequenceTrack('annotation', annotation_mask))
 82 | 
 83 |     for track_name, track_data in extra_tracks.items():
 84 |         track_seq, track_padding_value = track_data
 85 |         sequence_tracks.add_track(data.SequenceTrack(track_name, track_seq, track_padding_value))
 86 | 
 87 |     return data.FullDataRecord('N/A', 'N/A', 'N/A', sequence_tracks)
 88 | 
 89 | def parse_track_from_file(source, type):
 90 | 
 91 |     '''
 92 |     Parses the track data of multiple records from a FASTA file .
 93 |     @param source (file):
 94 |         The file handle to parse (in FASTA format)
 95 |     @param type (string):
 96 |         The type of the track to parse (options: seq, disorder, pssm)
 97 |     @return:
 98 |         A dictionary of the following format:
 99 |         {
100 |             record_id: (seq, padding_value),
101 |             ...
102 |         }
103 |     '''
104 | 
105 |     track_file_parser, track_seq_parser, padding_value = _TRACK_TYPE_TO_PARSERS_AND_PADDING[type]
106 |     track_data = {}
107 | 
108 |     for record_id, seq in track_file_parser(source):
109 |         track_data[_format_id(record_id)] = (seq, padding_value)
110 | 
111 |     return track_data
112 | 
113 | def parse_track_from_seq(seq, type):
114 | 
115 |     '''
116 |     Parses the track data of a single record from a raw sequence.
117 |     @param seq (string):
118 |         The raw sequence to parse
119 |     @param type (string):
120 |         The type of the track to parse (options: seq, disorder, pssm)
121 |     @return:
122 |         A tuple containing the parsed track sequence and its padding value.
123 |     '''
124 | 
125 |     track_file_parser, track_seq_parser, padding_value = _TRACK_TYPE_TO_PARSERS_AND_PADDING[type]
126 |     return track_seq_parser(seq), padding_value
127 | 
128 | def _parse_fasta_record(fasta_seq, extra_tracks, extract_annotations):
129 | 
130 |     record_id = _format_id(fasta_seq.id)
131 | 
132 |     if extract_annotations:
133 |         raw_seq_and_mask = str(fasta_seq.seq)
134 |         mask_start_index = util.find_first_index_of(raw_seq_and_mask, '01')
135 |         aa_seq = _fix_aa_seq(raw_seq_and_mask[:mask_start_index])
136 |         annotation_mask = raw_seq_and_mask[mask_start_index:]
137 |     else:
138 |         aa_seq = fasta_seq.seq
139 |         annotation_mask = None
140 | 
141 |     sequence_tracks = data.SequenceTracks()
142 |     sequence_tracks.add_track(data.SequenceTrack('aa', aa_seq))
143 | 
144 |     if annotation_mask is not None:
145 |         sequence_tracks.add_track(data.SequenceTrack('annotation', annotation_mask))
146 | 
147 |     for track_name, extra_track_data in extra_tracks.items():
148 |         if record_id in extra_track_data:
149 |             track_seq, track_padding_value = extra_track_data[record_id]
150 |             sequence_tracks.add_track(data.SequenceTrack(track_name, track_seq, track_padding_value))
151 |         else:
152 |             raise Exception('No record for %s in track %s' % (record_id, track_name))
153 | 
154 |     return data.FullDataRecord(record_id, fasta_seq.name, fasta_seq.description, sequence_tracks)
155 | 
156 | def _parse_seq_track_from_file(source):
157 |     for seq in SeqIO.parse(source, 'fasta'):
158 |         yield seq.id, str(seq.seq)
159 | 
160 | def _parse_seq_track_from_seq(seq):
161 |     return seq
162 | 
163 | def _parse_disorder_track_from_file(source):
164 |     for seq in SeqIO.parse(source, 'fasta'):
165 |         yield seq.id, _parse_disorder_track_from_seq(str(seq.seq))
166 | 
167 | def _parse_disorder_track_from_seq(seq):
168 |     disorder_start_index = util.find_first_index_of(seq, config.DISORDER_OPTIONS)
169 |     return seq[disorder_start_index:]
170 | 
171 | def _parse_pssm_track_from_file(source):
172 |     for raw_record in source.read().split('>')[1:]:
173 |         lines = raw_record.splitlines()
174 |         record_id = lines[0].split(' ')[0]
175 |         pssm = _parse_pssm(lines[1:])
176 |         yield record_id, pssm
177 | 
178 | def _parse_pssm_track_from_seq(seq):
179 |     return _parse_pssm(seq.splitlines())
180 | 
181 | def _parse_pssm(lines):
182 | 
183 |     pssm = []
184 | 
185 |     for line in lines:
186 |         freqs_vector = map(float, line.split(' ')[1:])
187 |         freqs_dict = dict(zip(config.PSSM_AMINO_ACIDS, freqs_vector))
188 |         freqs_dict['_'] = 0.0
189 |         pssm += [freqs_dict]
190 | 
191 |     return pssm
192 | 
193 | def _fix_aa_seq(seq):
194 | 
195 |     fixed_seq = ''
196 | 
197 |     for aa in seq:
198 |         if aa in config.AMINO_ACIDS:
199 |             fixed_seq += aa
200 |         else:
201 |             fixed_seq += '_'
202 | 
203 |     return fixed_seq
204 | 
205 | def _format_id(id):
206 |     return id.replace('|', '__').replace('-', '_')
207 | 
208 | _TRACK_TYPE_TO_PARSERS_AND_PADDING = {
209 |     'seq': (_parse_seq_track_from_file, _parse_seq_track_from_seq, '_'),
210 |     'disorder': (_parse_disorder_track_from_file, _parse_disorder_track_from_seq, '_'),
211 |     'pssm': (_parse_pssm_track_from_file, _parse_pssm_track_from_seq, [dict([(aa, 0.0) for aa in config.PSSM_AMINO_ACIDS] + [('_', 1.0)])]),
212 | }
213 | 


--------------------------------------------------------------------------------
/py/asap/sklearn_extensions.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This module contains extensions required by our project that we wish sklearn supported.
  3 | '''
  4 | 
  5 | from sklearn.ensemble import RandomForestClassifier
  6 | 
  7 | class RandomForestClassifierWithCoef(RandomForestClassifier):
  8 | 
  9 |     '''
 10 |     A small hack required to make sklearn.ensemble.RandomForestClassifier support sklearn.feature_selection.RFECV.
 11 |     '''
 12 | 
 13 |     def fit(self, *args, **kwargs):
 14 |         '''
 15 |         @see sklearn.ensemble.RandomForestClassifier.fit
 16 |         '''
 17 |         super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
 18 |         self.coef_ = self.feature_importances_
 19 | 
 20 | class FeatureSelectionPipeline(object):
 21 |     
 22 |     '''
 23 |     Like sklearn.pipeline.Pipeline, but suitable for feature selection only.
 24 |     Unfortunately we can't use sklearn.pipeline.Pipeline as it is, because it doesn't have the get_support method that we need.
 25 |     This class isn't intended for general purpose, and we do not recommend using it outside the context of this project.
 26 |     '''
 27 |     
 28 |     def __init__(self, feature_selectors):
 29 |         
 30 |         '''
 31 |         @param feature_selectors (list of feature selectors):
 32 |             The list of feature selectors to pipeline together.
 33 |         '''
 34 |         
 35 |         if len(feature_selectors) == 0:
 36 |             raise Exception('Cannot pipeline an empty list of feature selectors')
 37 |             
 38 |         for feature_selector in feature_selectors:
 39 |             for method_name in ['fit', 'transform', 'fit_transform', 'get_support']:
 40 |                 if not hasattr(feature_selector, method_name):
 41 |                     raise Exception('Feature selectors must have a %s method' % method_name)
 42 |         
 43 |         self.feature_selectors = feature_selectors
 44 |         
 45 |     def fit(self, X, y):
 46 |         
 47 |         for feature_selector in self.feature_selectors[:-1]:
 48 |             X = feature_selector.fit_transform(X, y)
 49 |             
 50 |         self.feature_selectors[-1].fit(X, y)
 51 |         
 52 |     def transform(self, X):
 53 |         
 54 |         for feature_selector in self.feature_selectors:
 55 |             X = feature_selector.transform(X)
 56 |             
 57 |         return X
 58 |         
 59 |     def fit_transform(self, X, y):
 60 |         
 61 |         for feature_selector in self.feature_selectors:
 62 |             X = feature_selector.fit_transform(X, y)
 63 |         
 64 |         return X
 65 |         
 66 |     def get_support(self):
 67 |         
 68 |         support = self.feature_selectors[0].get_support()
 69 |         
 70 |         for feature_selector in self.feature_selectors[1:]:
 71 |             support = _embed_vector_in_mask(feature_selector.get_support(), support)
 72 |             
 73 |         return support
 74 |         
 75 | def _embed_vector_in_mask(vector, mask):
 76 |     
 77 |     '''
 78 |     Embedding a vector inside the positive indices of a boolean mask.
 79 |     For example, if given the vector [x1, x2, x3] and the mask [0, 0, 0, 1, 0, 0, 1, 1], then the returned value will
 80 |     be [0, 0, 0, x1, 0, 0, x2, x3].
 81 |     Note that the number of 1's in the mask must be equal to the length of the vector.
 82 |     '''
 83 |     
 84 |     _validate_boolean(mask)
 85 |     
 86 |     if len(vector) != sum(mask):
 87 |         raise Exception('Cannot embed a vector of size %d in a mask with %d 1\'s' % (len(vector), sum(mask)))
 88 |     
 89 |     result = [0] * len(mask)
 90 |     vector_index = 0
 91 |     
 92 |     for i, flag in enumerate(mask):
 93 |         if flag:
 94 |             result[i] = vector[vector_index]
 95 |             vector_index += 1
 96 |             
 97 |     return result
 98 |     
 99 | def _validate_boolean(mask):
100 |     for flag in mask:
101 |         if flag not in [0, 1]:
102 |             raise Exception('Expecting a boolean mask, given %s element' % repr(flag))
103 |         
104 | 


--------------------------------------------------------------------------------
/py/asap/util.py:
--------------------------------------------------------------------------------
 1 | def apply_mask(array, mask):
 2 |     if len(array) == len(mask):
 3 |         return [element for element, flag in zip(array, mask) if flag]
 4 |     else:
 5 |         raise Exception('Cannot apply a mask of a different length')
 6 | 
 7 | def bit_to_bool(bit):
 8 |     return bit == '1'
 9 | 
10 | def find_first_index_of(string, character_list):
11 |     for i, c in enumerate(string):
12 |         if c in character_list:
13 |             return i
14 | 
15 | def format_as_csv_value(value):
16 |     if type(value) == bool:
17 |         if value:
18 |             return '1'
19 |         else:
20 |             return '0'
21 |     if type(value) == float:
22 |         return '%.4f' % value
23 |     else:
24 |         return str(value)
25 | 
26 | def write_csv_line(csv_writer, line):
27 |     csv_writer.writerow(map(format_as_csv_value, line))
28 | 


--------------------------------------------------------------------------------
/py/asap/window_extraction.py:
--------------------------------------------------------------------------------
  1 | from StringIO import StringIO
  2 | import datetime
  3 | import csv
  4 | import logging
  5 | 
  6 | from . import util
  7 | from . import features
  8 | from . import parse
  9 | 
 10 | BASIC_HEADERS = [
 11 |     'peptide_id',
 12 |     'window_hot_index',
 13 |     'window_seq',
 14 |     'window_neighbourhood',
 15 | ]
 16 | 
 17 | ANNOTATION_HEADERS = [
 18 |     'window_annotation_mask',
 19 |     'window_label',
 20 |     'window_only_almost_positive',
 21 | ]
 22 | 
 23 | META_WINDOW_HEADERS = BASIC_HEADERS + ANNOTATION_HEADERS
 24 | 
 25 | LOGGER = logging.getLogger('EXTRACTION')
 26 | 
 27 | class WindowExtractionParams(object):
 28 | 
 29 |     '''
 30 |     Parameters that should be used when extracting windows and their features from full records.
 31 |     '''
 32 | 
 33 |     def __init__(self, window_prefix = 9, window_suffix = 9, neighbourhood_prefix = 5, neighbourhood_suffix = 5, \
 34 |             windows_filter = None, feature_keys = features.DEFAULT_FEATURE_KEYS):
 35 | 
 36 |         '''
 37 |         @param window_prefix, window_suffix (int, both default 7):
 38 |             The number of residues before and after the hot index in each window, where the hot index is the position determining the label of
 39 |             the window (i.e. the window's label is the value of the annotation mask in the hot index). It follows that the total window size
 40 |             is (window_prefix + window_suffix + 1).
 41 |         @param neighbourhood_prefix, neighbourhood_suffix (int, both default 5):
 42 |             The number of residues before and after the hot index in determining the neighbourhood of the window. The neighbourhood can be used
 43 |             during the training process in order to avoid duplicates of very similar windows.
 44 |         @param windows_filter (function, optional):
 45 |             A function to filter the extracted windows by. The function will receive Window objects and should return a bool stating whether to
 46 |             include them or not. If not provided, no filtration will take place in the windows level, and all windows will be extracted.
 47 |         @param feature_keys (list, optional, default features.DEFAULT_FEATURE_KEYS):
 48 |             A list of features to extract for each window. You can see the full list of optional keywords in features.FEARURE_KEY_OPTIONS, where
 49 |             documentation is also provided for each of the feature keys. If not provided, all features will be extracted by default (i.e. will
 50 |             use all of the features in features.FEARURE_KEY_OPTIONS). By default will use features.DEFAULT_FEATURE_KEYS, which is another list
 51 |             containing most of the features, but not all of them, as there are some features that it doesn't make much sense to use at the same
 52 |             time (e.g. both 'aa' and 'aa_reduced'). We anticipate that using the default features should give pretty good results in most
 53 |             scenarios, so fine-tuning the exact used features can be left for late stages of a project.
 54 |             Important note: Features that rely on extra tracks (ss, acc, disorder, pssm) will not be extracted if the tracks are not provided,
 55 |             even if those features are explicitly given in this list.
 56 |         '''
 57 | 
 58 |         self.window_prefix = window_prefix
 59 |         self.window_suffix = window_suffix
 60 |         self.neighbourhood_prefix = neighbourhood_prefix
 61 |         self.neighbourhood_suffix = neighbourhood_suffix
 62 |         self.windows_filter = windows_filter
 63 |         self.feature_keys = feature_keys
 64 | 
 65 |         self.window_size = window_prefix + window_suffix + 1
 66 |         self.window_hot_index = window_prefix
 67 |         self.neighbourhood_size = neighbourhood_prefix + neighbourhood_suffix + 1
 68 | 
 69 | def extract_windows_from_file(source, extract_annotations = False, seqs_filtration_file = None, \
 70 |         extra_tracks_files = {}, csv_output_file = None, window_extraction_params = WindowExtractionParams()):
 71 | 
 72 |     '''
 73 |         Parses a given file with peptide sequences and breaks it into windows with features, outputs a CSV with a row for each window
 74 |         and a column for each feature (along with a few other meta headers).
 75 |     @param source (file):
 76 |         A file handle to parse the peptide sequences from. Can be either a .fasta or .lf format, depending on the extract_annotations
 77 |         parameter.
 78 |     @param extract_annotations (boolean, default False):
 79 |         Whether to expect finding annotation masks inside the given file of sequences. If set to True, will expect getting a .lf file
 80 |         which also contains annotations. If set to False, will expect getting a .fasta file that contains only the sequences. Annotations
 81 |         are required only if one plans using the extracted windows to train a new classifier, rather than using an existing one.
 82 |     @param seqs_filtration_file (file, optional):
 83 |         A fasta format file handle to use for filtering records. If given, will use only sequences with an ID that is also present in the
 84 |         ids of this FASTA file. If not given, will not perform any filtration.
 85 |     @param extra_tracks_files (dict, empty by default):
 86 |         A dictionary for providing extra tracks to extract data from (beside the actual amino-acid sequence and annotations mask). The
 87 |         given dictionary should map from a track name to a file handle containing the track data for each of the records in a FASTA
 88 |         format. The currently supported extra tracks are: ss (secondary-structure), acc (accessibility), disorder and pssm (position-specific
 89 |         scoring matrix).
 90 |     @param csv_output_file (file, optional):
 91 |         A file handle with writing permissions to write the output CSV into. If not provided, will return a StringIO object from which the
 92 |         output CSV can be read.
 93 |     @param window_extraction_params (WindowExtractionParams, default params by default):
 94 |         Parameters to use for extracting the windows.
 95 |     @return:
 96 |         If csv_output_file is given, will return nothing. If csv_output_file is not given, will return a a StringIO object from which the
 97 |         output CSV can be read.
 98 |     '''
 99 | 
100 |     relevant_ids = _get_relevant_ids(seqs_filtration_file)
101 |     extra_tracks = _get_extra_tracks_from_files(extra_tracks_files)
102 |     full_records = list(parse.parse_records_from_file(source, extract_annotations, relevant_ids, extra_tracks))
103 |     parse.LOGGER.info('Final records: %d' % len(full_records))
104 |     _pad_records(full_records, window_extraction_params)
105 |     return _extract_windows(full_records, csv_output_file, window_extraction_params)
106 | 
107 | def extract_windows_from_seq(seq, annotation_mask = None, extra_tracks_data = {}, csv_output_file = None, \
108 |         window_extraction_params = WindowExtractionParams()):
109 | 
110 |     '''
111 |     Breaking a peptide sequence into windows with features, outputting a CSV with a row for each window and a column for each feature
112 |     (along with a few other meta headers).
113 |     @param seq (string):
114 |         The peptide sequence to use, given in a 20 amino-acid alphabet.
115 |     @param annotation_mask (string, optional):
116 |         An annotation mask to use as a labeling for each position along the sequence. Expecting a binary sequence (of 0's and 1's) in
117 |         the same length of the given amino-acid sequence. If not provided, the extracted windows won't have labels, meaning they cannot
118 |         be used for training a new classifier (only fed to an already trained classifier).
119 |     @param extra_tracks_data (dict, empty by default):
120 |         A dictionary for providing extra tracks to extract data from (beside the actual amino-acid sequence and annotations mask). The
121 |         given dictionary should map from track names to their sequence. Currently supported extra tracks are: ss (secondary-structure),
122 |         acc (accessibility), disorder and pssm (position-specific scoring matrix).
123 |     @param csv_output_file (file, optional):
124 |         A file handle with writing permissions to write the output CSV into. If not provided, will return a StringIO object from which
125 |         the output CSV can be read.
126 |     @param window_extraction_params (WindowExtractionParams, default params by default):
127 |         Parameters to use for extracting the windows.
128 |     @return:
129 |         If csv_output_file is given, will return nothing. If csv_output_file is not given, will return a a StringIO object from which the
130 |         output CSV can be read.
131 |     '''
132 | 
133 |     extra_tracks = _get_extra_tracks_from_raw_data(extra_tracks_data)
134 |     full_record = parse.get_record_from_seq(seq, annotation_mask, extra_tracks)
135 |     _pad_record(full_record, window_extraction_params)
136 |     return _extract_windows([full_record], csv_output_file, window_extraction_params)
137 | 
138 | def _get_relevant_ids(seqs_filtration_file):
139 |     if seqs_filtration_file is None:
140 |         return None
141 |     else:
142 |         relevant_ids = parse.parse_track_from_file(seqs_filtration_file, 'seq').keys()
143 |         parse.LOGGER.info('%d records are in the filtration FASTA file' % len(relevant_ids))
144 |         return relevant_ids
145 | 
146 | def _get_extra_tracks_from_files(extra_tracks_files):
147 | 
148 |     extra_tracks = {}
149 | 
150 |     for track_name, track_source in extra_tracks_files.items():
151 |         if track_name in _TRACK_NAME_TO_TYPE:
152 |             track_type = _TRACK_NAME_TO_TYPE[track_name]
153 |             extra_tracks[track_name] = parse.parse_track_from_file(track_source, track_type)
154 |         else:
155 |             raise Exception('Unknown track name: ' + str(track_name))
156 | 
157 |     return extra_tracks
158 | 
159 | def _get_extra_tracks_from_raw_data(extra_tracks_data):
160 | 
161 |     extra_tracks = {}
162 | 
163 |     for track_name, raw_track_seq in extra_tracks_data.items():
164 |         if track_name in _TRACK_NAME_TO_TYPE:
165 |             track_type = _TRACK_NAME_TO_TYPE[track_name]
166 |             extra_tracks[track_name] = parse.parse_track_from_seq(raw_track_seq, track_type)
167 |         else:
168 |             raise Exception('Unknown track name: ' + str(track_name))
169 | 
170 |     return extra_tracks
171 | 
172 | def _pad_records(records, window_extraction_params):
173 |     for record in records:
174 |         _pad_record(record, window_extraction_params)
175 | 
176 | def _pad_record(record, window_extraction_params):
177 |     record.pad(window_extraction_params.window_prefix - 1, window_extraction_params.window_suffix - 1)
178 | 
179 | def _extract_windows(full_records, csv_output_file, window_extraction_params):
180 |     if csv_output_file is None:
181 |         csv_buffer = StringIO()
182 |         _extract_windows_to_csv(full_records, csv_buffer, window_extraction_params)
183 |         csv_buffer.seek(0)
184 |         return csv_buffer
185 |     else:
186 |         _extract_windows_to_csv(full_records, csv_output_file, window_extraction_params)
187 | 
188 | def _extract_windows_to_csv(full_records, output_file, window_extraction_params):
189 | 
190 |     LOGGER.info('Extracting windows with features in CSV format...')
191 |     start = datetime.datetime.now()
192 |     csv_writer = csv.writer(output_file)
193 |     feature_headers = None
194 |     include_annotations = None
195 | 
196 |     for record in full_records:
197 |         for window in record.get_windows(window_extraction_params.window_size):
198 |             feature_headers, include_annotations = _process_window_to_csv(window, csv_writer, window_extraction_params, \
199 |                         feature_headers, include_annotations)
200 | 
201 |     time_diff = datetime.datetime.now() - start
202 |     LOGGER.info('Done. Extraction took %d seconds.' % time_diff.total_seconds())
203 | 
204 | def _process_window_to_csv(window, csv_writer, window_extraction_params, feature_headers, include_annotations):
205 | 
206 |     if feature_headers is None:
207 |         features = window.get_features(window_extraction_params.window_hot_index, window_extraction_params.feature_keys)
208 |         feature_headers = list(sorted(features.keys()))
209 |         include_annotations = window.has_annotation_mask()
210 |         util.write_csv_line(csv_writer, _get_meta_headers(include_annotations) + feature_headers)
211 | 
212 |     if window_extraction_params.windows_filter is None or window_extraction_params.windows_filter(window):
213 |         meta_values = _get_window_meta_values(window, window_extraction_params, include_annotations)
214 |         features = window.get_features(window_extraction_params.window_hot_index, window_extraction_params.feature_keys)
215 |         feature_values = [features[header] for header in feature_headers]
216 |         util.write_csv_line(csv_writer, meta_values + feature_values)
217 | 
218 |     return feature_headers, include_annotations
219 | 
220 | def _get_meta_headers(include_annotations):
221 |     if include_annotations:
222 |         return BASIC_HEADERS + ANNOTATION_HEADERS
223 |     else:
224 |         return BASIC_HEADERS
225 | 
226 | def _get_window_meta_values(window, window_extraction_params, include_annotations):
227 | 
228 |     hot_index = window.original_index + window_extraction_params.window_hot_index
229 |     neighbourhood = window.get_neighbourhood(window_extraction_params.window_hot_index, window_extraction_params.neighbourhood_prefix, \
230 |             window_extraction_params.neighbourhood_suffix)
231 |     meta_values = [window.full_record.id, hot_index, window.get_aa_seq(), neighbourhood]
232 | 
233 |     if include_annotations:
234 |         label = window.get_label(window_extraction_params.window_hot_index)
235 |         is_only_almost_positive = window.is_only_almost_positive(window_extraction_params.window_hot_index)
236 |         meta_values += [window.get_annotation_mask(), label, is_only_almost_positive]
237 | 
238 |     return meta_values
239 | 
240 | _TRACK_NAME_TO_TYPE = {
241 |     'ss': 'seq',
242 |     'acc': 'seq',
243 |     'disorder': 'disorder',
244 |     'pssm': 'pssm',
245 | }
246 | 


--------------------------------------------------------------------------------
/py/cleavepred/__init__.py:
--------------------------------------------------------------------------------
1 | from .api import simple_cleavage_predictor, advanced_cleavage_predictor


--------------------------------------------------------------------------------
/py/cleavepred/api.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | 
 3 | from . import project_paths
 4 | 
 5 | class CleavagePredictor(object):
 6 | 
 7 |     '''
 8 |     A predictor trained to predict the cleavage of peptides.
 9 |     There should be only two instances of this class:
10 |     1. simple_cleavage_predictor - Uses only the basic features derived from the amino-acid sequence of peptides.
11 |     2. advanced_cleavage_predictor - Used also features derived from external tools (ss, acc, disorder and pssm).
12 |     '''
13 | 
14 |     def __init__(self, advanced):
15 |         self.advanced = advanced
16 |         self._peptide_predictor = None
17 | 
18 |     def predict(self, seq, extra_tracks_data = {}, proba = False):
19 |         '''
20 |         Predicts cleavage for a given peptide.
21 |         @param seq (string):
22 |             The amino-acid sequence of the peptide to predict the annotations for, given in a 20 amino-acid alphabet.
23 |         @param extra_tracks_data (dict, empty by default):
24 |             A dictionary for providing extra tracks of the given peptide. If using the simple predictor (i.e. advanced = False), it can
25 |             be left empty. If using the advanced predictor (i.e. advanced = True), must receive all tracks (i.e. ss, acc, disorder
26 |             and pssm). The given dictionary should map from track names to their sequence.
27 |         @param proba (default False):
28 |             Whether to return mask of predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s).
29 |         @return:
30 |             A tuple composed of:
31 |             1. cleavage_mask - If proba = False, it will be a binary string (0's and 1's) representing whether each residue is a cleavage
32 |             site (1) or not (0). If proba = True, it will be a list of floats (between 0 to 1) representing the probability of each residue
33 |             to be a cleavage site. Either way, the length of the returned string/list will correspond to the length of the provided peptide
34 |             sequence.
35 |             2. cleavage_products - A list of strings, each representing the amino-acid sequence of a predicted cleavage product.
36 |         '''
37 |         cleavage_mask = self.get_peptide_predictor().predict_annotations(seq, extra_tracks_data = extra_tracks_data, proba = proba)
38 |         cleavage_products = _get_cleavage_products(seq, cleavage_mask)
39 |         return cleavage_mask, cleavage_products
40 | 
41 |     def get_peptide_predictor(self):
42 | 
43 |         '''
44 |         @return:
45 |             The PeptidePredictor object associated with this cleavage predictor.
46 |         '''
47 | 
48 |         if self._peptide_predictor is None:
49 |             self._peptide_predictor = self._load_peptide_predictor()
50 | 
51 |         return self._peptide_predictor
52 | 
53 |     def _load_peptide_predictor(self):
54 | 
55 |         predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(self.advanced), 'rb')
56 | 
57 |         try:
58 |             return pickle.load(predictor_dump_file)
59 |         finally:
60 |             predictor_dump_file.close()
61 | 
62 | simple_cleavage_predictor = CleavagePredictor(False)
63 | advanced_cleavage_predictor = CleavagePredictor(True)
64 | 
65 | def _get_cleavage_products(seq, cleavage_mask):
66 | 
67 |     products = []
68 |     current_product = ''
69 | 
70 |     for i in range(len(seq)):
71 |     
72 |         current_product += seq[i]
73 |     
74 |         # When we have continuous positive cleavage sites, we consider only the most C-terminus one.
75 |         if _is_cleavage(cleavage_mask[i]) and (i >= len(seq) - 1 or not _is_cleavage(cleavage_mask[i + 1])):
76 |             _add_if_not_empty(products, current_product)
77 |             current_product = ''
78 | 
79 |     _add_if_not_empty(products, current_product)
80 |     return products
81 |     
82 | def _is_cleavage(label):
83 |     if isinstance(label, str):
84 |         return label == '1'
85 |     elif isinstance(label, int) or isinstance(label, float):
86 |         return int(round(label)) == 1
87 |     else:
88 |         raise Exception('Unknown label type: ' + str(type(label)))
89 | 
90 | def _add_if_not_empty(array, string):
91 |     if len(string) > 0:
92 |         array += [string]
93 | 


--------------------------------------------------------------------------------
/py/cleavepred/check_top_features.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Checks the top features predicted for a given dataset.
 3 | Arguments:
 4 | - dataset_name (string): The name of the dataset to use.
 5 | - advanced (boolean): Whether to use all or only some features, corresponding to simple and advanced classifiers respectively.
 6 | '''
 7 | 
 8 | import sys
 9 | import logging
10 | 
11 | import pandas as pd
12 | 
13 | from asap import get_top_features
14 | 
15 | from cleavepred import util
16 | from cleavepred import project_paths
17 | 
18 | logger = logging.getLogger('FEATURES')
19 | 
20 | ### Parse arguments ###
21 | 
22 | project_paths.dataset_name = sys.argv[1].lower()
23 | advanced = util.parse_bool(sys.argv[2])
24 | 
25 | ### Get top features ###
26 | 
27 | windows_file = None
28 | 
29 | def open_files():
30 |     global windows_file
31 |     windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
32 |     
33 | def close_files():
34 |     util.close_files([windows_file])
35 |     
36 | def get_advanced_label():
37 |     if advanced:
38 |         return 'advanced'
39 |     else:
40 |         return 'simple'
41 |     
42 | def check_top_features():
43 |     windows_data_frame = pd.read_csv(windows_file)
44 |     logger.info('Checking top features over %s dataset with %s features...' % (project_paths.dataset_name, get_advanced_label()))
45 |     top_features = get_top_features(windows_data_frame, drop_only_almost_positives = True)
46 |     logger.info('Top features: ' + ', '.join(top_features))
47 |     
48 | if __name__ == '__main__':
49 |     try:
50 |         open_files()
51 |         check_top_features()
52 |     finally:
53 |         close_files()
54 | 


--------------------------------------------------------------------------------
/py/cleavepred/common.py:
--------------------------------------------------------------------------------
 1 | from asap import FEATURE_KEY_OPTIONS, WindowExtractionParams
 2 | from asap.config import POSITIVE_AMINO_ACIDS
 3 | 
 4 | AVAILABLE_TRACKS = [
 5 |     'ss',
 6 |     'acc',
 7 |     'disorder',
 8 |     'pssm',
 9 | ]
10 | 
11 | # Here we prefer using 'aa_reduced' over 'aa'. We give up on some other features.
12 | USED_FEATURES = set(FEATURE_KEY_OPTIONS).difference(['aa', 'accum_charge_left', 'accum_charge_right', 'accum_pos_charge_left', 'accum_pos_charge_right'])
13 | 
14 | def windows_filter(window):
15 |     '''
16 |     We consider only windows with a positively charged amino-acid (i.e. K/R) in the hot index (only then it can be a
17 |     cleavage candidate).
18 |     '''
19 |     return window.get_aa_seq()[window_extraction_params.window_hot_index] in POSITIVE_AMINO_ACIDS
20 | 
21 | window_extraction_params = WindowExtractionParams(window_prefix = 11, window_suffix = 8, neighbourhood_prefix = 5, \
22 |         neighbourhood_suffix = 5, windows_filter = windows_filter, feature_keys = USED_FEATURES)
23 | 


--------------------------------------------------------------------------------
/py/cleavepred/extract_uniprot_annotated_seqs_from_xml.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Extract a .lf file, containing sequences and cleavage annotation masks, out of a UniProt's XML file.
  3 | Arguments:
  4 | - output_file_path (file path, optional): The path to write the output .lf file to. If not provided, will update the project's relevant file.
  5 | '''
  6 | 
  7 | import sys
  8 | import re
  9 | import xml.etree.ElementTree as et
 10 | from StringIO import StringIO
 11 | 
 12 | from cleavepred import util
 13 | from cleavepred import project_paths
 14 | 
 15 | project_paths.dataset_name = 'uniprot'
 16 | 
 17 | if len(sys.argv) > 1:
 18 |     output_file_path = sys.argv[1]
 19 | else:
 20 |     output_file_path = project_paths.get_annotated_seqs_file_path()
 21 | 
 22 | def get_unique(element, xpath):
 23 | 
 24 |     subelements = element.findall(xpath)
 25 | 
 26 |     if len(subelements) == 0:
 27 |         return None
 28 |     if len(subelements) == 1:
 29 |         return subelements[0]
 30 |     else:
 31 |         raise Exception('%d subelements: %s' % (len(subelements), xpath))
 32 | 
 33 | def parse_uniprot_xml(raw_xml_path):
 34 |     raw = util.read_file(raw_xml_path)
 35 |     fixed_raw = re.sub(r'xmlns="[^"]*"', '', raw)
 36 |     return et.fromstring(fixed_raw)
 37 | 
 38 | def get_proteins_with_cleavage_sites(raw_xml_path):
 39 | 
 40 |     root = parse_uniprot_xml(raw_xml_path)
 41 | 
 42 |     for entry in root.findall('./entry'):
 43 | 
 44 |         accession = entry.findall('./accession')[0].text
 45 |         raw_seq = get_unique(entry, './sequence').text
 46 |         seq = re.sub(r'\s', '', raw_seq)
 47 | 
 48 |         signal_peptide_end = 0
 49 |         cleavage_sites = set()
 50 |         skip_protein = False
 51 | 
 52 |         for feature in entry.findall('./feature'):
 53 | 
 54 |             type = feature.get('type').lower()
 55 | 
 56 |             if type in ['peptide', 'chain', 'propeptide', 'signal peptide']:
 57 | 
 58 |                 try:
 59 |                     begin = int(get_unique(feature, './location/begin').get('position'))
 60 |                 except:
 61 |                     begin = None
 62 | 
 63 |                 try:
 64 |                     end = int(get_unique(feature, './location/end').get('position'))
 65 |                 except:
 66 |                     end = None
 67 | 
 68 |                 if type == 'signal peptide':
 69 |                     if end is None:
 70 |                         print ('%s: no end to signal peptide. We will ignore this protein.' % accession)
 71 |                         skip_protein = True
 72 |                         break
 73 |                     else:
 74 |                         signal_peptide_end = max(signal_peptide_end, end)
 75 |                 else:
 76 | 
 77 |                     if begin is not None:
 78 |                         cleavage_sites.add(begin - 1)
 79 |                         cleavage_sites.add(begin - 2)
 80 | 
 81 |                     if end is not None:
 82 |                         if type == 'propeptide':
 83 |                             cleavage_sites.add(end - 1)
 84 |                         else:
 85 |                             cleavage_sites.add(end)
 86 | 
 87 |         if skip_protein:
 88 |             continue
 89 |             
 90 |         cleavage_sites = set([i for i in cleavage_sites if i >= signal_peptide_end + 3 and i < len(seq) - 3 and seq[i] in 'KR'])
 91 |         cleavage_sites_to_remove = set([i - 1 for i in cleavage_sites]) # If 11, we take only the second
 92 |         cleavage_sites = cleavage_sites.difference(cleavage_sites_to_remove)
 93 | 
 94 |         if cleavage_sites: # we don't want samples with no cleavages at all - it's probably a mistake
 95 |             yield accession, seq, cleavage_sites, signal_peptide_end
 96 | 
 97 | def cleavage_sites_to_mask(seq_length, cleavage_sites):
 98 | 
 99 |     mask = ['0'] * seq_length
100 | 
101 |     for cleavage_site in cleavage_sites:
102 |         mask[cleavage_site] = '1'
103 | 
104 |     return ''.join(mask)
105 |     
106 | def remove_xs(seq, mask):
107 |     
108 |     revised_seq = ''
109 |     revised_mask = ''
110 |     
111 |     for aa, label in zip(seq, mask):
112 |         if aa.lower() != 'x':
113 |             revised_seq += aa
114 |             revised_mask += label
115 |             
116 |     return revised_seq, revised_mask
117 | 
118 | def space_seq(seq, chunk_length = 10):
119 |     return ' '.join(util.split_to_chunks(seq, chunk_length))
120 | 
121 | def write_fasta_like_record(file, accession, seq, mask):
122 |     file.write('>' + accession + '\n')
123 |     file.write(space_seq(seq) + '\n')
124 |     file.write(space_seq(mask) + '\n')
125 |     file.write('\n')
126 | 
127 | if __name__ == '__main__':
128 | 
129 |     output_file = open(output_file_path, 'wb')
130 | 
131 |     try:
132 |         for accession, seq, cleavage_sites, signal_peptide_end in get_proteins_with_cleavage_sites(project_paths.get_raw_data_xml_file_path()):
133 |             mask_to_write = cleavage_sites_to_mask(len(seq), cleavage_sites)[signal_peptide_end:]
134 |             seq_to_write = seq[signal_peptide_end:]
135 |             seq_to_write, mask_to_write = remove_xs(seq_to_write, mask_to_write)
136 |             write_fasta_like_record(output_file, accession, seq_to_write, mask_to_write)
137 |     finally:
138 |         output_file.close()
139 | 
140 |     print 'Done.'
141 | 


--------------------------------------------------------------------------------
/py/cleavepred/extract_windows.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A script to extract the window features for a given dataset.
 3 | Arguments:
 4 | - dataset_name (string): The name of the dataset to extract the window features for.
 5 | - advanced (boolean): Whether to use the extra tracks when extracting the windows, or extracting only the simple sequence-based features.
 6 | - output_file_path (file path, optional): The path to write the output CSV to. If not provided, will update the project's relevant file.
 7 | '''
 8 | 
 9 | import sys
10 | 
11 | import asap
12 | 
13 | from cleavepred import util
14 | from cleavepred import project_paths
15 | from cleavepred.common import window_extraction_params
16 | 
17 | ### Parse arguments ###
18 | 
19 | project_paths.dataset_name = sys.argv[1]
20 | advanced = util.parse_bool(sys.argv[2])
21 | 
22 | if len(sys.argv) > 3:
23 |     output_file_path = sys.argv[3]
24 | else:
25 |     output_file_path = project_paths.get_window_features_file_path(advanced)
26 | 
27 | ### Extract the windows ###
28 | 
29 | annotated_seqs_file = None
30 | seqs_filtration_file = None
31 | csv_output_file = None
32 | extra_tracks_files = {}
33 | 
34 | def open_files():
35 | 
36 |     global annotated_seqs_file, seqs_filtration_file, csv_output_file, extra_tracks_files
37 | 
38 |     annotated_seqs_file = open(project_paths.get_annotated_seqs_file_path(), 'rb')
39 |     seqs_filtration_file = open(project_paths.get_filtered_seqs_file_path(), 'rb')
40 |     csv_output_file = open(output_file_path, 'wb')
41 | 
42 |     if advanced:
43 |         for track_name, track_file_path in project_paths.get_track_file_paths().items():
44 |             extra_tracks_files[track_name] = open(track_file_path, 'rb')
45 | 
46 | def close_files():
47 |     util.close_files([annotated_seqs_file, seqs_filtration_file, csv_output_file])
48 |     util.close_files(extra_tracks_files.values())
49 | 
50 | def extract_windows():
51 |     asap.extract_windows_from_file(annotated_seqs_file, extract_annotations = True, seqs_filtration_file = seqs_filtration_file, \
52 |             extra_tracks_files = extra_tracks_files, csv_output_file = csv_output_file, window_extraction_params = window_extraction_params)
53 | 
54 | if __name__ == '__main__':
55 |     try:
56 |         open_files()
57 |         extract_windows()
58 |     finally:
59 |         close_files()
60 | 


--------------------------------------------------------------------------------
/py/cleavepred/get_disopred.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Script to run disopred3 on a multifasta file, then parse and collate the output.
  3 | Steps:
  4 | 1. Extract each fasta from a copy of original multifasta file. (Stored in a seperate directory)
  5 | 2. (Opt?) Remove original multifasta file.
  6 | 3. Run disopred on each fasta.
  7 | 4. i. gather the data/output for all the fastas. (output file name is the fasta's name).
  8 | 4. ii. Clean the format (for each file), and save to file: "output_feat.diso", (in a format like lf/ss/acc)
  9 | 4.iii. Save this output to the external features folder.
 10 | 5. Import from the standard pipeline (config) / not here.
 11 | '''
 12 | 
 13 | import os
 14 | import subprocess
 15 | from subprocess import call
 16 | import sys
 17 | import csv
 18 | import glob
 19 | import pandas as pd
 20 | # from Bio.Seq import Seq
 21 | # from Bio.SeqRecord import SeqRecord
 22 | from Bio import SeqIO
 23 | 
 24 | 
 25 | #Location of the directory containing "run_disopred.pl"
 26 | DISOPRED_LOCATION = r'/cs/stud/danofer/Desktop/danofer/Software/DISOPRED'
 27 | DISO_PROG = 'run_disopred.pl'
 28 | 
 29 | # FASTA_LOCATION = '/cs/prt3/danofer/CleavePred/Dataset/Neuropred/V3/'
 30 | # FASTA_TARGET = 'NeuroPred_nNames_70ID.fasta'
 31 | 
 32 | #NEW:
 33 | # FASTA_LOCATION = '/cs/prt3/danofer/CleavePred/data/uniprot/UniProtTestSeqs/D3/'
 34 | FASTA_LOCATION = '/a/fr-05/vol/protein/danofer/imac/Desktop/DFTP/'
 35 | 
 36 | # FASTA_TARGET = 'D3_TEST_50_FILT.fasta'
 37 | FASTA_TARGET = 'D3_TEST_50_FILT_mod.fasta'
 38 | 
 39 | SPLIT_FASTAS_DIR = 'splitfasta1'
 40 | split_fastas_dir = os.path.join(FASTA_LOCATION,SPLIT_FASTAS_DIR)
 41 | 
 42 | 
 43 | file_in = os.path.join(FASTA_LOCATION,FASTA_TARGET)
 44 | 
 45 | ALT_DIR_OUTPUT = os.path.join(FASTA_LOCATION,'altFiltered')
 46 | 
 47 | 
 48 | 
 49 | def parse_DISOPRED():
 50 |     '''
 51 |     Parse all pbdat files in a dir, (Output of DISOPRED)
 52 |     save their output in a format like hssp/ss/acc
 53 |     '''
 54 |     # os.chdir(os.path.dirname(split_fastas_dir))
 55 |     os.chdir(split_fastas_dir)
 56 | 
 57 |     files = glob.glob('*.pbdat')
 58 |     print('amount of .pbdat files:',len(files))
 59 |     # output_file = open('/cs/prt3/danofer/CleavePred/Dataset/Neuropred/V2_ExternalFeat_NP/output_feat.DISO', 'w')
 60 |     # output_file = open(FASTA_LOCATION+'output_feat.DISO', 'w')
 61 |     output_file = open(os.path.join(FASTA_LOCATION,'output_feat.DISO'), 'w')
 62 | 
 63 |     # print('Joined results will be saved to ',(FASTA_LOCATION+'output_feat.DISO'))
 64 |     print('Joined results will be saved to ',os.path.join(FASTA_LOCATION+'output_feat.DISO'))
 65 | 
 66 |     for f in files:
 67 |         accession = str('>'+os.path.splitext(os.path.basename(f))[0])
 68 |         f=open(f)
 69 |         lines = f.readlines()
 70 |         seq = []
 71 |         diso_state=[]
 72 |         for line in lines:
 73 |             parts = line.strip(' \n \t').split(' ')
 74 |             if parts[0] != '#':
 75 |                 seq += parts[1]
 76 |                 diso_state += parts[2]
 77 |         seq = ''.join(seq)
 78 |         diso_state = ''.join(diso_state)
 79 |         output_file.write(accession+'\n')
 80 |         output_file.write((seq)+'\n')
 81 |         output_file.write((diso_state)+'\n')
 82 |         # print(accession)
 83 |     print("Saved to output_feat.DISO")
 84 |     output_file.close()
 85 | 
 86 | 
 87 | def split_fasta(filter = False):
 88 |     '''
 89 |     https://py4bio.wordpress.com/2009/07/22/split_fasta_file/
 90 |     This script takes a fasta file and split it in one file per fasta entry.
 91 |     It saves the outputs fastas in a new directory
 92 |     '''
 93 |     os.chdir(os.path.dirname(FASTA_LOCATION))
 94 |     print("Current working Directory:",os.getcwd())
 95 |     filter_fastas = []
 96 |     file_in = os.path.join(FASTA_LOCATION,FASTA_TARGET)
 97 |     split_output_dir = SPLIT_FASTAS_DIR
 98 | 
 99 |     if filter == True:
100 |         filter_fastas = filter_fasta_queries(fastas_dir=split_fastas_dir)
101 |         split_output_dir = ALT_DIR_OUTPUT
102 | 
103 |     if not os.path.exists(split_output_dir):
104 |         os.makedirs(split_output_dir)
105 | 
106 |     os.chdir(split_output_dir)
107 | 
108 |     i = 0
109 |     read_counts = 0
110 |     for record in SeqIO.parse(open(file_in), "fasta"):
111 |         read_counts += 1
112 |         if not (record.id in filter_fastas):
113 |             # f_out = os.path.join(split_output_dir,record.id+'.fasta')
114 |             f_out = (record.id+'.fasta')
115 |             # f_out =(split_output_dir+record.id+'.fasta')
116 |             print('save to:',f_out)
117 |             # SeqIO.write([record],open(f_out,'w'),"fasta")
118 |             with open(f_out, "w") as handle:
119 |                 SeqIO.write([record], handle, "fasta")
120 |             i += 1
121 | 
122 |     print(read_counts," = Fastas in the original multifasta-file")
123 |     print(i," = # Splitted Fasta files made")
124 | 
125 | 
126 | def call_DISOPRED(split_fastas_list):
127 |     os.chdir(os.path.dirname(DISOPRED_LOCATION))
128 |     print(os.getcwd())
129 |     print("In DisoPred folder")
130 |     for i, fasta in enumerate(split_fastas_list):
131 |         print(i)
132 |         print(fasta)
133 |         subprocess.call([DISOPRED_LOCATION+'/'+DISO_PROG,fasta])
134 |         print()
135 | 
136 | def filter_fasta_queries(fastas_dir=split_fastas_dir):
137 |     '''
138 |     If Disopred job was interrupted in midway -
139 |     This lets us continue (in a new dir) for
140 |     only those sequences that do not have
141 |     Disopred predictions = *.pbdat
142 |     '''
143 |     os.chdir(fastas_dir)
144 |     files = glob.glob('*.pbdat')
145 |     print('# .pbdat files = fastas that were processed succesfully, previously:',len(files))
146 |     ids = [os.path.splitext(os.path.basename(f))[0] for f in files]
147 |     print('len(ids)',len(ids))
148 |     return ids
149 | 
150 | def find_missing():
151 |     '''
152 |     Disopred seems to "miss" some fastas.
153 |     This helps us find them, assuming the
154 |     disopred output is in the same dirr as
155 |     the (split / "filtered") our fasta candidates
156 |     '''
157 |     # os.chdir('/cs/prt3/danofer/CleavePred/Dataset/Uniprot/altFiltered')
158 |     os.chdir(split_fastas_dir)
159 | 
160 |     fastas = [f for f in os.listdir('.') if f.endswith('.fasta')]
161 |     ids = [os.path.splitext(os.path.basename(f))[0] for f in fastas]
162 |     print('# Fastas present:',str(len(ids)))
163 | 
164 |     preds = fastas = [f for f in os.listdir('.') if f.endswith('.pbdat')]
165 |     dis_ids = [os.path.splitext(os.path.basename(f))[0] for f in preds]
166 |     print('# Predictions present:',str(len(dis_ids)))
167 | 
168 |     missing = [a for a in ids if a not in dis_ids]
169 |     print('Missing IDs:')
170 |     print(missing)
171 |     return missing
172 | 
173 | 
174 | if __name__ == '__main__':
175 | 
176 |     # fe = filter_fasta_queries()
177 |     # print('len filter_fasta_queries()',len(fe))
178 |     # split_fasta(filter = True)
179 | 
180 |     SPLIT_F = False
181 |     CALL_DISO = False
182 |     USE_FILTERED = False
183 | 
184 |     RESUME_DISO_PARTIAL = True
185 | 
186 |     if SPLIT_F == True:
187 |         split_fasta()
188 | 
189 |     # split_fastas_list = glob.glob(split_fastas_dir+'/*.fasta')
190 |     split_fastas_list = glob.glob(os.path.join(split_fastas_dir+'/*.fasta'))
191 |     print('\n split_fastas_list in dir: ',len(split_fastas_list))
192 | 
193 |     if USE_FILTERED == True:
194 |         # filt_split_fastas_list = glob.glob(ALT_DIR_OUTPUT+'/*.fasta') #ORIGINAL
195 |         filt_split_fastas_list = glob.glob(split_fastas_dir+'/*.fasta')  #CHANGED #D
196 |         print('\n filt_split_fastas_list ',len(filt_split_fastas_list))
197 |         print(filt_split_fastas_list[0])
198 |         print(filt_split_fastas_list[1])
199 |         call_DISOPRED(filt_split_fastas_list)
200 | 
201 | 
202 |     if CALL_DISO == True:
203 |         call_DISOPRED(split_fastas_list)
204 |         print("\n DISOPRED DONE! \n")
205 | 
206 |     parse_DISOPRED()
207 | 
208 |     missing_ID = find_missing()
209 |     if RESUME_DISO_PARTIAL:
210 |         # fe = filter_fasta_queries()
211 |         print("\n Calling disopred on missing IDs")
212 |         print("Missing: \n",missing_ID)
213 |         call_DISOPRED(split_fastas_list)
214 |         print("\n DISOPRED DONE! \n")
215 | 
216 | 
217 | 


--------------------------------------------------------------------------------
/py/cleavepred/produce_auto_files.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A master script to produce all the required auto-generated files:
 3 | 1. Uniprot's annotated seqs .lf file
 4 | 2. CSVs of the windows with features
 5 | 3. Pickle dump files of trained predictors.
 6 | Just execute this script as it is, with no arguments. Make sure to be in the py/ directory when running it.
 7 | '''
 8 | 
 9 | import sys
10 | import logging
11 | 
12 | # For logger initialization
13 | import asap
14 | 
15 | LOGGING_PREFIX = '********** '
16 | 
17 | logger = logging.getLogger('EXEC')
18 | 
19 | # Uniprot's .lf file
20 | logger.info(LOGGING_PREFIX + 'Running extract_uniprot_annotated_seqs_from_xml.py')
21 | sys.argv = ['']
22 | execfile('cleavepred/extract_uniprot_annotated_seqs_from_xml.py')
23 | 
24 | # Create CSVs
25 | for dataset in ['neuropred', 'uniprot']:
26 |     for advanced in ['false', 'true']:
27 |         logger.info(LOGGING_PREFIX + 'Running extract_windows.py with dataset="%s" and advanced="%s"' % (dataset, advanced))
28 |         sys.argv = ['', dataset, advanced]
29 |         execfile('cleavepred/extract_windows.py')
30 | 
31 | # Create dump files
32 | for advanced in ['false', 'true']:
33 |     logger.info(LOGGING_PREFIX + 'Running train_classifier.py with advanced="%s"' % advanced)
34 |     sys.argv = ['', advanced, 'auto']
35 |     execfile('cleavepred/train_classifier.py')
36 |     
37 | logger.info(LOGGING_PREFIX + 'Finished executing scripts. All auto-generated files should now be updated.')
38 | 


--------------------------------------------------------------------------------
/py/cleavepred/project_paths.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from .common import AVAILABLE_TRACKS
 4 | 
 5 | # A global variable to update whenever looking to work on another dataset
 6 | dataset_name = None
 7 | 
 8 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 9 | DATA_DIR = os.path.join(BASE_DIR, 'data/cleavage')
10 | 
11 | def get_dataset_dir():
12 |     return os.path.join(DATA_DIR, '%s_dataset' % dataset_name)
13 | 
14 | def get_peptide_predictor_dump_file_path(advanced):
15 |     if advanced:
16 |         return os.path.join(DATA_DIR, 'advanced_peptide_predictor.pkl')
17 |     else:
18 |         return os.path.join(DATA_DIR, 'simple_peptide_predictor.pkl')
19 | 
20 | def get_window_features_file_path(advanced):
21 |     if advanced:
22 |         return os.path.join(get_dataset_dir(), 'window_advanced_features.csv')
23 |     else:
24 |         return os.path.join(get_dataset_dir(), 'window_simple_features.csv')
25 | 
26 | def get_raw_data_xml_file_path():
27 |     # Relevant only for when dataset_name = 'uniprot'
28 |     return os.path.join(get_dataset_dir(), 'raw_data.xml')
29 | 
30 | def get_annotated_seqs_file_path():
31 |     return os.path.join(get_dataset_dir(), 'annotated_seqs.lf')
32 | 
33 | def get_filtered_seqs_file_path():
34 |     return os.path.join(get_dataset_dir(), 'filtered_seqs.fasta')
35 | 
36 | def get_track_file_paths():
37 |     return {track: os.path.join(get_dataset_dir(), 'extra_tracks/seqs.%s' % track) for track  in AVAILABLE_TRACKS}


--------------------------------------------------------------------------------
/py/cleavepred/test_classifier.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A script to test a classifier that was trained on NeuroPred's dataset against UniProt's dataset
 3 | Arguments:
 4 | - advanced (boolean): Whether to test the advanced or simple classifier.
 5 | '''
 6 | 
 7 | import sys
 8 | import pickle
 9 | import logging
10 | 
11 | import pandas as pd
12 | 
13 | from cleavepred import util
14 | from cleavepred import project_paths
15 | 
16 | ### Parse arguments ###
17 | 
18 | advanced = util.parse_bool(sys.argv[1])
19 | 
20 | ### Configuration ###
21 | 
22 | # We use UniProt's dataset for testing our predictors.
23 | project_paths.dataset_name = 'uniprot'
24 | 
25 | logger = logging.getLogger('TEST')
26 | 
27 | ### Test the classifier ###
28 | 
29 | predictor_dump_file = None
30 | windows_file = None
31 | 
32 | def open_files():
33 |     global predictor_dump_file, windows_file
34 |     predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(advanced), 'rb')
35 |     windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
36 |     
37 | def close_files():
38 |     util.close_files([predictor_dump_file, windows_file])
39 | 
40 | def test_classifier():
41 |     peptide_predictor = pickle.load(predictor_dump_file)
42 |     windows_data_frame = pd.read_csv(windows_file)
43 |     score, roc, sensitivity, precision, specificity, cm = peptide_predictor.window_classifier.test_performance(windows_data_frame, \
44 |             drop_only_almost_positives = True)
45 |     logger.info('score = %f, roc = %f, sensitivity = %f, precision = %f, specificity = %f' % (score, roc, sensitivity, precision, specificity))
46 |     logger.info('Confusion matrix:' + '\n' + str(cm))
47 |     
48 | if __name__ == '__main__':
49 |     try:
50 |         open_files()
51 |         test_classifier()
52 |     finally:
53 |         close_files()
54 | 


--------------------------------------------------------------------------------
/py/cleavepred/train_classifier.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A script to train a classifier from NueroPred's dataset. Will log performance to stdout.
 3 | Arguments:
 4 | - advanced (boolean): Whether to use the advanced features (extracted with the extra tracks), or just the simple ones.
 5 | - predictor_dump_path (file path, optional): The path to dump the trained PeptidePredictor. If not provided, will not dump it at all. If provided
 6 | with the keyword "auto", will dump it to the project's relevant file.
 7 | '''
 8 | 
 9 | import sys
10 | import pickle
11 | 
12 | import pandas as pd
13 | 
14 | from sklearn.feature_selection import VarianceThreshold, SelectFdr
15 | from sklearn.linear_model import LogisticRegressionCV
16 | from sklearn.ensemble import RandomForestClassifier
17 | from sklearn.svm import SVC
18 | from mlxtend.classifier import EnsembleClassifier
19 | 
20 | from asap import train_window_classifier, PeptidePredictor, FeatureSelectionPipeline
21 | 
22 | from cleavepred import util
23 | from cleavepred import project_paths
24 | from cleavepred.common import window_extraction_params
25 | 
26 | ### Parse arguments ###
27 | 
28 | advanced = util.parse_bool(sys.argv[1])
29 | 
30 | if len(sys.argv) > 2:
31 |     if sys.argv[2].lower() == 'auto':
32 |         predictor_dump_path = project_paths.get_peptide_predictor_dump_file_path(advanced)
33 |     else:
34 |         predictor_dump_path = sys.argv[2]
35 | else:
36 |     predictor_dump_path = None
37 | 
38 | ### Configuration ###
39 | 
40 | # We use NeuroPred's dataset for training/validation of our predictors.
41 | project_paths.dataset_name = 'neuropred'
42 | 
43 | ensemble_classifiers = [
44 |     LogisticRegressionCV(Cs = 16, n_jobs = -2, class_weight = 'auto'),
45 |     RandomForestClassifier(n_estimators = 250, bootstrap = True, criterion = 'gini', n_jobs = -2, class_weight = 'auto'),
46 |     SVC(kernel = 'rbf', C = 3.798, probability = True, cache_size = 2400, class_weight = 'auto'),
47 | ]
48 | classifiers = [EnsembleClassifier(clfs = ensemble_classifiers, voting = 'hard')]
49 | 
50 | feature_selector = FeatureSelectionPipeline([
51 |     VarianceThreshold(0.03),
52 |     SelectFdr(alpha = 0.1),
53 | ])
54 | 
55 | ### Train the classifier and dump the predictor ###
56 | 
57 | windows_file = None
58 | predictor_dump_file = None
59 | 
60 | def open_files():
61 |     global windows_file, predictor_dump_file
62 |     windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
63 |     predictor_dump_file = util.open_file(predictor_dump_path, 'wb')
64 | 
65 | def close_files():
66 |     util.close_files([windows_file, predictor_dump_file])
67 | 
68 | def dump_predictor(predictor):
69 |     if predictor_dump_file is not None:
70 |         pickle.dump(predictor, predictor_dump_file)
71 | 
72 | def train_classifier():
73 |     windows_data_frame = pd.read_csv(windows_file)
74 |     window_classifier, classifier_performance = train_window_classifier(windows_data_frame, classifiers = classifiers, \
75 |             drop_only_almost_positives = True, feature_selector = feature_selector, n_folds = 10)
76 |     peptide_predictor = PeptidePredictor(window_classifier, window_extraction_params = window_extraction_params)
77 |     dump_predictor(peptide_predictor)
78 | 
79 | if __name__ == '__main__':
80 |     try:
81 |         open_files()
82 |         train_classifier()
83 |     finally:
84 |         close_files()
85 | 


--------------------------------------------------------------------------------
/py/cleavepred/util.py:
--------------------------------------------------------------------------------
 1 | def split_to_chunks(array, chunk_size):
 2 |     for i in xrange(0, len(array), chunk_size):
 3 |         yield array[i:(i + chunk_size)]
 4 | 
 5 | def parse_bool(raw_value):
 6 |     if raw_value.lower() in ['true', 'yes', '1']:
 7 |         return True
 8 |     elif raw_value.lower() in ['false', 'no', '0']:
 9 |         return False
10 |     else:
11 |         raise Exception('Unrecognized boolean value: ' + str(raw_value))
12 |         
13 | def open_file(path, *args, **argv):
14 |     if path is None:
15 |         return None
16 |     else:
17 |         return open(path, *args, **argv)
18 |         
19 | def read_file(path):
20 |     
21 |     f = open(path, 'rb')
22 |     
23 |     try:
24 |         return f.read()
25 |     finally:
26 |         f.close()
27 | 
28 | def close_file(file):
29 |     if file is not None:
30 |         file.close()
31 |         
32 | def close_files(files):
33 |     for file in files:
34 |         close_file(file)


--------------------------------------------------------------------------------
/py/deeppred/__init__.py:
--------------------------------------------------------------------------------
1 | from .api import simple_cleavage_predictor, advanced_cleavage_predictor


--------------------------------------------------------------------------------
/py/deeppred/api.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | 
 3 | from . import project_paths
 4 | 
 5 | class CleavagePredictor(object):
 6 | 
 7 |     '''
 8 |     A predictor trained to predict the cleavage of peptides.
 9 |     There should be only two instances of this class:
10 |     1. simple_cleavage_predictor - Uses only the basic features derived from the amino-acid sequence of peptides.
11 |     2. advanced_cleavage_predictor - Used also features derived from external tools (ss, acc, disorder and pssm).
12 |     '''
13 | 
14 |     def __init__(self, advanced):
15 |         self.advanced = advanced
16 |         self._peptide_predictor = None
17 | 
18 |     def predict(self, seq, extra_tracks_data = {}, proba = False):
19 |         '''
20 |         Predicts cleavage for a given peptide.
21 |         @param seq (string):
22 |             The amino-acid sequence of the peptide to predict the annotations for, given in a 20 amino-acid alphabet.
23 |         @param extra_tracks_data (dict, empty by default):
24 |             A dictionary for providing extra tracks of the given peptide. If using the simple predictor (i.e. advanced = False), it can
25 |             be left empty. If using the advanced predictor (i.e. advanced = True), must receive all tracks (i.e. ss, acc, disorder
26 |             and pssm). The given dictionary should map from track names to their sequence.
27 |         @param proba (default False):
28 |             Whether to return mask of predicted probabilities (floats from between 0 to 1) or binary labels (0s or 1s).
29 |         @return:
30 |             A tuple composed of:
31 |             1. cleavage_mask - If proba = False, it will be a binary string (0's and 1's) representing whether each residue is a cleavage
32 |             site (1) or not (0). If proba = True, it will be a list of floats (between 0 to 1) representing the probability of each residue
33 |             to be a cleavage site. Either way, the length of the returned string/list will correspond to the length of the provided peptide
34 |             sequence.
35 |             2. cleavage_products - A list of strings, each representing the amino-acid sequence of a predicted cleavage product.
36 |         '''
37 |         cleavage_mask = self.get_peptide_predictor().predict_annotations(seq, extra_tracks_data = extra_tracks_data, proba = proba)
38 |         cleavage_products = _get_cleavage_products(seq, cleavage_mask)
39 |         return cleavage_mask, cleavage_products
40 | 
41 |     def get_peptide_predictor(self):
42 | 
43 |         '''
44 |         @return:
45 |             The PeptidePredictor object associated with this cleavage predictor.
46 |         '''
47 | 
48 |         if self._peptide_predictor is None:
49 |             self._peptide_predictor = self._load_peptide_predictor()
50 | 
51 |         return self._peptide_predictor
52 | 
53 |     def _load_peptide_predictor(self):
54 | 
55 |         predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(self.advanced), 'rb')
56 | 
57 |         try:
58 |             return pickle.load(predictor_dump_file)
59 |         finally:
60 |             predictor_dump_file.close()
61 | 
62 | simple_cleavage_predictor = CleavagePredictor(False)
63 | advanced_cleavage_predictor = CleavagePredictor(True)
64 | 
65 | def _get_cleavage_products(seq, cleavage_mask):
66 | 
67 |     products = []
68 |     current_product = ''
69 | 
70 |     for i in range(len(seq)):
71 |     
72 |         current_product += seq[i]
73 |     
74 |         # When we have continuous positive cleavage sites, we consider only the most C-terminus one.
75 |         if _is_cleavage(cleavage_mask[i]) and (i >= len(seq) - 1 or not _is_cleavage(cleavage_mask[i + 1])):
76 |             _add_if_not_empty(products, current_product)
77 |             current_product = ''
78 | 
79 |     _add_if_not_empty(products, current_product)
80 |     return products
81 |     
82 | def _is_cleavage(label):
83 |     if isinstance(label, str):
84 |         return label == '1'
85 |     elif isinstance(label, int) or isinstance(label, float):
86 |         return int(round(label)) == 1
87 |     else:
88 |         raise Exception('Unknown label type: ' + str(type(label)))
89 | 
90 | def _add_if_not_empty(array, string):
91 |     if len(string) > 0:
92 |         array += [string]
93 | 


--------------------------------------------------------------------------------
/py/deeppred/check_top_features.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Checks the top features predicted for a given dataset.
 3 | Arguments:
 4 | - dataset_name (string): The name of the dataset to use.
 5 | - advanced (boolean): Whether to use all or only some features, corresponding to simple and advanced classifiers respectively.
 6 | '''
 7 | 
 8 | import sys
 9 | import logging
10 | 
11 | import pandas as pd
12 | 
13 | from asap import get_top_features
14 | 
15 | from cleavepred import util
16 | from cleavepred import project_paths
17 | 
18 | logger = logging.getLogger('FEATURES')
19 | 
20 | ### Parse arguments ###
21 | 
22 | project_paths.dataset_name = sys.argv[1].lower()
23 | advanced = util.parse_bool(sys.argv[2])
24 | 
25 | ### Get top features ###
26 | 
27 | windows_file = None
28 | 
29 | def open_files():
30 |     global windows_file
31 |     windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
32 |     
33 | def close_files():
34 |     util.close_files([windows_file])
35 |     
36 | def get_advanced_label():
37 |     if advanced:
38 |         return 'advanced'
39 |     else:
40 |         return 'simple'
41 |     
42 | def check_top_features():
43 |     windows_data_frame = pd.read_csv(windows_file)
44 |     logger.info('Checking top features over %s dataset with %s features...' % (project_paths.dataset_name, get_advanced_label()))
45 |     top_features = get_top_features(windows_data_frame, drop_only_almost_positives = True)
46 |     logger.info('Top features: ' + ', '.join(top_features))
47 |     
48 | if __name__ == '__main__':
49 |     try:
50 |         open_files()
51 |         check_top_features()
52 |     finally:
53 |         close_files()
54 | 


--------------------------------------------------------------------------------
/py/deeppred/common.py:
--------------------------------------------------------------------------------
 1 | from asap import FEATURE_KEY_OPTIONS, WindowExtractionParams
 2 | from asap.config import POSITIVE_AMINO_ACIDS
 3 | 
 4 | AVAILABLE_TRACKS = [
 5 |     'ss',
 6 |     'acc',
 7 |     'disorder',
 8 |     'pssm',
 9 | ]
10 | 
11 | # Here we prefer using 'aa_reduced' over 'aa'. We give up on some other features.
12 | USED_FEATURES = set(FEATURE_KEY_OPTIONS).difference(['aa', 'accum_charge_left', 'accum_charge_right', 'accum_pos_charge_left', 'accum_pos_charge_right'])
13 | 
14 | def windows_filter(window):
15 |     '''
16 |     We consider only windows with a positively charged amino-acid (i.e. K/R) in the hot index (only then it can be a
17 |     cleavage candidate).
18 |     '''
19 |     return window.get_aa_seq()[window_extraction_params.window_hot_index] in POSITIVE_AMINO_ACIDS
20 | 
21 | window_extraction_params = WindowExtractionParams(window_prefix = 11, window_suffix = 8, neighbourhood_prefix = 5, \
22 |         neighbourhood_suffix = 5, windows_filter = windows_filter, feature_keys = USED_FEATURES)
23 | 


--------------------------------------------------------------------------------
/py/deeppred/extract_uniprot_annotated_seqs_from_xml.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Extract a .lf file, containing sequences and cleavage annotation masks, out of a UniProt's XML file.
  3 | Arguments:
  4 | - output_file_path (file path, optional): The path to write the output .lf file to. If not provided, will update the project's relevant file.
  5 | '''
  6 | 
  7 | import sys
  8 | import re
  9 | import xml.etree.ElementTree as et
 10 | from StringIO import StringIO
 11 | 
 12 | from cleavepred import util
 13 | from cleavepred import project_paths
 14 | 
 15 | project_paths.dataset_name = 'uniprot'
 16 | 
 17 | if len(sys.argv) > 1:
 18 |     output_file_path = sys.argv[1]
 19 | else:
 20 |     output_file_path = project_paths.get_annotated_seqs_file_path()
 21 | 
 22 | def get_unique(element, xpath):
 23 | 
 24 |     subelements = element.findall(xpath)
 25 | 
 26 |     if len(subelements) == 0:
 27 |         return None
 28 |     if len(subelements) == 1:
 29 |         return subelements[0]
 30 |     else:
 31 |         raise Exception('%d subelements: %s' % (len(subelements), xpath))
 32 | 
 33 | def parse_uniprot_xml(raw_xml_path):
 34 |     raw = util.read_file(raw_xml_path)
 35 |     fixed_raw = re.sub(r'xmlns="[^"]*"', '', raw)
 36 |     return et.fromstring(fixed_raw)
 37 | 
 38 | def get_proteins_with_cleavage_sites(raw_xml_path):
 39 | 
 40 |     root = parse_uniprot_xml(raw_xml_path)
 41 | 
 42 |     for entry in root.findall('./entry'):
 43 | 
 44 |         accession = entry.findall('./accession')[0].text
 45 |         raw_seq = get_unique(entry, './sequence').text
 46 |         seq = re.sub(r'\s', '', raw_seq)
 47 | 
 48 |         signal_peptide_end = 0
 49 |         cleavage_sites = set()
 50 |         skip_protein = False
 51 | 
 52 |         for feature in entry.findall('./feature'):
 53 | 
 54 |             type = feature.get('type').lower()
 55 | 
 56 |             if type in ['peptide', 'chain', 'propeptide', 'signal peptide']:
 57 | 
 58 |                 try:
 59 |                     begin = int(get_unique(feature, './location/begin').get('position'))
 60 |                 except:
 61 |                     begin = None
 62 | 
 63 |                 try:
 64 |                     end = int(get_unique(feature, './location/end').get('position'))
 65 |                 except:
 66 |                     end = None
 67 | 
 68 |                 if type == 'signal peptide':
 69 |                     if end is None:
 70 |                         print ('%s: no end to signal peptide. We will ignore this protein.' % accession)
 71 |                         skip_protein = True
 72 |                         break
 73 |                     else:
 74 |                         signal_peptide_end = max(signal_peptide_end, end)
 75 |                 else:
 76 | 
 77 |                     if begin is not None:
 78 |                         cleavage_sites.add(begin - 1)
 79 |                         cleavage_sites.add(begin - 2)
 80 | 
 81 |                     if end is not None:
 82 |                         if type == 'propeptide':
 83 |                             cleavage_sites.add(end - 1)
 84 |                         else:
 85 |                             cleavage_sites.add(end)
 86 | 
 87 |         if skip_protein:
 88 |             continue
 89 |             
 90 |         cleavage_sites = set([i for i in cleavage_sites if i >= signal_peptide_end + 3 and i < len(seq) - 3 and seq[i] in 'KR'])
 91 |         cleavage_sites_to_remove = set([i - 1 for i in cleavage_sites]) # If 11, we take only the second
 92 |         cleavage_sites = cleavage_sites.difference(cleavage_sites_to_remove)
 93 | 
 94 |         if cleavage_sites: # we don't want samples with no cleavages at all - it's probably a mistake
 95 |             yield accession, seq, cleavage_sites, signal_peptide_end
 96 | 
 97 | def cleavage_sites_to_mask(seq_length, cleavage_sites):
 98 | 
 99 |     mask = ['0'] * seq_length
100 | 
101 |     for cleavage_site in cleavage_sites:
102 |         mask[cleavage_site] = '1'
103 | 
104 |     return ''.join(mask)
105 |     
106 | def remove_xs(seq, mask):
107 |     
108 |     revised_seq = ''
109 |     revised_mask = ''
110 |     
111 |     for aa, label in zip(seq, mask):
112 |         if aa.lower() != 'x':
113 |             revised_seq += aa
114 |             revised_mask += label
115 |             
116 |     return revised_seq, revised_mask
117 | 
118 | def space_seq(seq, chunk_length = 10):
119 |     return ' '.join(util.split_to_chunks(seq, chunk_length))
120 | 
121 | def write_fasta_like_record(file, accession, seq, mask):
122 |     file.write('>' + accession + '\n')
123 |     file.write(space_seq(seq) + '\n')
124 |     file.write(space_seq(mask) + '\n')
125 |     file.write('\n')
126 | 
127 | if __name__ == '__main__':
128 | 
129 |     output_file = open(output_file_path, 'wb')
130 | 
131 |     try:
132 |         for accession, seq, cleavage_sites, signal_peptide_end in get_proteins_with_cleavage_sites(project_paths.get_raw_data_xml_file_path()):
133 |             mask_to_write = cleavage_sites_to_mask(len(seq), cleavage_sites)[signal_peptide_end:]
134 |             seq_to_write = seq[signal_peptide_end:]
135 |             seq_to_write, mask_to_write = remove_xs(seq_to_write, mask_to_write)
136 |             write_fasta_like_record(output_file, accession, seq_to_write, mask_to_write)
137 |     finally:
138 |         output_file.close()
139 | 
140 |     print 'Done.'
141 | 


--------------------------------------------------------------------------------
/py/deeppred/extract_windows.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A script to extract the window features for a given dataset.
 3 | Arguments:
 4 | - dataset_name (string): The name of the dataset to extract the window features for.
 5 | - advanced (boolean): Whether to use the extra tracks when extracting the windows, or extracting only the simple sequence-based features.
 6 | - output_file_path (file path, optional): The path to write the output CSV to. If not provided, will update the project's relevant file.
 7 | '''
 8 | 
 9 | import sys
10 | 
11 | import asap
12 | 
13 | from cleavepred import util
14 | from cleavepred import project_paths
15 | from cleavepred.common import window_extraction_params
16 | 
17 | ### Parse arguments ###
18 | 
19 | project_paths.dataset_name = sys.argv[1]
20 | advanced = util.parse_bool(sys.argv[2])
21 | 
22 | if len(sys.argv) > 3:
23 |     output_file_path = sys.argv[3]
24 | else:
25 |     output_file_path = project_paths.get_window_features_file_path(advanced)
26 | 
27 | ### Extract the windows ###
28 | 
29 | annotated_seqs_file = None
30 | seqs_filtration_file = None
31 | csv_output_file = None
32 | extra_tracks_files = {}
33 | 
34 | def open_files():
35 | 
36 |     global annotated_seqs_file, seqs_filtration_file, csv_output_file, extra_tracks_files
37 | 
38 |     annotated_seqs_file = open(project_paths.get_annotated_seqs_file_path(), 'rb')
39 |     seqs_filtration_file = open(project_paths.get_filtered_seqs_file_path(), 'rb')
40 |     csv_output_file = open(output_file_path, 'wb')
41 | 
42 |     if advanced:
43 |         for track_name, track_file_path in project_paths.get_track_file_paths().items():
44 |             extra_tracks_files[track_name] = open(track_file_path, 'rb')
45 | 
46 | def close_files():
47 |     util.close_files([annotated_seqs_file, seqs_filtration_file, csv_output_file])
48 |     util.close_files(extra_tracks_files.values())
49 | 
50 | def extract_windows():
51 |     asap.extract_windows_from_file(annotated_seqs_file, extract_annotations = True, seqs_filtration_file = seqs_filtration_file, \
52 |             extra_tracks_files = extra_tracks_files, csv_output_file = csv_output_file, window_extraction_params = window_extraction_params)
53 | 
54 | if __name__ == '__main__':
55 |     try:
56 |         open_files()
57 |         extract_windows()
58 |     finally:
59 |         close_files()
60 | 


--------------------------------------------------------------------------------
/py/deeppred/get_disopred.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Script to run disopred3 on a multifasta file, then parse and collate the output.
  3 | Steps:
  4 | 1. Extract each fasta from a copy of original multifasta file. (Stored in a seperate directory)
  5 | 2. (Opt?) Remove original multifasta file.
  6 | 3. Run disopred on each fasta.
  7 | 4. i. gather the data/output for all the fastas. (output file name is the fasta's name).
  8 | 4. ii. Clean the format (for each file), and save to file: "output_feat.diso", (in a format like lf/ss/acc)
  9 | 4.iii. Save this output to the external features folder.
 10 | 5. Import from the standard pipeline (config) / not here.
 11 | '''
 12 | 
 13 | import os
 14 | import subprocess
 15 | from subprocess import call
 16 | import sys
 17 | import csv
 18 | import glob
 19 | import pandas as pd
 20 | # from Bio.Seq import Seq
 21 | # from Bio.SeqRecord import SeqRecord
 22 | from Bio import SeqIO
 23 | 
 24 | 
 25 | #Location of the directory containing "run_disopred.pl"
 26 | DISOPRED_LOCATION = r'/cs/stud/danofer/Desktop/danofer/Software/DISOPRED'
 27 | DISO_PROG = 'run_disopred.pl'
 28 | 
 29 | # FASTA_LOCATION = '/cs/prt3/danofer/CleavePred/Dataset/Neuropred/V3/'
 30 | # FASTA_TARGET = 'NeuroPred_nNames_70ID.fasta'
 31 | 
 32 | #NEW:
 33 | # FASTA_LOCATION = '/cs/prt3/danofer/CleavePred/data/uniprot/UniProtTestSeqs/D3/'
 34 | FASTA_LOCATION = '/a/fr-05/vol/protein/danofer/imac/Desktop/DFTP/'
 35 | 
 36 | # FASTA_TARGET = 'D3_TEST_50_FILT.fasta'
 37 | FASTA_TARGET = 'D3_TEST_50_FILT_mod.fasta'
 38 | 
 39 | SPLIT_FASTAS_DIR = 'splitfasta1'
 40 | split_fastas_dir = os.path.join(FASTA_LOCATION,SPLIT_FASTAS_DIR)
 41 | 
 42 | 
 43 | file_in = os.path.join(FASTA_LOCATION,FASTA_TARGET)
 44 | 
 45 | ALT_DIR_OUTPUT = os.path.join(FASTA_LOCATION,'altFiltered')
 46 | 
 47 | 
 48 | 
 49 | def parse_DISOPRED():
 50 |     '''
 51 |     Parse all pbdat files in a dir, (Output of DISOPRED)
 52 |     save their output in a format like hssp/ss/acc
 53 |     '''
 54 |     # os.chdir(os.path.dirname(split_fastas_dir))
 55 |     os.chdir(split_fastas_dir)
 56 | 
 57 |     files = glob.glob('*.pbdat')
 58 |     print('amount of .pbdat files:',len(files))
 59 |     # output_file = open('/cs/prt3/danofer/CleavePred/Dataset/Neuropred/V2_ExternalFeat_NP/output_feat.DISO', 'w')
 60 |     # output_file = open(FASTA_LOCATION+'output_feat.DISO', 'w')
 61 |     output_file = open(os.path.join(FASTA_LOCATION,'output_feat.DISO'), 'w')
 62 | 
 63 |     # print('Joined results will be saved to ',(FASTA_LOCATION+'output_feat.DISO'))
 64 |     print('Joined results will be saved to ',os.path.join(FASTA_LOCATION+'output_feat.DISO'))
 65 | 
 66 |     for f in files:
 67 |         accession = str('>'+os.path.splitext(os.path.basename(f))[0])
 68 |         f=open(f)
 69 |         lines = f.readlines()
 70 |         seq = []
 71 |         diso_state=[]
 72 |         for line in lines:
 73 |             parts = line.strip(' \n \t').split(' ')
 74 |             if parts[0] != '#':
 75 |                 seq += parts[1]
 76 |                 diso_state += parts[2]
 77 |         seq = ''.join(seq)
 78 |         diso_state = ''.join(diso_state)
 79 |         output_file.write(accession+'\n')
 80 |         output_file.write((seq)+'\n')
 81 |         output_file.write((diso_state)+'\n')
 82 |         # print(accession)
 83 |     print("Saved to output_feat.DISO")
 84 |     output_file.close()
 85 | 
 86 | 
 87 | def split_fasta(filter = False):
 88 |     '''
 89 |     https://py4bio.wordpress.com/2009/07/22/split_fasta_file/
 90 |     This script takes a fasta file and split it in one file per fasta entry.
 91 |     It saves the outputs fastas in a new directory
 92 |     '''
 93 |     os.chdir(os.path.dirname(FASTA_LOCATION))
 94 |     print("Current working Directory:",os.getcwd())
 95 |     filter_fastas = []
 96 |     file_in = os.path.join(FASTA_LOCATION,FASTA_TARGET)
 97 |     split_output_dir = SPLIT_FASTAS_DIR
 98 | 
 99 |     if filter == True:
100 |         filter_fastas = filter_fasta_queries(fastas_dir=split_fastas_dir)
101 |         split_output_dir = ALT_DIR_OUTPUT
102 | 
103 |     if not os.path.exists(split_output_dir):
104 |         os.makedirs(split_output_dir)
105 | 
106 |     os.chdir(split_output_dir)
107 | 
108 |     i = 0
109 |     read_counts = 0
110 |     for record in SeqIO.parse(open(file_in), "fasta"):
111 |         read_counts += 1
112 |         if not (record.id in filter_fastas):
113 |             # f_out = os.path.join(split_output_dir,record.id+'.fasta')
114 |             f_out = (record.id+'.fasta')
115 |             # f_out =(split_output_dir+record.id+'.fasta')
116 |             print('save to:',f_out)
117 |             # SeqIO.write([record],open(f_out,'w'),"fasta")
118 |             with open(f_out, "w") as handle:
119 |                 SeqIO.write([record], handle, "fasta")
120 |             i += 1
121 | 
122 |     print(read_counts," = Fastas in the original multifasta-file")
123 |     print(i," = # Splitted Fasta files made")
124 | 
125 | 
126 | def call_DISOPRED(split_fastas_list):
127 |     os.chdir(os.path.dirname(DISOPRED_LOCATION))
128 |     print(os.getcwd())
129 |     print("In DisoPred folder")
130 |     for i, fasta in enumerate(split_fastas_list):
131 |         print(i)
132 |         print(fasta)
133 |         subprocess.call([DISOPRED_LOCATION+'/'+DISO_PROG,fasta])
134 |         print()
135 | 
136 | def filter_fasta_queries(fastas_dir=split_fastas_dir):
137 |     '''
138 |     If Disopred job was interrupted in midway -
139 |     This lets us continue (in a new dir) for
140 |     only those sequences that do not have
141 |     Disopred predictions = *.pbdat
142 |     '''
143 |     os.chdir(fastas_dir)
144 |     files = glob.glob('*.pbdat')
145 |     print('# .pbdat files = fastas that were processed succesfully, previously:',len(files))
146 |     ids = [os.path.splitext(os.path.basename(f))[0] for f in files]
147 |     print('len(ids)',len(ids))
148 |     return ids
149 | 
150 | def find_missing():
151 |     '''
152 |     Disopred seems to "miss" some fastas.
153 |     This helps us find them, assuming the
154 |     disopred output is in the same dirr as
155 |     the (split / "filtered") our fasta candidates
156 |     '''
157 |     # os.chdir('/cs/prt3/danofer/CleavePred/Dataset/Uniprot/altFiltered')
158 |     os.chdir(split_fastas_dir)
159 | 
160 |     fastas = [f for f in os.listdir('.') if f.endswith('.fasta')]
161 |     ids = [os.path.splitext(os.path.basename(f))[0] for f in fastas]
162 |     print('# Fastas present:',str(len(ids)))
163 | 
164 |     preds = fastas = [f for f in os.listdir('.') if f.endswith('.pbdat')]
165 |     dis_ids = [os.path.splitext(os.path.basename(f))[0] for f in preds]
166 |     print('# Predictions present:',str(len(dis_ids)))
167 | 
168 |     missing = [a for a in ids if a not in dis_ids]
169 |     print('Missing IDs:')
170 |     print(missing)
171 |     return missing
172 | 
173 | 
174 | if __name__ == '__main__':
175 | 
176 |     # fe = filter_fasta_queries()
177 |     # print('len filter_fasta_queries()',len(fe))
178 |     # split_fasta(filter = True)
179 | 
180 |     SPLIT_F = False
181 |     CALL_DISO = False
182 |     USE_FILTERED = False
183 | 
184 |     RESUME_DISO_PARTIAL = True
185 | 
186 |     if SPLIT_F == True:
187 |         split_fasta()
188 | 
189 |     # split_fastas_list = glob.glob(split_fastas_dir+'/*.fasta')
190 |     split_fastas_list = glob.glob(os.path.join(split_fastas_dir+'/*.fasta'))
191 |     print('\n split_fastas_list in dir: ',len(split_fastas_list))
192 | 
193 |     if USE_FILTERED == True:
194 |         # filt_split_fastas_list = glob.glob(ALT_DIR_OUTPUT+'/*.fasta') #ORIGINAL
195 |         filt_split_fastas_list = glob.glob(split_fastas_dir+'/*.fasta')  #CHANGED #D
196 |         print('\n filt_split_fastas_list ',len(filt_split_fastas_list))
197 |         print(filt_split_fastas_list[0])
198 |         print(filt_split_fastas_list[1])
199 |         call_DISOPRED(filt_split_fastas_list)
200 | 
201 | 
202 |     if CALL_DISO == True:
203 |         call_DISOPRED(split_fastas_list)
204 |         print("\n DISOPRED DONE! \n")
205 | 
206 |     parse_DISOPRED()
207 | 
208 |     missing_ID = find_missing()
209 |     if RESUME_DISO_PARTIAL:
210 |         # fe = filter_fasta_queries()
211 |         print("\n Calling disopred on missing IDs")
212 |         print("Missing: \n",missing_ID)
213 |         call_DISOPRED(split_fastas_list)
214 |         print("\n DISOPRED DONE! \n")
215 | 
216 | 
217 | 


--------------------------------------------------------------------------------
/py/deeppred/produce_auto_files.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A master script to produce all the required auto-generated files:
 3 | 1. Uniprot's annotated seqs .lf file
 4 | 2. CSVs of the windows with features
 5 | 3. Pickle dump files of trained predictors.
 6 | Just execute this script as it is, with no arguments. Make sure to be in the py/ directory when running it.
 7 | '''
 8 | 
 9 | import sys
10 | import logging
11 | 
12 | # For logger initialization
13 | import asap
14 | 
15 | LOGGING_PREFIX = '********** '
16 | 
17 | logger = logging.getLogger('EXEC')
18 | 
19 | # Uniprot's .lf file
20 | logger.info(LOGGING_PREFIX + 'Running extract_uniprot_annotated_seqs_from_xml.py')
21 | sys.argv = ['']
22 | execfile('cleavepred/extract_uniprot_annotated_seqs_from_xml.py')
23 | 
24 | # Create CSVs
25 | for dataset in ['neuropred', 'uniprot']:
26 |     for advanced in ['false', 'true']:
27 |         logger.info(LOGGING_PREFIX + 'Running extract_windows.py with dataset="%s" and advanced="%s"' % (dataset, advanced))
28 |         sys.argv = ['', dataset, advanced]
29 |         execfile('cleavepred/extract_windows.py')
30 | 
31 | # # Create dump files
32 | # for advanced in ['false', 'true']:
33 | #     logger.info(LOGGING_PREFIX + 'Running train_classifier.py with advanced="%s"' % advanced)
34 | #     sys.argv = ['', advanced, 'auto']
35 | #     execfile('cleavepred/train_classifier.py')
36 | 
37 | logger.info(LOGGING_PREFIX + 'Finished executing scripts. All auto-generated files should now be updated.')
38 | 


--------------------------------------------------------------------------------
/py/deeppred/project_paths.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from .common import AVAILABLE_TRACKS
 4 | 
 5 | # A global variable to update whenever looking to work on another dataset
 6 | dataset_name = None
 7 | 
 8 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 9 | # DATA_DIR = os.path.join(BASE_DIR, 'data/cleavage')
10 | DATA_DIR = os.path.join(BASE_DIR, 'data/deep/cleavage')
11 | 
12 | def get_dataset_dir():
13 |     return os.path.join(DATA_DIR, '%s_dataset' % dataset_name)
14 | 
15 | def get_peptide_predictor_dump_file_path(advanced):
16 |     if advanced:
17 |         return os.path.join(DATA_DIR, 'advanced_peptide_predictor.pkl')
18 |     else:
19 |         return os.path.join(DATA_DIR, 'simple_peptide_predictor.pkl')
20 | 
21 | def get_window_features_file_path(advanced):
22 |     if advanced:
23 |         return os.path.join(get_dataset_dir(), 'window_advanced_features.csv')
24 |     else:
25 |         return os.path.join(get_dataset_dir(), 'window_simple_features.csv')
26 | 
27 | def get_raw_data_xml_file_path():
28 |     # Relevant only for when dataset_name = 'uniprot'
29 |     return os.path.join(get_dataset_dir(), 'raw_data.xml')
30 | 
31 | def get_annotated_seqs_file_path():
32 |     return os.path.join(get_dataset_dir(), 'annotated_seqs.lf')
33 | 
34 | def get_filtered_seqs_file_path():
35 |     return os.path.join(get_dataset_dir(), 'filtered_seqs.fasta')
36 | 
37 | def get_track_file_paths():
38 |     return {track: os.path.join(get_dataset_dir(), 'extra_tracks/seqs.%s' % track) for track  in AVAILABLE_TRACKS}


--------------------------------------------------------------------------------
/py/deeppred/test_classifier.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A script to test a classifier that was trained on NeuroPred's dataset against UniProt's dataset
 3 | Arguments:
 4 | - advanced (boolean): Whether to test the advanced or simple classifier.
 5 | '''
 6 | 
 7 | import sys
 8 | import pickle
 9 | import logging
10 | 
11 | import pandas as pd
12 | 
13 | from cleavepred import util
14 | from cleavepred import project_paths
15 | 
16 | ### Parse arguments ###
17 | 
18 | advanced = util.parse_bool(sys.argv[1])
19 | 
20 | ### Configuration ###
21 | 
22 | # We use UniProt's dataset for testing our predictors.
23 | project_paths.dataset_name = 'uniprot'
24 | 
25 | logger = logging.getLogger('TEST')
26 | 
27 | ### Test the classifier ###
28 | 
29 | predictor_dump_file = None
30 | windows_file = None
31 | 
32 | def open_files():
33 |     global predictor_dump_file, windows_file
34 |     predictor_dump_file = open(project_paths.get_peptide_predictor_dump_file_path(advanced), 'rb')
35 |     windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
36 |     
37 | def close_files():
38 |     util.close_files([predictor_dump_file, windows_file])
39 | 
40 | def test_classifier():
41 |     peptide_predictor = pickle.load(predictor_dump_file)
42 |     windows_data_frame = pd.read_csv(windows_file)
43 |     score, roc, sensitivity, precision, specificity, cm = peptide_predictor.window_classifier.test_performance(windows_data_frame, \
44 |             drop_only_almost_positives = True)
45 |     logger.info('score = %f, roc = %f, sensitivity = %f, precision = %f, specificity = %f' % (score, roc, sensitivity, precision, specificity))
46 |     logger.info('Confusion matrix:' + '\n' + str(cm))
47 |     
48 | if __name__ == '__main__':
49 |     try:
50 |         open_files()
51 |         test_classifier()
52 |     finally:
53 |         close_files()
54 | 


--------------------------------------------------------------------------------
/py/deeppred/train_classifier.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A script to train a classifier from NueroPred's dataset. Will log performance to stdout.
 3 | Arguments:
 4 | - advanced (boolean): Whether to use the advanced features (extracted with the extra tracks), or just the simple ones.
 5 | - predictor_dump_path (file path, optional): The path to dump the trained PeptidePredictor. If not provided, will not dump it at all. If provided
 6 | with the keyword "auto", will dump it to the project's relevant file.
 7 | '''
 8 | 
 9 | import sys
10 | import pickle
11 | 
12 | import pandas as pd
13 | 
14 | from sklearn.feature_selection import VarianceThreshold, SelectFdr
15 | from sklearn.linear_model import LogisticRegressionCV
16 | from sklearn.ensemble import RandomForestClassifier
17 | from sklearn.svm import SVC
18 | from mlxtend.classifier import EnsembleClassifier
19 | 
20 | from asap import train_window_classifier, PeptidePredictor, FeatureSelectionPipeline
21 | 
22 | from cleavepred import util
23 | from cleavepred import project_paths
24 | from cleavepred.common import window_extraction_params
25 | 
26 | ### Parse arguments ###
27 | 
28 | advanced = util.parse_bool(sys.argv[1])
29 | 
30 | if len(sys.argv) > 2:
31 |     if sys.argv[2].lower() == 'auto':
32 |         predictor_dump_path = project_paths.get_peptide_predictor_dump_file_path(advanced)
33 |     else:
34 |         predictor_dump_path = sys.argv[2]
35 | else:
36 |     predictor_dump_path = None
37 | 
38 | ### Configuration ###
39 | 
40 | # We use NeuroPred's dataset for training/validation of our predictors.
41 | project_paths.dataset_name = 'neuropred'
42 | 
43 | ensemble_classifiers = [
44 |     LogisticRegressionCV(Cs = 16, n_jobs = -2, class_weight = 'auto'),
45 |     RandomForestClassifier(n_estimators = 250, bootstrap = True, criterion = 'gini', n_jobs = -2, class_weight = 'auto'),
46 |     SVC(kernel = 'rbf', C = 3.798, probability = True, cache_size = 2400, class_weight = 'auto'),
47 | ]
48 | classifiers = [EnsembleClassifier(clfs = ensemble_classifiers, voting = 'hard')]
49 | 
50 | feature_selector = FeatureSelectionPipeline([
51 |     VarianceThreshold(0.03),
52 |     SelectFdr(alpha = 0.1),
53 | ])
54 | 
55 | ### Train the classifier and dump the predictor ###
56 | 
57 | windows_file = None
58 | predictor_dump_file = None
59 | 
60 | def open_files():
61 |     global windows_file, predictor_dump_file
62 |     windows_file = open(project_paths.get_window_features_file_path(advanced), 'rb')
63 |     predictor_dump_file = util.open_file(predictor_dump_path, 'wb')
64 | 
65 | def close_files():
66 |     util.close_files([windows_file, predictor_dump_file])
67 | 
68 | def dump_predictor(predictor):
69 |     if predictor_dump_file is not None:
70 |         pickle.dump(predictor, predictor_dump_file)
71 | 
72 | def train_classifier():
73 |     windows_data_frame = pd.read_csv(windows_file)
74 |     window_classifier, classifier_performance = train_window_classifier(windows_data_frame, classifiers = classifiers, \
75 |             drop_only_almost_positives = True, feature_selector = feature_selector, n_folds = 10)
76 |     peptide_predictor = PeptidePredictor(window_classifier, window_extraction_params = window_extraction_params)
77 |     dump_predictor(peptide_predictor)
78 | 
79 | if __name__ == '__main__':
80 |     try:
81 |         open_files()
82 |         train_classifier()
83 |     finally:
84 |         close_files()
85 | 


--------------------------------------------------------------------------------
/py/deeppred/util.py:
--------------------------------------------------------------------------------
 1 | def split_to_chunks(array, chunk_size):
 2 |     for i in xrange(0, len(array), chunk_size):
 3 |         yield array[i:(i + chunk_size)]
 4 | 
 5 | def parse_bool(raw_value):
 6 |     if raw_value.lower() in ['true', 'yes', '1']:
 7 |         return True
 8 |     elif raw_value.lower() in ['false', 'no', '0']:
 9 |         return False
10 |     else:
11 |         raise Exception('Unrecognized boolean value: ' + str(raw_value))
12 |         
13 | def open_file(path, *args, **argv):
14 |     if path is None:
15 |         return None
16 |     else:
17 |         return open(path, *args, **argv)
18 |         
19 | def read_file(path):
20 |     
21 |     f = open(path, 'rb')
22 |     
23 |     try:
24 |         return f.read()
25 |     finally:
26 |         f.close()
27 | 
28 | def close_file(file):
29 |     if file is not None:
30 |         file.close()
31 |         
32 | def close_files(files):
33 |     for file in files:
34 |         close_file(file)


--------------------------------------------------------------------------------
/web/cleavage/context_processors.py:
--------------------------------------------------------------------------------
1 | from django.conf import settings
2 | 
3 | def settings_access(request):
4 |     return {'settings': settings}


--------------------------------------------------------------------------------
/web/cleavage/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | import sys
 4 | 
 5 | if __name__ == "__main__":
 6 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
 7 | 
 8 |     from django.core.management import execute_from_command_line
 9 | 
10 |     execute_from_command_line(sys.argv)
11 | 


--------------------------------------------------------------------------------
/web/cleavage/settings.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Django settings for cleavepred project.
 3 | """
 4 | 
 5 | import sys
 6 | import os
 7 | 
 8 | ### Local Environment-Dependent Settings ###
 9 | 
10 | SITE_URL = ''
11 | 
12 | ### Preparations ###
13 | 
14 | BASE_DIR = os.path.dirname(__file__)
15 | PROJECT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
16 | PROJECT_PY_DIR = os.path.join(PROJECT_DIR, 'py')
17 | 
18 | # In order to load 'cleavepred' module later on
19 | sys.path += [PROJECT_PY_DIR]
20 | 
21 | ### Development vs. Production status ###
22 | 
23 | DEBUG = True
24 | 
25 | TEMPLATE_DEBUG = True
26 | 
27 | ALLOWED_HOSTS = []
28 | 
29 | ### Security ###
30 | 
31 | SECRET_KEY = 'ij*hpack1#brf--5b_1nd7$cz*8h*y=b!y#_bd48v5kcdg0*vd'
32 | 
33 | ### Localization ###
34 | 
35 | TIME_ZONE = 'UTC'
36 | LANGUAGE_CODE = 'en-us'
37 | 
38 | ### URLs ###
39 | 
40 | ROOT_URLCONF = 'urls'
41 | 
42 | ### Context Processors, Middlewares & Apps ###
43 | 
44 | TEMPLATE_CONTEXT_PROCESSORS = (
45 |     'django.contrib.auth.context_processors.auth',
46 |     'django.core.context_processors.debug',
47 |     'django.core.context_processors.media',
48 |     'django.core.context_processors.static',
49 |     'django.contrib.messages.context_processors.messages',
50 |     'context_processors.settings_access'
51 | )
52 | 
53 | MIDDLEWARE_CLASSES = (
54 |     'django.contrib.sessions.middleware.SessionMiddleware',
55 |     'django.middleware.csrf.CsrfViewMiddleware',
56 |     'django.contrib.auth.middleware.AuthenticationMiddleware',
57 |     'django.middleware.common.CommonMiddleware',
58 |     'django.contrib.messages.middleware.MessageMiddleware',
59 | )
60 | 
61 | INSTALLED_APPS = (
62 |     'django.contrib.staticfiles',
63 |     'django.contrib.sessions',
64 |     'django.contrib.contenttypes',
65 |     'django.contrib.auth',
66 |     'django.contrib.admin',
67 |     'django.contrib.humanize',
68 | )
69 | 
70 | ### Templates ###
71 | 
72 | TEMPLATE_DIRS = (
73 |     os.path.join(BASE_DIR, 'templates'),
74 | )
75 | 
76 | ### Static Files ###
77 | 
78 | STATIC_URL = '/static/'
79 | 


--------------------------------------------------------------------------------
/web/cleavage/templates/base.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |     <head>
 3 |         <title>CleavePred - A Machine-Learning model for predicting cleavage products of neuropeptide precursors</title>
 4 |     </head>
 5 |     <body style = "font-size: 15px; color: #101010;">
 6 |         <div>
 7 |             <a href = "{{ settings.SITE_URL }}/" style = "text-decoration: none; color: inherit;"><h1 style = "margin-top: 50px; text-align: center; font-family: Copperplate, 'Copperplate Gothic Light', fantasy; font-size: 45px;">CleavePred</h1></a>
 8 |             <div style = "text-align: center; font-weight: bold; font-size: 18px;">A Machine-Learning model for predicting cleavage products of neuropeptide precursors</div>
 9 |             <hr style = "width: 800px;"/>
10 |         </div>
11 |         <div style = "margin-top: 50px; margin-bottom: 50px;">
12 |             {% block content %}{% endblock %}
13 |         </div>
14 |         <div style = "margin-bottom: 25px; text-align: center; font-size: 14px;">
15 |             <hr/>
16 |             <p>CleavePred is powered by <a href = "https://github.com/ddofer/asap">ASAP</a>, a generic API for easily learning local protein annotations with minimal fine-tuning using powerful feature engineering combined with standard Machine-Learning models. Inside ASAP's <a href = "https://github.com/ddofer/asap">GitHub project</a>, you will also find the source code of CleavePred, which comes with a handy API as well. To learn more about either of ASAP's or CleavePred's APIs (which offer more options than this website), read the Wiki page in GitHub (in particular <a href = "https://github.com/ddofer/asap/wiki/Getting-Started:-A-Basic-Tutorial">this tutorial</a>). To learn more about the underlying algorithm, please read our paper "ASAP: A Machine-Learning Framework for Local Protein Properties". <span style = "font-weight: bold;">If you found our work to be useful for your research, please cite it.</span></p>
17 |             <p>For any issue/request, feel free to contact us: Nadav Brandes (<a href = "mailto:nadav.brandes@mail.huji.ac.il" target = "_blank">nadav.brandes@mail.huji.ac.il</a>) and Dan Ofer (<a href = "mailto:ddofer@gmail.com" target = "_blank">ddofer@gmail.com</a>).</p>
18 |         </div>
19 |     </body>
20 | </html>


--------------------------------------------------------------------------------
/web/cleavage/templates/cleavage-prediction.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block content %}
 4 |     <div style = "margin-left: 20px;">
 5 |         <p style = "text-align: center;">Following are the prediction results. Below each amino-acid is its predicted probability of being a cleavage site.</p>
 6 |         {% for record_id, labeled_aa_chunks, cleavage_products in seqs_data %}
 7 |             <div style = "margin-top: 50px; font-weight: bold;">&gt {{ record_id }}</div>
 8 |             <table>
 9 |                 {% for labeled_aa_chunk in labeled_aa_chunks %}
10 |                     <tr>
11 |                         {% for labeled_aa in labeled_aa_chunk %}
12 |                             <td style = "padding-top: 20px; padding-right: 30px;"><span style = "background-color: {{ labeled_aa.background_color }};">{{ labeled_aa.aa }}</span></td>
13 |                         {% endfor %}
14 |                     </tr>
15 |                     <tr>
16 |                         {% for labeled_aa in labeled_aa_chunk %}
17 |                             <td style = "color: {{ labeled_aa.probability_color }};">{{ labeled_aa.cleavage_probability|floatformat:2 }}</td>
18 |                         {% endfor %}
19 |                     </tr>
20 |                 {% endfor %}
21 |             </table>
22 |             <div style = "margin-top: 30px;"><span style = "text-decoration: underline;">Predicted cleavage products</span>:</div>
23 |             <ul style = "width: 800px;">
24 |                 {% for cleavage_product in cleavage_products %}
25 |                     <li style = "word-wrap: break-word;">{{ cleavage_product }}</li>
26 |                 {% endfor %}
27 |             </ul>
28 |             <div style = "margin-top: 30px; text-align: center;"><a href = "{{ settings.SITE_URL }}/">Go back to home page</a></div>
29 |         {% endfor %}
30 |     </div>
31 | {% endblock %}


--------------------------------------------------------------------------------
/web/cleavage/templates/home.html:
--------------------------------------------------------------------------------
 1 | {% extends "base.html" %}
 2 | 
 3 | {% block content %}
 4 |     <div style = "text-align: center;">
 5 |         <div>Please fill below the amino-acid sequences to predict in FASTA format. You may either upload a FASTA file, or paste it as a free text.</div>
 6 |         <form action = "{{ settings.SITE_URL }}/cleavage-prediction/" method = "POST" enctype = "multipart/form-data">
 7 |             {% csrf_token %}
 8 |             <div style = "margin: 10px;"><textarea name = "seqs-text" rows = "20" cols = "70"></textarea></div>
 9 |             <div><input type = "file" name = "seqs-file" value = "lol" style = "margin-right: 50px; border: 1px solid #c0c0c0;"/><input type = "submit" value = "Submit"/></div>
10 |         </form>
11 |     </div>
12 | {% endblock %}
13 | 


--------------------------------------------------------------------------------
/web/cleavage/urls.py:
--------------------------------------------------------------------------------
 1 | from django.conf.urls import patterns, url
 2 | from django.views.generic import TemplateView
 3 | 
 4 | import views
 5 | 
 6 | urlpatterns = patterns('',
 7 |     
 8 |     url(r'^$',
 9 |         TemplateView.as_view(template_name = 'home.html'),
10 |     ),
11 |     
12 |     url(r'^cleavage-prediction/$',
13 |         views.cleavage_prediction,
14 |     ),
15 | )
16 | 


--------------------------------------------------------------------------------
/web/cleavage/views.py:
--------------------------------------------------------------------------------
  1 | from StringIO import StringIO
  2 | import logging
  3 | 
  4 | from Bio import SeqIO
  5 | 
  6 | from django.template import RequestContext
  7 | from django.shortcuts import render_to_response
  8 | 
  9 | from cleavepred import simple_cleavage_predictor
 10 | from cleavepred.util import split_to_chunks
 11 | 
 12 | LOGGER = logging.getLogger('WEB')
 13 | 
 14 | def cleavage_prediction(request):
 15 |     seqs_data = _get_seqs_data(_get_raw_seqs(request))
 16 |     return render_to_response('cleavage-prediction.html', {'seqs_data': seqs_data}, context_instance = RequestContext(request))
 17 |     
 18 | def _get_raw_seqs(request):
 19 |     if 'seqs-file' in request.FILES:
 20 |         return request.FILES['seqs-file'].read()
 21 |     else:
 22 |         return request.POST.get('seqs-text')
 23 |     
 24 | def _get_seqs_data(raw_seqs):
 25 |     
 26 |     raw_seqs = _fix_string_newlines(unicode(raw_seqs)) # This patch is required for some reason, because Django somehow corrupts uploaded files
 27 |     LOGGER.info('Received a %d bytes long FASTA' % len(raw_seqs))
 28 |     seqs_fasta = StringIO(_fix_fasta_if_needed(raw_seqs))
 29 |     records = list(SeqIO.parse(seqs_fasta, 'fasta'))
 30 |     LOGGER.info('About to process %d records' % len(records))
 31 |     
 32 |     for record in records:
 33 |         
 34 |         id = record.id
 35 |         seq = str(record.seq)
 36 |         LOGGER.info('Processing record %s: %s' % (id, seq))
 37 |         
 38 |         cleavage_mask, cleavage_products = simple_cleavage_predictor.predict(seq, proba = True)
 39 |         labeled_aa_chunks = split_to_chunks([_LabeledAminoAcid(aa, label) for aa, label in zip(seq, cleavage_mask)], _RESIDUES_TO_DISPAY_PER_ROW)
 40 |         yield id, labeled_aa_chunks, cleavage_products
 41 |         
 42 | def _fix_fasta_if_needed(raw_fasta):
 43 |     if _has_fasta_headers(raw_fasta):
 44 |         return raw_fasta
 45 |     else:
 46 |         return _DEFAULT_FASTA_HEADER + '\n' + raw_fasta
 47 |     
 48 | def _has_fasta_headers(raw_fasta):
 49 |     
 50 |     for line in raw_fasta.splitlines():
 51 |         if line.startswith('>'):
 52 |             return True
 53 |             
 54 |     return False
 55 |     
 56 | def _fix_string_newlines(string):
 57 |     
 58 |     fixed_string = ''
 59 |     
 60 |     for i in xrange(len(string)):
 61 |         if string[i] == '\r' and i < len(string) - 1 and string[i + 1] != '\n':
 62 |             fixed_string += '\r\n'
 63 |         else:
 64 |             fixed_string += string[i]
 65 |             
 66 |     return fixed_string
 67 |     
 68 | class _LabeledAminoAcid(object):
 69 |     
 70 |     def __init__(self, aa, cleavage_probability):
 71 |         self.aa = aa
 72 |         self.cleavage_probability = cleavage_probability
 73 |         
 74 |     def is_cleavage(self):
 75 |         return self.cleavage_probability >= 0.5
 76 |         
 77 |     def background_color(self):
 78 |         if self.is_cleavage():
 79 |             return '#ff0000'
 80 |         else:
 81 |             return '#ffffff'
 82 |         
 83 |     def probability_color(self):
 84 |         if self.cleavage_probability <= 0.0:
 85 |             return '#aaaaaa'
 86 |         if self.cleavage_probability < 0.1:
 87 |             return '#666666'
 88 |         if self.cleavage_probability < 0.3:
 89 |             return '#aaaa00'
 90 |         if self.cleavage_probability < 0.5:
 91 |             return '#ffaa00'
 92 |         else:
 93 |             return '#ff0000'
 94 |             
 95 |     def __repr__(self):
 96 |         return '<%s>' % self.aa
 97 | 
 98 | _DEFAULT_FASTA_HEADER = '>input_seq'
 99 | _RESIDUES_TO_DISPAY_PER_ROW = 20
100 | 


--------------------------------------------------------------------------------
/web/cleavage/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for cleavepred project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/1.6/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")
12 | 
13 | from django.core.wsgi import get_wsgi_application
14 | application = get_wsgi_application()
15 | 


--------------------------------------------------------------------------------