├── .gitattributes
├── README.md
├── classification
│   ├── code
│   │   ├── .ipynb_checkpoints
│   │   │   └── .gitignore
│   │   └── train_and_predict.ipynb
│   ├── inputs
│   │   ├── 2GFP_above_parent123457.pkl
│   │   ├── GFP_data.pkl
│   │   ├── X_and_terms.pkl
│   │   ├── all_lit_chimeras_gaps.txt
│   │   ├── gfp_props.pkl
│   │   ├── lit_alignment_and_contacts.pkl
│   │   └── props.pkl
│   └── outputs
│       ├── 2GFP_above_parent.pkl
│       ├── 2GFP_above_parent.pkl.txt
│       ├── matern52_bin0.1_max_peak_False.pkl
│       └── matern52_bin0.1_max_peak_False.txt
└── regression
    ├── .ipynb_checkpoints
    │   └── .gitignore
    ├── GP_matern_5_2_kernel.ipynb
    ├── GP_matern_5_2_kernel_LASSO.ipynb
    ├── GP_tools.py
    ├── __pycache__
    │   └── .gitignore
    ├── chimera_tools.py
    ├── encoding_tools.py
    ├── inputs
    │   ├── Ephys_data_formatted.csv
    │   ├── alignment_and_contacts_C1C2.pkl
    │   ├── lit_alignment_and_contacts_pro2.pkl
    │   ├── shmetis_c_10_21_0
    │   │   └── chimeras.output
    │   └── shmetis_n_10_21_0
    │       └── chimeras.output
    ├── lasso_tools.py
    └── outputs
        ├── green_norm_matern_kernel.pdf
        ├── green_norm_matern_kernel_CV_fig1.pdf
        ├── green_norm_matern_kernel_LASSO_CV.pdf
        ├── kinetics_off_matern_kernel.pdf
        ├── kinetics_off_matern_kernel_CV_fig1.pdf
        ├── kinetics_off_matern_kernel_LASSO_CV.pdf
        ├── matern_green_norm_0.025_LASSO.csv
        ├── matern_kernel_gen10_green_norm.csv
        ├── matern_kernel_gen10_kinetics_off.csv
        ├── matern_kernel_gen10_max_peak.csv
        ├── matern_kinetics_off_0.03_LASSO.csv
        ├── matern_max_peak_0.05_LASSO.csv
        ├── max_peak_matern_kernel.pdf
        ├── max_peak_matern_kernel_CV_fig1.pdf
        └── max_peak_matern_kernel_LASSO_CV.pdf
/.gitattributes:
--------------------------------------------------------------------------------
1 | *pkl filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # channels
2 | ## Code to reproduce the paper *Machine learning-guided channelrhodopsin engineering enables minimally-invasive optogenetics*. Gaussian process models for optimizing channelrhodopsin properties.
3 |
4 | ### Computing Environment:
5 |
6 | This was originally developed using Anaconda Python 3.6 and the following packages and versions:
7 |
8 | 1. numpy 1.13.3
9 | 2. pandas 0.20.3
10 | 3. scipy 0.19.1
11 | 4. scikit-learn 0.19.0 (imported as `sklearn`)
12 | 5. gpmodel (https://github.com/yangkky/gpmodel)
13 |
14 | ### File structure
15 |
16 | The repository is divided into two self-contained directories, `regression/` and `classification/`, containing all the code and inputs for the regression and classification models, respectively. For regression, the GP code is included in this repository (`regression/GP_tools.py`). For classification, the GP code is in the separate gpmodel repository (https://github.com/yangkky/gpmodel).
--------------------------------------------------------------------------------
/classification/code/.ipynb_checkpoints/.gitignore:
--------------------------------------------------------------------------------
1 | *checkpoint.ipynb
2 |
--------------------------------------------------------------------------------
/classification/code/train_and_predict.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pickle\n",
10 | "import os\n",
11 | "\n",
12 | "import pandas as pd\n",
13 | "import numpy as np\n",
14 | "from scipy import stats\n",
15 | "from sklearn import metrics\n",
16 | "from sklearn import model_selection\n",
17 | "from gpmodel import gpmodel, gpkernel, chimera_tools"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 2,
23 | "metadata": {},
24 | "outputs": [
25 | {
26 | "data": {
27 | "text/html": [
28 | "
\n",
29 | "\n",
42 | "
\n",
43 | " \n",
44 | " \n",
45 | " | \n",
46 | " block_k | \n",
47 | " cyan_norm | \n",
48 | " cyan_peak | \n",
49 | " cyan_ss | \n",
50 | " generation | \n",
51 | " green_norm | \n",
52 | " green_peak | \n",
53 | " green_ss | \n",
54 | " kinetics_off | \n",
55 | " m | \n",
56 | " ... | \n",
57 | " log_cyan_ss | \n",
58 | " log_green_norm | \n",
59 | " log_green_peak | \n",
60 | " log_green_ss | \n",
61 | " log_max_peak | \n",
62 | " log_max_ss | \n",
63 | " log_red_peak | \n",
64 | " log_red_ss | \n",
65 | " log_kinetics_off | \n",
66 | " bin0.1_max_peak | \n",
67 | "
\n",
68 | " \n",
69 | " \n",
70 | " \n",
71 | " 0 | \n",
72 | " n0221012201 | \n",
73 | " NaN | \n",
74 | " 0.014452 | \n",
75 | " 0.003170 | \n",
76 | " 4 | \n",
77 | " NaN | \n",
78 | " 0.011225 | \n",
79 | " 0.002416 | \n",
80 | " NaN | \n",
81 | " 59.0 | \n",
82 | " ... | \n",
83 | " -5.754181 | \n",
84 | " NaN | \n",
85 | " -4.489609 | \n",
86 | " -6.025555 | \n",
87 | " -4.236934 | \n",
88 | " -5.754181 | \n",
89 | " -4.513885 | \n",
90 | " -6.321817 | \n",
91 | " NaN | \n",
92 | " -1 | \n",
93 | "
\n",
94 | " \n",
95 | " 1 | \n",
96 | " n0010000000 | \n",
97 | " 1.0 | \n",
98 | " 0.260871 | \n",
99 | " 0.173618 | \n",
100 | " 1 | \n",
101 | " 0.184076 | \n",
102 | " 0.048020 | \n",
103 | " 0.036310 | \n",
104 | " 11.525 | \n",
105 | " 19.0 | \n",
106 | " ... | \n",
107 | " -1.750895 | \n",
108 | " -1.692407 | \n",
109 | " -3.036137 | \n",
110 | " -3.315654 | \n",
111 | " -1.343730 | \n",
112 | " -1.750895 | \n",
113 | " -4.623666 | \n",
114 | " -7.591407 | \n",
115 | " 2.444519 | \n",
116 | " 1 | \n",
117 | "
\n",
118 | " \n",
119 | " 2 | \n",
120 | " n0001000000 | \n",
121 | " 1.0 | \n",
122 | " 1.034604 | \n",
123 | " 0.925893 | \n",
124 | " 1 | \n",
125 | " 0.359432 | \n",
126 | " 0.371870 | \n",
127 | " 0.353229 | \n",
128 | " 70.550 | \n",
129 | " 3.0 | \n",
130 | " ... | \n",
131 | " -0.076997 | \n",
132 | " -1.023229 | \n",
133 | " -0.989210 | \n",
134 | " -1.040639 | \n",
135 | " 0.034019 | \n",
136 | " -0.076997 | \n",
137 | " -4.643232 | \n",
138 | " -6.434030 | \n",
139 | " 4.256322 | \n",
140 | " 1 | \n",
141 | "
\n",
142 | " \n",
143 | " 3 | \n",
144 | " c1112001101 | \n",
145 | " 1.0 | \n",
146 | " 1.234327 | \n",
147 | " 0.920585 | \n",
148 | " 9 | \n",
149 | " 0.373256 | \n",
150 | " 0.460720 | \n",
151 | " 0.441420 | \n",
152 | " 269.125 | \n",
153 | " 44.0 | \n",
154 | " ... | \n",
155 | " -0.082746 | \n",
156 | " -0.985491 | \n",
157 | " -0.774965 | \n",
158 | " -0.817759 | \n",
159 | " 0.210526 | \n",
160 | " -0.082746 | \n",
161 | " -4.235193 | \n",
162 | " -5.591161 | \n",
163 | " 5.595176 | \n",
164 | " 1 | \n",
165 | "
\n",
166 | " \n",
167 | " 4 | \n",
168 | " c2202121120 | \n",
169 | " NaN | \n",
170 | " 0.009388 | \n",
171 | " 0.000000 | \n",
172 | " 4 | \n",
173 | " NaN | \n",
174 | " 0.008662 | \n",
175 | " 0.000000 | \n",
176 | " NaN | \n",
177 | " 62.0 | \n",
178 | " ... | \n",
179 | " NaN | \n",
180 | " NaN | \n",
181 | " -4.748757 | \n",
182 | " NaN | \n",
183 | " -4.563973 | \n",
184 | " -8.729060 | \n",
185 | " -4.563973 | \n",
186 | " -8.729060 | \n",
187 | " NaN | \n",
188 | " -1 | \n",
189 | "
\n",
190 | " \n",
191 | "
\n",
192 | "
5 rows × 32 columns
\n",
193 | "
"
194 | ],
195 | "text/plain": [
196 | " block_k cyan_norm cyan_peak cyan_ss generation green_norm \\\n",
197 | "0 n0221012201 NaN 0.014452 0.003170 4 NaN \n",
198 | "1 n0010000000 1.0 0.260871 0.173618 1 0.184076 \n",
199 | "2 n0001000000 1.0 1.034604 0.925893 1 0.359432 \n",
200 | "3 c1112001101 1.0 1.234327 0.920585 9 0.373256 \n",
201 | "4 c2202121120 NaN 0.009388 0.000000 4 NaN \n",
202 | "\n",
203 | " green_peak green_ss kinetics_off m ... log_cyan_ss \\\n",
204 | "0 0.011225 0.002416 NaN 59.0 ... -5.754181 \n",
205 | "1 0.048020 0.036310 11.525 19.0 ... -1.750895 \n",
206 | "2 0.371870 0.353229 70.550 3.0 ... -0.076997 \n",
207 | "3 0.460720 0.441420 269.125 44.0 ... -0.082746 \n",
208 | "4 0.008662 0.000000 NaN 62.0 ... NaN \n",
209 | "\n",
210 | " log_green_norm log_green_peak log_green_ss log_max_peak log_max_ss \\\n",
211 | "0 NaN -4.489609 -6.025555 -4.236934 -5.754181 \n",
212 | "1 -1.692407 -3.036137 -3.315654 -1.343730 -1.750895 \n",
213 | "2 -1.023229 -0.989210 -1.040639 0.034019 -0.076997 \n",
214 | "3 -0.985491 -0.774965 -0.817759 0.210526 -0.082746 \n",
215 | "4 NaN -4.748757 NaN -4.563973 -8.729060 \n",
216 | "\n",
217 | " log_red_peak log_red_ss log_kinetics_off bin0.1_max_peak \n",
218 | "0 -4.513885 -6.321817 NaN -1 \n",
219 | "1 -4.623666 -7.591407 2.444519 1 \n",
220 | "2 -4.643232 -6.434030 4.256322 1 \n",
221 | "3 -4.235193 -5.591161 5.595176 1 \n",
222 | "4 -4.563973 -8.729060 NaN -1 \n",
223 | "\n",
224 | "[5 rows x 32 columns]"
225 | ]
226 | },
227 | "execution_count": 2,
228 | "metadata": {},
229 | "output_type": "execute_result"
230 | }
231 | ],
232 | "source": [
233 | "with open('../inputs/props.pkl', 'rb') as f:\n",
234 | " df = pickle.load(f)\n",
235 | "df.head()"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 3,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "def select_X_and_Y(df, all_X, y_column):\n",
245 | " not_dropped = ~pd.isnull(df[y_column])\n",
246 | " not_dropped = pd.Series(not_dropped, index=df.index)\n",
247 | " Ys = df[not_dropped][y_column]\n",
248 | " gens = df[not_dropped]['generation']\n",
249 | " Ys.index = df[not_dropped]['name']\n",
250 | " Xs = all_X.loc[Ys.index]\n",
251 | " return Xs, Ys, gens"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 4,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "tasks = ['bin0.1_max_peak', '']\n",
261 | "lits = [False]\n",
262 | "mtypes = [gpmodel.GPClassifier]"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 5,
268 | "metadata": {},
269 | "outputs": [],
270 | "source": [
271 | "with open('../inputs/X_and_terms.pkl', 'rb') as f:\n",
272 | " X_all, terms = pickle.load(f)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 6,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "name": "stdout",
282 | "output_type": "stream",
283 | "text": [
284 | "../outputs/matern52_bin0.1_max_peak_False.pkl\n",
285 | "[ 20.88092844] [ 76.59353427]\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "def train_and_save(df, task, fname, mtype, guesses=None):\n",
291 | " X, y, _ = select_X_and_Y(df, X_all, task)\n",
292 | " X = X.values\n",
293 | " y = y.values \n",
294 | " k = gpkernel.MaternKernel('5/2')\n",
295 | " clf = mtype(k, guesses=guesses)\n",
296 | " clf.fit(X, y)\n",
297 | " clf.dump(fname)\n",
298 | " return clf\n",
299 | "\n",
300 | "for task, lit, mtype in zip(tasks, lits, mtypes):\n",
301 | " fname = '../outputs/matern52_' + task + '_' + str(lit) + '.pkl'\n",
302 | " if lit:\n",
303 | " clf = train_and_save(df, task, fname, mtype)\n",
304 | " else:\n",
305 | " clf = train_and_save(df[df['generation'] != 8], task, fname, mtype)\n",
306 | " print(fname)\n",
307 | " print(clf.hypers, clf.ML)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 7,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "data": {
317 | "text/plain": [
318 | "['matern52_bin0.1_max_peak_False']"
319 | ]
320 | },
321 | "execution_count": 7,
322 | "metadata": {},
323 | "output_type": "execute_result"
324 | }
325 | ],
326 | "source": [
327 | "cls_dict = {True:gpmodel.GPClassifier.load, False: gpmodel.GPRegressor.load}\n",
328 | "clfs = [cls_dict['bin' in path]('../outputs/' + path) for path in os.listdir('../outputs/') if path != '.DS_Store']\n",
329 | "fnames = ['.'.join(path.split('.')[:-1]) for path in os.listdir('../outputs/') if path != '.DS_Store']\n",
330 | "fnames"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": 8,
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "for fname in fnames:\n",
340 | " with open('../outputs/' + fname + '.txt', 'w') as f:\n",
341 | " if 'bin' in fname:\n",
342 | " f.write('name,p,mu,var\\n')\n",
343 | " else:\n",
344 | " f.write('name,mu,var\\n')"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 9,
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "name": "stdout",
354 | "output_type": "stream",
355 | "text": [
356 | "0\n",
357 | "100\n",
358 | "200\n",
359 | "300\n",
360 | "400\n",
361 | "500\n",
362 | "600\n",
363 | "700\n",
364 | "800\n",
365 | "900\n",
366 | "1000\n",
367 | "CPU times: user 7h 41min 48s, sys: 15min 36s, total: 7h 57min 24s\n",
368 | "Wall time: 8h 6min\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "%%time\n",
374 | "df = pd.read_csv('../inputs/all_lit_chimeras_gaps.txt', index_col=0)\n",
375 | "with open('../inputs/lit_alignment_and_contacts.pkl', 'rb') as f:\n",
376 | " ss, contacts = pickle.load(f)\n",
377 | "amino_acids = ('G', 'A', 'L', 'M', 'F', 'W', 'K', 'Q', 'E', 'S',\n",
378 | " 'P', 'V', 'I', 'C', 'Y', 'H', 'R', 'N', 'D', 'T', '-')\n",
379 | "sample_space = [amino_acids for _ in ss]\n",
380 | "n_splits = 1000\n",
381 | "n_per = len(df.index) // n_splits\n",
382 | "inds = [df.index[n * n_per: (n+1) * n_per]\n",
383 | " for n in range(n_splits)]\n",
384 | "inds.append(df.index[n_splits * n_per::])\n",
385 | "seq_terms = chimera_tools.make_sequence_terms(sample_space)\n",
386 | "struct_terms = chimera_tools.contacting_terms(sample_space, contacts)\n",
387 | "all_terms = seq_terms + struct_terms\n",
388 | "\n",
389 | "for i, ind in enumerate(inds):\n",
390 | " seqs = df.loc[ind]['sequence'].values\n",
391 | " if len(seqs) == 0:\n",
392 | " continue\n",
393 | " if i % (n_splits // 10) == 0:\n",
394 | " print(i)\n",
395 | " struct_X, _ = chimera_tools.make_contact_X(seqs, sample_space, contacts, contact_terms=struct_terms)\n",
396 | " seq_X, _ = chimera_tools.make_sequence_X(seqs,\n",
397 | " sample_space=sample_space,\n",
398 | " sequence_terms=seq_terms)\n",
399 | " all_X = np.concatenate([seq_X, struct_X], axis=1)\n",
400 | " for clf, fname in zip(clfs, fnames):\n",
401 | " preds = pd.DataFrame(index=df.loc[ind]['name'].values)\n",
402 | " if 'bin' in fname:\n",
403 | " pi, mu, var = clf.predict(all_X)\n",
404 | " preds['pi'] = pi\n",
405 | " else:\n",
406 | " mu, var = clf.predict(all_X)\n",
407 | " var = np.diag(var)\n",
408 | " preds['mu'] = mu\n",
409 | " preds['var'] = var\n",
410 | " with open('../outputs/' + fname + '.txt', 'a') as f:\n",
411 | " preds.to_csv(f, header=False)"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": 15,
417 | "metadata": {},
418 | "outputs": [
419 | {
420 | "name": "stdout",
421 | "output_type": "stream",
422 | "text": [
423 | "../outputs/2GFP_above_parent.pkl\n",
424 | "[ 0.36061643 0.04737465] [ 117.78582814]\n"
425 | ]
426 | }
427 | ],
428 | "source": [
429 | "with open('../inputs/GFP_data.pkl', 'rb') as f:\n",
430 | " X, y = pickle.load(f)\n",
431 | "\n",
432 | "def train_and_save(X, y, fname, mtype):\n",
433 | " k = gpkernel.PolynomialKernel(2)\n",
434 | " clf = mtype(k, guesses=None)\n",
435 | " clf.fit(X, y)\n",
436 | " clf.dump(fname)\n",
437 | " return clf\n",
438 | "\n",
439 | "task = 'GFP_above_parent'\n",
440 | "mtype = gpmodel.GPClassifier\n",
441 | "fname = '../outputs/2' + task + '.pkl'\n",
442 | "\n",
443 | "clf = train_and_save(X, y, fname, mtype)\n",
444 | "print(fname)\n",
445 | "print(clf.hypers, clf.ML)"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 21,
451 | "metadata": {},
452 | "outputs": [
453 | {
454 | "name": "stdout",
455 | "output_type": "stream",
456 | "text": [
457 | "0\n",
458 | "100\n",
459 | "200\n",
460 | "300\n",
461 | "400\n",
462 | "500\n",
463 | "600\n",
464 | "700\n",
465 | "800\n",
466 | "900\n",
467 | "1000\n",
468 | "CPU times: user 1h, sys: 16min 42s, total: 1h 16min 42s\n",
469 | "Wall time: 1h 7min 57s\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "%%time\n",
475 | "with open('../outputs/' + fname + '.txt', 'w') as f:\n",
476 | " f.write('name,p,mu,var\\n')\n",
477 | " \n",
478 | "df = pd.read_csv('../inputs/all_lit_chimeras_gaps.txt', index_col=0)\n",
479 | "with open('../inputs/lit_alignment_and_contacts.pkl', 'rb') as f:\n",
480 | " ss, contacts = pickle.load(f)\n",
481 | "amino_acids = ('G', 'A', 'L', 'M', 'F', 'W', 'K', 'Q', 'E', 'S',\n",
482 | " 'P', 'V', 'I', 'C', 'Y', 'H', 'R', 'N', 'D', 'T', '-')\n",
483 | "sample_space = [amino_acids for _ in ss]\n",
484 | "n_splits = 1000\n",
485 | "n_per = len(df.index) // n_splits\n",
486 | "inds = [df.index[n * n_per: (n+1) * n_per]\n",
487 | " for n in range(n_splits)]\n",
488 | "inds.append(df.index[n_splits * n_per::])\n",
489 | "seq_terms = chimera_tools.make_sequence_terms(sample_space)\n",
490 | "struct_terms = chimera_tools.contacting_terms(sample_space, contacts)\n",
491 | "all_terms = seq_terms + struct_terms\n",
492 | "\n",
493 | "\n",
494 | "for i, ind in enumerate(inds):\n",
495 | " seqs = df.loc[ind]['sequence'].values\n",
496 | " if len(seqs) == 0:\n",
497 | " continue\n",
498 | " if i % (n_splits // 10) == 0:\n",
499 | " print(i)\n",
500 | " struct_X, _ = chimera_tools.make_contact_X(seqs, sample_space, contacts, contact_terms=struct_terms)\n",
501 | " seq_X, _ = chimera_tools.make_sequence_X(seqs,\n",
502 | " sample_space=sample_space,\n",
503 | " sequence_terms=seq_terms)\n",
504 | " all_X = np.concatenate([seq_X, struct_X], axis=1)\n",
505 | " \n",
506 | " \n",
507 | " preds = pd.DataFrame(index=df.loc[ind]['name'].values)\n",
508 | " pi, mu, var = clf.predict(all_X)\n",
509 | " preds['pi'] = pi\n",
510 | " var = np.diag(var)\n",
511 | " preds['mu'] = mu\n",
512 | " preds['var'] = var\n",
513 | " with open('../outputs/' + fname + '.txt', 'a') as f:\n",
514 | " preds.to_csv(f, header=False)"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": []
523 | }
524 | ],
525 | "metadata": {
526 | "kernelspec": {
527 | "display_name": "Python [conda env:python36]",
528 | "language": "python",
529 | "name": "conda-env-python36-py"
530 | },
531 | "language_info": {
532 | "codemirror_mode": {
533 | "name": "ipython",
534 | "version": 3
535 | },
536 | "file_extension": ".py",
537 | "mimetype": "text/x-python",
538 | "name": "python",
539 | "nbconvert_exporter": "python",
540 | "pygments_lexer": "ipython3",
541 | "version": "3.6.2"
542 | }
543 | },
544 | "nbformat": 4,
545 | "nbformat_minor": 2
546 | }
547 |
--------------------------------------------------------------------------------
/classification/inputs/2GFP_above_parent123457.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:12d19745f73a6d7cab15890a3cbcd3f70f2ad268f7ca201879532b0677f9cfa2
3 | size 1194349133
4 |
--------------------------------------------------------------------------------
/classification/inputs/GFP_data.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e0a3f2014964954309a59cfe998723256b540cd9328338ce14b8fdd2135624df
3 | size 1191884280
4 |
--------------------------------------------------------------------------------
/classification/inputs/X_and_terms.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b6cab1d93f08394f25d5a9479da3d1007133a21682a120f1823d39641b5891aa
3 | size 966127630
4 |
--------------------------------------------------------------------------------
/classification/inputs/gfp_props.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d8285bfe4bf5c90da7b7ba2c405118ed04653f80aaeba278affc0767f29881b0
3 | size 241396
4 |
--------------------------------------------------------------------------------
/classification/inputs/lit_alignment_and_contacts.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:890911d5dd9fa16f61d7c78db99cfe59a33f4b40cda23e4e4b22bc241ce96c17
3 | size 19286
4 |
--------------------------------------------------------------------------------
/classification/inputs/props.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:177031fb6fd9f95b0a0cac78c0c5579a55f37d652c4bf193bd7601d0d4551e22
3 | size 123292
4 |
--------------------------------------------------------------------------------
/classification/outputs/2GFP_above_parent.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4c32d6190b2233f5fde3198b74215b13f3022eab904c2cfb71b26f7138ac7ed3
3 | size 1193369038
4 |
--------------------------------------------------------------------------------
/classification/outputs/matern52_bin0.1_max_peak_False.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:38911daf0d4028dac936d9cdef8e8ae42d290c1c8777cf76fed81e971e3f23f5
3 | size 596315951
4 |
--------------------------------------------------------------------------------
/regression/.ipynb_checkpoints/.gitignore:
--------------------------------------------------------------------------------
1 | *checkpoint.ipynb
2 |
--------------------------------------------------------------------------------
/regression/GP_matern_5_2_kernel.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false,
8 | "deletable": true,
9 | "editable": true,
10 | "scrolled": true
11 | },
12 | "outputs": [],
13 | "source": [
14 | "from __future__ import division\n",
15 | "import numpy as np\n",
16 | "import matplotlib.pyplot as plt\n",
17 | "import seaborn as sns\n",
18 | "import os\n",
19 | "import pandas as pd\n",
20 | "import pickle\n",
21 | "\n",
22 | "# ML imports\n",
23 | "from sklearn import linear_model\n",
24 | "from sklearn.cross_validation import train_test_split\n",
25 | "from sklearn.model_selection import LeaveOneOut\n",
26 | "from sklearn.metrics.pairwise import euclidean_distances\n",
27 | "from scipy.spatial import distance\n",
28 | "from scipy import optimize, linalg\n",
29 | "import scipy\n",
30 | "from sklearn.model_selection import KFold # import KFold\n",
31 | "\n",
32 | "# custom imports\n",
33 | "import encoding_tools as encoding\n",
34 | "import chimera_tools as chimera\n",
35 | "import GP_tools as GP\n",
36 | "\n",
37 | "# import scipy\n",
38 | "import seaborn as sns\n",
39 | "\n",
40 | "# define plotting settings\n",
41 | "sns.set_context(\"paper\")\n",
42 | "sns.set_style(\"white\")\n",
43 | "\n",
44 | "# Plot adjustments:\n",
45 | "plt.rcParams.update({'ytick.labelsize': 12})\n",
46 | "plt.rcParams.update({'xtick.labelsize': 12})\n",
47 | "plt.rcParams.update({'axes.labelsize': 14})\n",
48 | "plt.rcParams.update({'legend.fontsize': 12})\n",
49 | "plt.rcParams.update({u'axes.titlesize': 16})\n",
50 | "sns.color_palette('colorblind')\n",
51 | "plt.close('all')"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "metadata": {
57 | "deletable": true,
58 | "editable": true
59 | },
60 | "source": [
61 | "## Load model inputs"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 2,
67 | "metadata": {
68 | "collapsed": false,
69 | "deletable": true,
70 | "editable": true,
71 | "scrolled": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "# load ephys data\n",
76 | "path_inputs = 'inputs/'\n",
77 | "df_input = pd.read_csv(path_inputs+'Ephys_data_formatted.csv')\n",
78 | "\n",
79 | "# load library files\n",
80 | "file_c = path_inputs + 'shmetis_c_10_21_0/chimeras.output'\n",
81 | "file_n = path_inputs + 'shmetis_n_10_21_0/chimeras.output'\n",
82 | "\n",
83 | "# add sequence information to dataframe based on chimera code\n",
84 | "df_input = chimera.chimera_code2seq_convert(file_c,file_n,df_input)\n",
85 | "\n",
86 | "# load contact information\n",
87 | "fname_1 = path_inputs + 'alignment_and_contacts_C1C2.pkl'\n",
88 | "\n",
89 | "# load the contact map\n",
90 | "with open(fname_1, 'rb') as f:\n",
91 | " ss, contacts = pickle.load(f)\n",
92 | " \n",
93 | "# only use the first three parents\n",
94 | "ss = [i[0:3] for i in ss]"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "metadata": {
100 | "deletable": true,
101 | "editable": true
102 | },
103 | "source": [
104 | "## Data formatting"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 3,
110 | "metadata": {
111 | "collapsed": true,
112 | "deletable": true,
113 | "editable": true
114 | },
115 | "outputs": [],
116 | "source": [
117 | "def data_format(property_, df):\n",
118 | " # test data only includes gen 10\n",
119 | " df_test_data = df[df.gen == 10]\n",
120 | "\n",
121 | " # remove ChR_29_10 & ChR_30_10 for kinetics and spectra because their currents are too low for accurate measurements\n",
122 | " if property_ == 'green_norm' or property_ == 'kinetics_off':\n",
123 | " df_test_data = df_test_data[df_test_data.chimera != 'ChR_29_10']\n",
124 | " df_test_data = df_test_data[df_test_data.chimera != 'ChR_30_10']\n",
125 | "\n",
126 | " # training data excludes test data (gen 10)\n",
127 | " df_data = df[df.gen != 10]\n",
128 | "\n",
129 | " # make a separate dataframe for the selected property\n",
130 | " df_select = pd.DataFrame()\n",
131 | " df_select['prop'] = df_data[str(property_)]\n",
132 | " df_select['seq'] = df_data['seq']\n",
133 | " df_select['block_k'] = df_data['block_k']\n",
134 | " df_select['chimera'] = df_data['chimera']\n",
135 | " df_select.dropna(inplace=True)\n",
136 | "\n",
137 | " # normalize training data\n",
138 | " log_data = np.log(df_select.prop.values)\n",
139 | " y = (log_data - np.mean(log_data))/np.std(log_data)\n",
140 | " seq = df_select.seq.values\n",
141 | "\n",
142 | " # make a separate dataframe for the selected property for the test set\n",
143 | " df_select_test = pd.DataFrame()\n",
144 | " df_select_test['prop'] = df_test_data[str(property_)]\n",
145 | " df_select_test['seq'] = df_test_data['seq']\n",
146 | " df_select_test['block_k'] = df_test_data['block_k']\n",
147 | " df_select_test['chimera'] = df_test_data['chimera']\n",
148 | " df_select_test.dropna(inplace=True)\n",
149 | "\n",
150 | " # normalize test data\n",
151 | " log_data_test = np.log(df_select_test.prop.values)\n",
152 | " y_true_test = (log_data_test - np.mean(log_data))/np.std(log_data)\n",
153 | " seq_test = df_select_test.seq.values\n",
154 | " return log_data, y, seq, y_true_test, seq_test, df_select, df_select_test"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {
160 | "deletable": true,
161 | "editable": true
162 | },
163 | "source": [
164 | "## Encodings"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 4,
170 | "metadata": {
171 | "collapsed": true,
172 | "deletable": true,
173 | "editable": true
174 | },
175 | "outputs": [],
176 | "source": [
177 | "def encoding_inputs(df_select, df_select_test, ss, contacts):\n",
178 | " # one_hot_encode based on sequence & structure\n",
179 | " X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n",
180 | " X = np.array(X)\n",
181 | "\n",
182 | " # also encode the test sequences\n",
183 | " X_true_test = encoding.one_hot_(df_select_test['seq'].values, ss, contacts)\n",
184 | " X_true_test = np.array(X_true_test)\n",
185 | " \n",
186 | " return X, X_true_test"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {
192 | "deletable": true,
193 | "editable": true
194 | },
195 | "source": [
196 | "## Train on split training data"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 5,
202 | "metadata": {
203 | "collapsed": true,
204 | "deletable": true,
205 | "editable": true,
206 | "scrolled": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "def cross_validation(X, log_data, property_):\n",
211 | " path_outputs = 'outputs/'\n",
212 | "\n",
213 | " kf = KFold(n_splits=20) # Define the split\n",
214 | " kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator\n",
215 | "\n",
216 | " mu_s = []\n",
217 | " var_s = []\n",
218 | " y_s = []\n",
219 | " \n",
220 | " for train_index, test_index in kf.split(X):\n",
221 | " X_train, X_test = X[train_index], X[test_index]\n",
222 | "\n",
223 | " log_data_train, log_data_test = log_data[train_index], log_data[test_index]\n",
224 | "\n",
225 | " y_train = (log_data_train - np.mean(log_data_train))/np.std(log_data_train)\n",
226 | " y_test = (log_data_test - np.mean(log_data_train))/np.std(log_data_train)\n",
227 | "\n",
228 | " initial_guess = [0.1,10]\n",
229 | "\n",
230 | " # take the log of the initial guess for optimization \n",
231 | " initial_guess_log = np.log(initial_guess)\n",
232 | "\n",
233 | " # optimize to fit model\n",
234 | " result = scipy.optimize.minimize(GP.neg_log_marg_likelihood, initial_guess_log, args=(X_train,y_train), method='L-BFGS-B')#,\n",
235 | "\n",
236 | " # next set of hyper prams \n",
237 | " prams_me = [np.exp(result.x[0])**2, np.exp(result.x[1])]\n",
238 | "\n",
239 | " # next use the trained GP model to predict on test data\n",
240 | " mu, var = GP.predict_GP(X_train, y_train, X_test, prams_me)\n",
241 | " \n",
242 | " # un-normalize (convert back to the original scale)\n",
243 | " y_test_real = np.exp(y_test*np.std(log_data_train) + np.mean(log_data_train))\n",
244 | " mu_real = np.exp(mu*np.std(log_data_train) + np.mean(log_data_train))\n",
245 | " \n",
246 | " mu_s.append(mu)\n",
247 | " var_s.append(var)\n",
248 | " y_s.append(y_test)\n",
249 | "\n",
250 | " # reformat all\n",
251 | " y_s_all = [j for i in y_s for j in i]\n",
252 | " mu_s_all = [j for i in mu_s for j in i]\n",
253 | "\n",
254 | " # plot results\n",
255 | " plt.figure('My GP test set evaluation', figsize=(1.5, 1.5))\n",
256 | " plt.plot(y_s_all, mu_s_all, 'o', ms=3, color='k')\n",
257 | "\n",
258 | "\n",
259 | " # calculate correlation \n",
260 | " measured = y_s_all\n",
261 | " predicted = mu_s_all\n",
262 | "\n",
263 | " par = np.polyfit(measured, predicted, 1, full=True)\n",
264 | " slope=par[0][0]\n",
265 | " intercept=par[0][1]\n",
266 | "\n",
267 | " # calc correlation \n",
268 | " variance = np.var(predicted)\n",
269 | " residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(measured, predicted)])\n",
270 | " Rsqr = np.round(1-residuals/variance, decimals=2)\n",
271 | " \n",
272 | " print('20-fold corss validation of GP regression model')\n",
273 | " print('R = %0.2f'% np.sqrt(Rsqr))\n",
274 | "\n",
275 | " max_x = np.max(y_s_all)\n",
276 | " min_x = np.min(y_s_all)\n",
277 | " \n",
278 | " plt.plot([min_x, max_x], [slope*min_x+intercept, slope*max_x+intercept], '-', color='k')\n",
279 | " plt.savefig(path_outputs + str(property_)+'_matern_kernel_CV_fig1.pdf', bbox_inches='tight', transparent=True)\n",
280 | " plt.show()\n",
281 | " return measured, predicted\n"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {
287 | "deletable": true,
288 | "editable": true
289 | },
290 | "source": [
291 | "## Evaluate on whole training set"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 6,
297 | "metadata": {
298 | "collapsed": false,
299 | "deletable": true,
300 | "editable": true,
301 | "scrolled": false
302 | },
303 | "outputs": [],
304 | "source": [
305 | "def ML_train(X, y):\n",
306 | " # test the optimization of the hyperparameters\n",
307 | " initial_guess = [0.9,0.9]\n",
308 | "\n",
309 | " # take the log of the initial guess for optimization \n",
310 | " initial_guess_log = np.log(initial_guess)\n",
311 | "\n",
312 | " # optimize to fit model\n",
313 | " result = scipy.optimize.minimize(GP.neg_log_marg_likelihood, initial_guess_log, args=(X,y), method='L-BFGS-B')\n",
314 | " \n",
315 | " print('Full GP regression model')\n",
316 | " print('Hyperparameters: ' + str(np.exp(result.x[0])) + ' ' + str(np.exp(result.x[1])))\n",
317 | "\n",
318 | " # next set of hyper prams \n",
319 | " final_prams = [np.exp(result.x[0]), np.exp(result.x[1])]\n",
320 | " \n",
321 | " return final_prams\n",
322 | " \n",
323 | "def ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_, df_test=None):\n",
324 | "    '''Predict the held-out test set with the trained GP, plot it and export a CSV.\n",
325 | "\n",
326 | "    Args:\n",
327 | "        X, y: training inputs/targets, forwarded to GP.predict_GP.\n",
328 | "        X_true_test: test inputs, forwarded to GP.predict_GP.\n",
329 | "        y_true_test: normalized test targets.\n",
330 | "        log_data: values whose mean/std are used to undo the normalization.\n",
331 | "        final_prams: hyperparameters forwarded to GP.predict_GP.\n",
332 | "        property_: property name used in the output file names;\n",
333 | "            'kinetics_off' switches the fit and plot to a log10 scale.\n",
334 | "        df_test: DataFrame that receives the y/mu/y_real/mu_real columns in\n",
335 | "            place; defaults to the notebook-level df_select_test.\n",
336 | "\n",
337 | "    Side effects: prints R, shows the plot and saves it as a PDF, and\n",
338 | "    writes the CSV; both files go to outputs/.\n",
339 | "    '''\n",
340 | "    path_outputs = 'outputs/'\n",
341 | "    if df_test is None:\n",
342 | "        df_test = df_select_test\n",
343 | "\n",
344 | "    # use the trained GP model to predict the full test set\n",
345 | "    mu_true_test, var_true_test = GP.predict_GP(X, y, X_true_test, final_prams)\n",
346 | "\n",
347 | "    # convert the test predictions and y back to unnormalized data\n",
348 | "    log_mean, log_std = np.mean(log_data), np.std(log_data)\n",
349 | "    y_test_real = np.exp(y_true_test*log_std + log_mean)\n",
350 | "    mu_test_real = np.exp(mu_true_test*log_std + log_mean)\n",
351 | "\n",
352 | "    # kinetics_off is fitted and plotted on a log10 scale, everything else as is\n",
353 | "    log_scale = (property_ == 'kinetics_off')\n",
354 | "    x_fit = np.log10(y_test_real) if log_scale else y_test_real\n",
355 | "    y_fit = np.log10(mu_test_real) if log_scale else mu_test_real\n",
356 | "    slope, intercept = np.polyfit(x_fit, y_fit, 1, full=True)[0]\n",
357 | "\n",
358 | "    # coefficient of determination of the linear fit\n",
359 | "    variance = np.var(y_fit)\n",
360 | "    residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(x_fit, y_fit)])\n",
361 | "    Rsqr = np.round(1-residuals/variance, decimals=2)\n",
362 | "    print('GP regression model test set')\n",
363 | "    print('R = %0.3f'% np.sqrt(Rsqr))\n",
364 | "\n",
365 | "    # plot measured vs. predicted\n",
366 | "    plt.figure('True test', figsize=(1.5, 1.5))\n",
367 | "    plt.plot(x_fit, y_fit, 'o', ms=3, color='k')\n",
368 | "\n",
369 | "    # draw the fitted line in the coordinates the fit was made in; the log\n",
370 | "    # branch used to apply the log-space slope/intercept to raw x values\n",
371 | "    if log_scale:\n",
372 | "        x_line = np.array([np.min(x_fit), np.max(x_fit)])\n",
373 | "    else:\n",
374 | "        # the linear-scale line is anchored at x = 0\n",
375 | "        x_line = np.array([0, np.max(x_fit)])\n",
376 | "    plt.plot(x_line, slope*x_line + intercept, '-', color='k')\n",
377 | "    plt.savefig(path_outputs + str(property_)+'_matern_kernel.pdf', bbox_inches='tight', transparent=True)\n",
378 | "    plt.show()\n",
379 | "\n",
380 | "    # export csv with predicted values\n",
381 | "    df_test['y'] = y_true_test\n",
382 | "    df_test['mu'] = mu_true_test\n",
383 | "    df_test['y_real'] = y_test_real\n",
384 | "    df_test['mu_real'] = mu_test_real\n",
385 | "\n",
386 | "    df_test.to_csv(path_outputs+ 'matern_kernel_gen10_'+str(property_)+'.csv')\n",
387 | "    return"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {
393 | "deletable": true,
394 | "editable": true
395 | },
396 | "source": [
397 | "# Train models for different properties"
398 | ]
399 | },
400 | {
401 | "cell_type": "markdown",
402 | "metadata": {
403 | "deletable": true,
404 | "editable": true
405 | },
406 | "source": [
407 | "### Max_peak"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 7,
413 | "metadata": {
414 | "collapsed": false,
415 | "deletable": true,
416 | "editable": true
417 | },
418 | "outputs": [
419 | {
420 | "name": "stdout",
421 | "output_type": "stream",
422 | "text": [
423 | "20-fold corss validation of GP regression model\n",
424 | "R = 0.77\n"
425 | ]
426 | },
427 | {
428 | "data": {
429 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHcAAABvCAYAAADWvF98AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADFdJREFUeJztnV1oHNUbxp/dyX40VaOU4JbGVAomwTR+paXVaoOipaIt\nVREiQawXRpMl3bqhGBCNRUhRsoKRGMSKpRgvVExLL/xCmoCCF1UUY4XSIm1EN2xjQ01N083u87/Q\nmf/sZGZnZnd2ZnYzPyh0Z2f2vDnPed/zno+Z8ZEkPCoSv9MGeJQOT9wKxhO3gvHErWA8cSuYKjsK\nuXz5MiYnJ1FbWwtBEOwosqLJZDJIpVJYv349wuGw5nm2iDs5OYmOjg47ilpWjI6OYsOGDZrf2yJu\nbW2tZEwkErGjyIommUyio6NDqlctbBFXDMWRSAR1dXV2FFk29Pb2Ynh4GNFoFIlEwtS1el2cl1A5\nQG9vL8LhsCTswsIC3n777ZzjlkAbmJqaYkNDA6empuwozvWEQiECYDgcZmtrKwGwtbU153g+jNan\n57kOEI1GEQ6H0d3djcnJSQDAL7/8knPcCjxxHYL/rdfIBU0kEpifnzfd92phS0LlkYu8n7VSTCWe\n5zqA1eFXC09cE1iVzZoNvwWXW2zml81m+cILL/DgwYNFZ3dux2g2KxKPxxkKhRiPx4sqNxAI5JRr\nS7Z85swZPPXUU/jss8+K+ZmywWw4lfetauh55OzsLLq6upBOp+H3+9HV1WXO4GJa1P79+zk2NrZs\nPNcs8Xic4XCY8Xhc1YsFQSAACoKQc102m+WHH37I66+/nitXruQbb7zBdDotfW+0Pi2ZxPDE1Q/B\nYkgHIJ2jJu7p06e5bds2AuCuXbt47ty5Jb/lTWLYjF4Ijkaj0v/Fc2KxmDQ/vHfvXgwMDKCxsRFf\nfvkldu7cibGxMdxwww3OJVSk+z3XqsRGr4x8IVh5jojo0T6fj36/X/JmedKmTOS8sCzDbJZrV3nn\nz5/nqlWrCIDV1dX84YcfVBuA8pgXlmXYNWmgVp5aSCWJw4cPo6mpCTMzMwD+3V1x++23q46BC56W\nLLqZGsBpz9XDSNiWn2MmzCu9+Omnn6bP5yMAPv744+zs7FziqXrYGpatMsYpjIRR+TlmwrwYUvfs\n2cP+/n4pYw4EAgXnAl5YNoEYRpubmzWz0mg0CkEQkE6npY1p+cK8GI4BoL6+HkNDQ9i/fz82btyI\nUCiEnp4e3Qy7aEw1mQJxu+eK6HmkGY+VZ8H4z1uV1a2WPBnB89w8aI0b9RIhufcqr5Wfn81msXXr\nVgD/X7cFgNbW1pxrtBIly7bbmGoyBeI2zxW9ShAEzT5Py0u1rhWPB4NB3n333TleKwiCKe/UixCe\n5+ZB9FAAmpvTtIZPovdmMpmc/rKzsxOCIGBxcRHJZBKBQCDHa80MYywbuhluTkXgNs8Vkfd5et4i\nz2yV88RffPEF161bx0AgwE2bNtHv9+f0tcqFgWIp26GQHVOFWuXmS27k4ovnPvvss2xvbycAbt26\nlSdPnswRXhAE6Z+Vf0/ZilvqqcJCG49c/Ewmw5GREdbU1DAcDrOqqorPP/+8dJ5c0FL8PWUrbqHD\nA73fFAXVWkM1yk8//cTNmzcTAHfv3s1gMKgbzq3+e8pW3FIg9x4j4iq9jyTn5ua4b98+CoLAxsZG\nHj9+nCRzNpXbhSeuDOVynJ4nyfvNcDjMY8eOce3atQyFQrzrrrukpTm15MoOKl7cUiZeouf6/X7e\ndNNNBMD6+noGAgHJ8+XJlfyzHfZVvLilTLwWFxc5NDTEq6++mrW1tfzggw/o9/uliQllyM63CF8K\n+2wT9/jx43z44Ye5bds29vT08O+//y7YGDOIFdra2mrpcOP777/nhg0bCIAtLS1SCJbPNpmxr2w9\nd2Zmhps3b+Zvv/1Gknz99dfZ399fsD
GFoOwfC+XixYvcu3cv/X4/m5ub+c033ywZs8rFEsOueHee\nloilCM+2iHv06FE+88wzOYXecccdzGazBRmjRb4KUstszTI2Nsa6ujpWVVVREATGYjHd35YLn69h\nle0495133uFLL70kfU6n02xoaFgSmosV14qd/mrHzp49y507dxIAt2/frjtmVZYhdgv5wm/ZjnNH\nRkZUxb106VJeY8yGKiMVpDb3q7WDMJ1Oc3BwkCtXrmQkEuFDDz3EYDCoK5RWeXZji7hHjhzhc889\nJ33+/fffuXHjRl1jjCy5mUVt7ldtB+ETTzzB2267jT6fj93d3ZydnS0odNq9o1KOLeKeP3+ed955\np5RQDQ4Osq+vT9cYsaLV9ugWilJQpWfNzs4yGo3S5/Px1ltv5XfffbfkWr3kKF95dmLbUGh8fJw7\nduzg9u3b2dnZyQsXLhg2ppQVJHpWKBTiRx99xNWrV7O6upqDg4M5992oXVNKb7QinFf8JIYe8Xic\nwWCQN954IwFwx44dPHv2rOa54rCm2MZm9J6hYhrQshb3ypUrPHDgAFesWME1a9bw008/XTI8k5Ov\nws0uDBhZ9C+2AS1bcb/99luuX7+efr+fsViMFy9e1PUmtfGsclHA6KYVO/riZSfuX3/9xc7OTsnL\nTpw4IX1ndtM5yZwFAruX9PSoiA1yRrZ4ksTo6Ciamppw6NAhVFVV4Z577snZRmpkw5nWOYIggCRO\nnDhh2jbHcVNLU96PA52pvVOnTvH+++8nAD722GOmZpj0MLOnym7K0nPlt1cMDw9Lx5XetLCwgFdf\nfRUtLS04deoUjh07hk8++QQtLS0AgObmZkvsYZ4Xt9h952BBuKmliZmpz+fTHJaMj4+zqamJgiBw\n3759nJubk76z0puc9Ew9yjKhyrd8l0qluHv3bgLgpk2b+OOPPy65Xi+UmplA0JvxcpKyFFdtSJLN\nZvn+++9z1apVrKmp4cjICDOZTEF2FDOn7SZPLgtx9bzh119/ZVtbGwGwvb2df/75Z1F2FDOn7eRc\nspKyEFfLG/bs2SNtUFu3bh0///xzS+1xc8g1QlmIq+YNX331Vc59Nj09PZq/W4woVm5Ut5uyEFfO\n9PQ0Ozo6CIBr1qwxtC+qmH7Q7EZ1N1E249xsNot3330X9fX1GB0dxQMPPIBz584hHo8XPKtkBPm1\nsVgM4XAYsVismD/FfTjZ0n7++Wdu2bKFAKR9wWaTnFJuTHdrP2xbWC7kkbzz8/Ps6+tjVVUVGxoa\n+PXXXy+55cNIxZYynLpp6KPEFnFPnz7NJ598krfccospcT/++GMGg0G+8sornJ+fX3K+0Yotpbhu\nGvoosaXPHR0dxaOPPooHH3zQ1HWPPPIIZmZm0N/fr/ouOqN9aSn7SqtfJuEEuuJOTEzg5ptvXvLv\nyJEjePnll7Fr1y7ThQqCgKuuukr6rLZ8lk6n8eabb+ZdUqsEAUqKFWGi2Ad7KsOwco7ZzcmNE5TN\nUAhYGobFJ8YIgoDu7m7TT1ori4V0O7CiJZX6kbxmkxs3Z7pWUFaeq4bc+8z2rWWxkG4Hbmppcird\n+4qhLD1X7q2e91mAUy1NLQP2vNUYrvdcZQbc29uLxcVFKUP2KB7HxFWG3eHhYWQyGQQCAW9SwiIc\n7XMp2zrq9bElwKk+wutfC8f1fa7nqaXHsTdfJxIJr28tMa4a53pYiyduBeMqcb3VHGtxlbglf4nS\nMsNV4noZtLU4li2r4WXQ1mKLuJlMBgCQTCbtKK7iEetRrFctbBE3lUoBADo6OuwobtmQSqWwdu1a\nze99ZJ5nA1jE5cuXMTk5idraWukd7R6Fk8lkkEqlpLeBamGLuB7O4Kps2cNaPHErGE/cCsb2ce7R\no0fx3nvvwefzYcWKFXjxxRel50c5zfj4OBKJBK5cuYLGxkYMDAzk3PbiNKbrzoa1ZYkzZ85wy5Yt\nnJ
6eJvnvM6Xa2trsNEETo29acYpC6s5WcaempqR34JH/Pmm9ubmZCwsLdpqhitE3rThFIXVXkrA8\nMTGBrq6uJccHBgakuwJJ4sCBA7jvvvsQDAZLYYYpkskkIpGI9DkSiWBubg6XLl1yRWiuq6tDXV0d\nAON1VxJx29racPLkSc3v//nnH/T19SGZTOLgwYOlMME02WxW9bjf766c00zd2W75H3/8gfb2dgiC\ngMOHD+Oaa66x2wRVVq9eLU2TAsD09DRqampQXV3toFW5mK67kncWMi5cuMB7772Xb731lp3FGsLo\nm1acopC6s3X6cWRkBENDQ2hoaMg5fujQIVx33XV2maHJxMQEEokE0uk06uvr8dprr+Haa6912iwA\nhdWdN7dcwbgrW/CwFE/cCsYTt4LxxK1gPHErGE/cCsYTt4LxxK1g/geT6zWgANqEbAAAAABJRU5E\nrkJggg==\n",
430 | "text/plain": [
431 | ""
432 | ]
433 | },
434 | "metadata": {},
435 | "output_type": "display_data"
436 | },
437 | {
438 | "name": "stdout",
439 | "output_type": "stream",
440 | "text": [
441 | "Full GP regression model\n",
442 | "Hyperparameters: 0.0486636299687 19.6621464374\n",
443 | "GP regression model test set\n",
444 | "R = 0.927\n"
445 | ]
446 | },
447 | {
448 | "data": {
449 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAG4AAABvCAYAAAANB/VeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC2pJREFUeJztnW1MU9cfx7/10lKfUIk45jrdDMMXK26JZjMR11EQHG6y\nETU1sk19gcmKLMCysTF1m8SHmfqA6dSI82HiSCDLEEUdDltdzIyBTCUmmtQ164JFIEAUUOD293/h\n4M9TH723tw/nk5DQ9txzvjnf8zs999xzTmVERGAEHeOkFsDwDWZckMKMC1KYcUEKMy5IiRAqo8eP\nH6OxsRExMTHgOE6obMMWnufR0tICtVoNpVI56nPBjGtsbMSaNWuEyo7xH2VlZViwYMGo9z0yrqqq\nCkeOHIFMJsP48eNRVFSEhISEYWliYmIGC4qNjRVAcnhjt9uxZs2awXodBbnBYrHQokWLqLm5mYiI\nTCYTaTSaUelsNhvFx8eTzWZzlyVjDLq6uujWrVuDr93Vp9uIUygUKC4uxowZMwAAarUara2t6O3t\nhUKhELSVhSudnZ1IT0+HzWbDP//849E1bo1TqVRQqVQAACLC9u3bodVqmWkC0d7ejrS0NNy5cwfn\nz5/3+DqPByfd3d0oLCyE3W5HaWmpTyIZw2ltbUVqaiqsVit+//33MQchzvDoPq6pqQk6nQ4cx+HE\niROIioryWSzjKQ8ePIBWq4XNZkNdXZ1XpgEeRFxHRweysrKQmZmJnJwcn4Uy/s/9+/eRnJyMtrY2\nXLp0CWq12us83Ebczz//jPv376O2thYZGRmDf+3t7T6JDnQKCgqgVCpRUFAgSv7//vsvNBoNOjo6\nYDabfTINgPvbAU8JlduByMhIAkBKpVLwvK1WK82ZM4dUKhXdvXvXZVp39cnmKkeg1+uhVCrxySef\nCJqvxWLBW2+9BYfDgcuXL+OVV155pvyYcSMwGAzo6emBwWAQLM+7d+9Co9FALpfDbDbj5ZdfHpXG\n2y6aGScyt2/fhkajwaRJk2A2mzFr1qwx0xmNRjx58gQ//PCDR/ky40Tk5s2bePvttxEdHQ2TyYQX\nXnjBaVqvu2ihvnhDZXDiivz8fIqMjKT8/Pxh/49FfX09RUdH02uvvUYPHjzwuix39cmM84KhI05X\no89r167R1KlTaf78+dTW1uZTWWxUKSBDuzNnXdvVq1eRkpKCuXPn4uLFi4iOjhZHjE/NwYcWEg6Y\nTCaaOHEiJSYmUmdnp9v0rrpb1lWKxMhKr62tpfHjx1NSUhI9fPjQozxcdbfMOA9xN9gYydBKr6mp\nocjISFqyZAl1dXV5VaZSqWQR9yx4O9U1UOkZGRmkUCho2bJl1NPTI5geNjjxEG/vowwGA06ePImz\nZ88iPT0dv/zyy5irsUTDXy0k1Dh16hRxHEfx8fGkUCg87mI9hUWcCBw/fhxZWVnQ6XSwWq3o7e31\neKpKKJhxXlJaWop169bho48+wvHjx5GTkyPK0wS3+Cu0QwGj0UgAKDs7m3ieF7Us1lUKxN69e6HX\n65GTk4ODBw9i3Dhpq44Z5wE7d+5EXl4eCgoKUFJSAplMJrUkZpw7tm7disLCQnz11VfYtWtXQJgG\nMOOcQkTYtGkTNm/ejG+++QbFxcUBYxog4G6dUIKI8MUXX2DXrl3Ytm0bvvzyS6kljYIZNwIiQl5e\nHvbt2weDwYD8/HypJY0JM24IDocDer0eBw8exP79+wN6ATAz7j94nkd2djaOHj2KQ4cOITs7W2pJ\nLmHGAejv78f69etx8uRJ/Pjjj1i7dq3UktwS9sb19fXhww8/REVFBX766aeg2Q4d1sb19vZCp9Oh\nuroa5eXlWLlypdSSPCZsjXvy5AlWrFiBCxcuoLKyEhkZGVJL8oqwNK6npwcffPABTCYTfv31V6Sn\np0styWvCzriuri4sX74cV69exenTp5
Gamiq1JJ8IK+MePnyIZcuWob6+HjU1NUhKSpJaks94NFdJ\nRCgsLMSRI0fE1iManZ2dSEtLw19//YULFy64NE3szY1C4NY4i8WCjz/+GOfOnfOHHlHQ6/WYNm0a\nGhoaUFtbi8TERJfpvd05IwVujSsrK0NmZibeeecdf+gRnNbWVhw4cAD0dCki3nzzTbfXiLW5UUjc\nfsdt3rwZAPDnn3+KLkZompubkZKSAqVSCZ7nPZ57NBgMgm5sFIOQHZw0NTUhOTkZ7e3tuH79Ol59\n9VWpJQlKSBpns9mg1WrR3d0Ns9mMuXPnSi1JcELOOKvVCq1Wi/7+fpjNZsTFxUktSRRCaumCxWKB\nRqMBEeHy5cshaxrgRcTt2LFDTB3PzJ07d6DVajFhwgTU1dXhxRdflFqSqIRExA2cbBAVFQWz2Rzy\npgEBbFxBQQEiIiIQERHhcgZj4GSD6dOnw2QyYebMmX5UKR0Ba5zRaATP8+B53ukMRkNDA5KSkjBz\n5kxcunQJzz33nJ9VSkfAGqfX68FxHDiOG3MG49q1a9BqtZgzZw7q6uqcn10cqvhrk4KQXLlyhSZP\nnkwLFy6kjo4O0cuTgpDb9GEymbB06VK8/vrr+O233zBlyhSpJUlCUBl38eJFpKen44033sC5c+cw\nefJkqSVJRtAYV1NTg3fffReLFy/GmTNnMHHiRKklSUpQGFdVVYX3338fKSkpqKqqwoQJE6SWJDkB\nb1xlZSUyMzPB8zzi4uL8e7JBABPQxp06dQo6nQ7A03X9hw4dklhR4BCwxg2cbLB69Wrk5uYG/BNp\nfxOQj3UOHz6MDRs2YO3atTh8+DA4jsOePXuklhVQSBJxQ1dRjVxRZTQakZ2djYSEBJSVleHzzz9/\npvxDFn/d6Q/F2YGdu3fvJgCUm5tLCoXC52PkxTyC3l8E5MzJWAd2LliwAPn5+fjss8+wd+9erw5+\nGRlhwbBK65nxVwtxhsPhoG+//ZYAUFFRETkcDq/LDoUIG0lARtyQRoOvv/4aW7ZswXfffTd4soG3\n31FhEWEj8VcLGUleXh5xHEcAaMeOHcM+C8UI8paAjDgiQklJCXieh0wmQ1FR0bAn3WEZQd7irxYy\nAM/zNG/ePAIw6i+cI2wkARVxeXl5kMvluHnz5qjPnD3pZoyNqMYNHWT09/ejpKQEDocDMpkMSqVy\n8IgljuPw6aefwmg0hvZNs5CIGdoDg4zIyEhatWoVyWQyksvlg8fgDj0FnA1IhiPpKegDhsTFxVFE\nRARVVFQ4vd7VUe7hiKTfcf39/ejr68O9e/dQWVmJFStWjEoz0J0CEPx320IasVpId3c3yWQyAkBy\nudzpdayLHBvJIq66uhocx0Eul2Pjxo1O07F7Nh8Rq4XwPE/t7e1CZR92SBZx48aNw9SpU8XKPuwR\n7Ak4z/MAALvdLlSWYc1APQ7U60gEM66lpQUAgub0uWChpaUFs2fPHvW+jIhIiAIeP36MxsZGxMTE\ngOM4IbIMa3ieR0tLC9Rq9ZhLEgUzjuFfAnZ5HsM1zLgghRkXpIhinMlkwnvvvYe0tDTk5ubi0aNH\nYhQjGFVVVVi+fDkyMjKg0+lw69YtqSW5R+g7/ra2Nlq4cCH9/fffRET0/fff05YtW4QuRjAsFgst\nWrSImpubiejpT0JrNBppRXmA4BH3xx9/ICEhAS+99BIAYPXq1aiurgYF6OBVoVCguLgYM2bMAACo\n1Wq0trait7dXYmWuEXzvgN1uR2xs7ODr2NhYPHr0CF1dXZg0aZLQxT0zKpUKKpUKwNNFTNu3b4dW\nq4VCoZBYmWsEN87hcIz5vtQ/lOeO7u5uFBYWwm63o7S0VGo5bhG8Np9//vnB6S/g6ZmRU6ZMCehd\npE1NTdDpdOA4DidOnEBUVJTUktwiuHGJiYm4ceMGrFYrAKC8vBzJyclCFyMYHR0dyMrKQmpqKvbs\n2R
M0O15FmfIym80wGAzo6+vDrFmzsHPnzoB9xHPgwAGUlJQgPj5+2PvHjh3DtGnTJFLlHjZXGaQE\n9oiB4RRmXJDCjAtSmHFBCjMuSGHGBSnMuCCFGRek/A+4SJtQR/DLjgAAAABJRU5ErkJggg==\n",
450 | "text/plain": [
451 | ""
452 | ]
453 | },
454 | "metadata": {},
455 | "output_type": "display_data"
456 | }
457 | ],
458 | "source": [
459 | "# select the property of interest\n",
460 | "property_ = 'max_peak'\n",
461 | "\n",
462 | "# format data for property \n",
463 | "log_data, y, seq, y_true_test, seq_test, df_select, df_select_test = data_format(property_, df_input)\n",
464 | "\n",
465 | "# encode sequences\n",
466 | "X, X_true_test = encoding_inputs(df_select, df_select_test, ss, contacts)\n",
467 | "\n",
468 | "# train and CV model\n",
469 | "measured_CV, predicted_CV = cross_validation(X, log_data, property_)\n",
470 | "\n",
471 | "# train model on whole training set\n",
472 | "final_prams = ML_train(X, y)\n",
473 | "\n",
474 | "# use model to predict on test set and evaluate accuracy\n",
475 | "ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_)"
476 | ]
477 | },
478 | {
479 | "cell_type": "markdown",
480 | "metadata": {
481 | "deletable": true,
482 | "editable": true
483 | },
484 | "source": [
485 | "### Norm_green"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 8,
491 | "metadata": {
492 | "collapsed": false,
493 | "deletable": true,
494 | "editable": true
495 | },
496 | "outputs": [
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "20-fold corss validation of GP regression model\n",
502 | "R = 0.90\n"
503 | ]
504 | },
505 | {
506 | "data": {
507 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHUAAABvCAYAAADSSY9BAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADQ1JREFUeJztnW1MHEUYx//L3t0ulBSsBU4LbTUI8eBiU0hFMMW2avqh\nvZrWUhRCIkZDIFjv+FK1xDSRWo1XbQ1gbFWiUdTEWDBNVGoEpdYm9S3QalRUAqFQrmKr0N4dd48f\nyG7vFe51b9nuL7nkbnd2Z7L/Z56ZeWZuhyEigoqiSEp0AVRijyqqAlFFVSCqqApEFVWBaKTI5OrV\nqxgcHERGRgZYlpUiS0XjcrkwOTmJwsJC8Dzvd14SUQcHB1FVVSVFVtcV7777LoqLi/2OSyJqRkaG\nWAi9Xi9FlopmfHwcVVVV4nP1RRJRBZer1+uRnZ0tRZaKoKmpCa2trWhoaIDVavU7H6wpUztKMqOp\nqQk8z4uC2u12HDx4EAzDICsrK6R7qKLKiKamJhw8eBB2ux1tbW1oaGgAwzDi+QsXLoR0H1VUGdHa\n2ip+r6+vh9VqRSSheVVUGdHQ0ACe52GxWMQ21LOmhooqagLxbD8BwGq1or6+Hq2treIxs9kc/o1J\nAkZGRigvL49GRkakyE62WCwW4jiOLBYLERExDEMAiGEY8TwAAkAsyxLHcfTEE0+IxwS5FnqeqqgS\nwnEcASCe54mI/MQSzguiCoKHK6rqfiVEaDMLCgrA8zwyMzMBAMnJyWAYBna7HQBQVFSEqqoqMAyD\n5ORk8fpQ21dVVAmxWq24cuUKBgcHYbfbcfnyZRARrly54pXuhx9+wLFjx2AwGHD27FlYLBbwPB9y\n+6qKmgCEGltfXw9grmZ64na7UVpaim+++QarV68WjSFQVCkgUrQlapu6ME6n06vj5HQ6xXO+HSy1\nTV0EXL58GSaTCQCg0WhgNpuh0WjEIc+hQ4fEKFMoqKKGiO+YMlYMDw+jrKwM/f39OH78OJxOp+hm\nhdgvAC93vSBSuBYluF/f4Ugs+PbbbykrK4tWrVpFNTU1Xi6WaM7t8jzvdYxIHafGjGAPOFI++OAD\n4nme9Ho96XQ6cVwaitGoosoIi8VCOp2OSktLCQDt2rWLdDqdV5AhMzPTr8b6oooqIwQBAVBzczO5\nXC7RAwg1VfgIXiGQwKqoMmFycpJWrFhBAGjz5s1+5wVxi4qKvAQN5JIXep5RL2fp7e2F1WqFw+FA\nfn4+9u/fj9TU1Ghvqyh+/vlnbNmyBXa7Hf39/SgrK/NLY7VaAwYX2traQu/1CkRjfRcvXqSSkhL6\n888/iYjoxRdfpGeffdYv3fVUU31dZk9PD6WlpdHtt99OQ0NDMckjru63q6uLHnvsMa/M1q5dS263\nO6xCKAlPl/naa68Ry7J033330dTUVMj3sFgsxLIssSwbsMMU14jS+Pi415JPvV6P//77D9PT09Hc\ndtHS1NSE2dlZJCUlwWAwoK6uDkQEg8GA9PT0eQMYvgvOXC4XXC5XyFEkLyK1SCKi9vZ2am5uFn87\nnU7Ky8uj6enpsCxLKQi1NCkpiRiG8Rt7CueFCXDPWuhZwxNaU2+66SZMTk6KvycmJpCWloaUlJRo\nbrtoaGpqgkajQVJSEjQaDfLy8sAwDFiWRXd3N9asWQMAKCgoAHBtdgaAXyzXc+bGarVidnYWs7Oz\noc/MeBKNZdpsNrrrrrvEjtJLL71Ee/bsCduyFiueKxWET05ODv34449e532HJNFGp+I+Tu3t7aWt\nW7fS5s2b6fHHHw/YIVCqqIKbFATNysqisbExr/PC2FNwt8ECCuGgBh/iiNvtpv379xMAevDBB/36\nEgKeNTYWEwPqfGqccDgcqK2txdNPP41169ahu7sbzc3NAdN6tpe+qx7iQsTmEkPLWgx4uk2bzUbr\n168nrVZLHR0d89a+WLhbX1T3GwGBhB
CE0+l0lJubS8uWLaO+vj4xfbCOj6/gapuaIALVPIvFQlqt\nljiOo/z8fPrtt9/8rgskmK/gUrSpqqgBCDT4P3r0KGk0Gtq4cSP9/fffAa+bbwjj2fuNdrJdFTVC\nBIE4jqPi4mICQEajkRwOR9BrggkW66Uwau/Xg3AWjzU0NIDjOGRnZ+PMmTMAgF9//RVarTboNcHW\n50rS4/UkJqYTpWVJRTg1ZnR0lNauXUspKSlkMpnmjcVKjVpTPQi1xlRXVyM7OxsDAwNwOp3Izc0F\nMPeqm0OHDvmlj9fy0YiRg2XJiY8//tgvnuu5hohlWb9r4rF8dD7UmroAxcXFYBgGDMNg/fr12L59\nO3Jzc8FxHIqKisSavXv3bvA8j927d/vVTMnbzIWQg2UlEvjUSpZlyWw2+6XzHJZIXTN9UWvqAixf\nvlz8zjAMXC4X2tvb/dIJf4EQ3poiq5rpw3Ut6u+//46LFy8CALRaLcxmc1CxfCexw/prodTIwV1E\nQ6Sx1N7eXlq2bBmlp6eTTqcL6fp4BOcjQfERpUjat7feeou0Wi2Vl5eTzWaLa17xQPFtqqdbXGi8\n6Ha78dRTT+GRRx5BdXU1Pv/8c9x4440R5SVr5GBZsSLY7ArHcdTY2Eg7duwgAHTgwAEym81erlQu\nrjUUFO9+PQkUUBeEZhiGkpOT6aOPPvI6LhjAfMEFuaF49+tJoF5pRUUFACAlJQVfffUVtm/fDmAR\nudJIkINlxRJPN9rd3U1LliyhNWvWLJh3LOY5pXLhinW/wR6g4FY1Gg0xDEMmk4n+/fffmOU7H1L1\njhXrfj0jPJ7U1dWBZVnMzs7CYrHg1ltvxfLlyyWZQZGNS4+rSYVoWZEQyF1OTU3RvffeSxqNhl5/\n/XUiks/YMpYo1v36UltbSwzDEMdxdOLECfF4rF/AIQeuC1G//vprcZZFp9PFJQ85odg2VeCdd97B\npk2bsGLFCnAcB6PRKK9VCAkgKlG7urpgMpmwbds2VFZWYmBgIFblWhC32429e/eipqYGlZWVGBoa\nEneqCueVb4okUhcwNDREZWVlNDExQURzsx7l5eUhuYtox3MzMzNUUVFBAKilpcXrdQRKbEN9iVub\nOjIyQl9++aX422azUUFBAdnt9gULEU2P9Pz587Ru3TrieZ4+/PDDSIu/qIm6Te3r64PBYPD7nDlz\nBvfcc49Q2/H8889j48aN0Ol0C3qHSMdzAwMDuPPOOzE8PIy+vj7s3LkzrOuvG6K1munpaWpsbKSd\nO3fSpUuXIrKsUDh+/DilpqaS0Wik4eHhiO+jBOLa+x0bG0NlZSVYlsXbb7+NpUuXxsrWRIgIhw8f\nxtatW1FeXo6TJ09i5cqVMc8nXGS31teTSK1lamqKNmzYQK+++mrUlhUMp9NJ9fX1BICefPJJmp2d\njbS4MSeRkaq4vcaus7MT58+fR09PD3p6esTjHR0duOGGG6I2tkuXLmHXrl04ceIE2tvbUVdXF/U9\nY0lDQ0Nkr5iTAjlYli9//PEHGQwGWrp0KX322WdxLt3iI+4vnIw1p06dwrZt25CamopTp07BYDAk\nukiLDlmFCTs7O7FhwwbcdtttOH36tCpohMhCVCLCvn378PDDD2PHjh344osvgm7NrLIwCXe/V69e\nRW1tLTo7O7Fv3z40NzdHtL2kyjUSKuqFCxfwwAMP4Pvvv0dnZycqKysTWRzFkDBRz549iy1btmBm\nZga9vb0oKSlJVFEUR0La1IGBAZSWlmLJkiU4ffq0KmiMSYioDocDNTU1OHnyJFavXp2IIiiahLjf\noqIiv50IVWKHLIY0KrFl0Ykq69kRmbDoRA22iFvlGotOVNmsgpcxCY8ohUuwnZZUriGJqC6XC8Dc\nPjYq0SM8R+G5+iKJqMI2J1VVVVJkd90wOTmJVatW+R1niIjinbmwyDojIwMsy8Y7O8XjcrkwOTmJ\nws
JCcZ8bTyQRVUVaFl3vV2VhVFEViCqqApHtOLWrqwtvvPEGGIZBcnIynnnmGRiNRr90Bw4cwKef\nfoq0tDQAwC233IJXXnlF0rKGstuzpDtCS7ewMXTC+UddRUUFfffddxKWzptQdnsOdUfoWCFL96vT\n6fDcc88hMzMTAFBYWAibzQaHw+GVzuFw4Ny5c3jzzTdhMpnQ2NiIsbExScva398Po9Eozgs/9NBD\n+OSTT0Aeg4pQ0sSShIoa7T/qJiYmUFJSAovFgq6uLtxxxx2or6+P28MKRCi7PUu9I3RC29Ty8nKc\nO3cu6PmZmRns2bMH4+PjOHr0qN/5nJwcHDlyRPz96KOPoq2tDaOjo8jJyYlLmX1xu90BjyclJYWV\nJpbI0v0Cof2j7pdffsGxY8e8jhHRvHvHxJpQdnuWekdoWYr6zz//oLq6Gvfffz9efvnlgKEwYM7S\nW1paMDIyAgB47733kJ+f7+Xq4s3dd9+Nn376CX/99RcA4P3338emTZvCThNLZBkmbG9vx+HDh5GX\nl+d1vKOjA6Ojo9i7dy+6uroAzA19jhw5ApfLBb1ej5aWFtx8882Slrevrw9WqxVOpxMrV67ECy+8\ngJGREa9yBkqTnp4el/LIUlSV6JCl+1WJDlVUBaKKqkBUURWIKqoCUUVVIKqoCkQVVYH8D73TxLYM\nMBhoAAAAAElFTkSuQmCC\n",
508 | "text/plain": [
509 | ""
510 | ]
511 | },
512 | "metadata": {},
513 | "output_type": "display_data"
514 | },
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "Full GP regression model\n",
520 | "Hyperparameters: 0.104064905203 38.5362857546\n",
521 | "GP regression model test set\n",
522 | "R = 0.964\n"
523 | ]
524 | },
525 | {
526 | "data": {
527 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHgAAABvCAYAAAAntwTxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC3NJREFUeJztnWlME2kYx/8tCo26SqJ4xDNuEA24Hig7um5cr0CFUgME\no2iUoMRjF6V+AKPBjRtFjfWMRINnYt31LkGjxKhFlitqPJb1ILorIi5abzFAS3n2wy4NSLud0mNm\nuu/vS9POM/M+nf888848877PyIiIwPBZ5EI7wPAsTGAfhwns4zCBfRwmsI/TydsNNjQ0oKKiAkFB\nQfDz8/N28z6HxWKB0WhEWFgYFApFu+VeF7iiogJJSUnebtbn0el0GDduXLvfeQlMRFi9ejWCg4OR\nkpLSbrnBYIBWq4XJZEJISAg2btyIbt262dxWUFCQ1aG+ffs68x8YNqitrUVSUpJ1v7aDHPDo0SOa\nP38+ffXVV7R///52y1+/fk0cx9Gff/5JRERbtmyhdevW2d1edXU1DRs2jKqrqx01zbCBRqOhgIAA\n0mg0ROR4fzq8yNLpdIiLi4NSqbS5/Ndff8XIkSMxZMgQAMCcOXOQn58PYgkyj7Bnzx40NjYiJyeH\nl71DgbOysjBr1iy7y2tra9ucavv27Yu6ujp8+vSJlwMM51i+fDkUCgWWLVvGy97li6zm5mabv8vl\n7A7ME2i1Wmi1Wt72LqvQr18/GI1G6/cXL16gR48e6NKli6ubZthg1apVUCgUWLVqFS97lwWeNGkS\n7ty5gydPngAAfvnlF0ybNs3VzTLssHPnTjQ2NmLnzp287Dsk8G+//Qa1Wg0A6NmzJ7Kzs5GWlgal\nUonKykpkZGR0ZLMMT+DFK3wiYrdJrtK7d28CQL179yYiN9wmMcTFy5cv23w6ggns4zCBJURDQwNk\nMhkA/rehTGCJYDQaMX36dMjlcnTu3BkrV67ktR4TWAI8ePAAHMfh4cOHSEhIcCqJxAQWOVeuXMGE\nCRPg7++PsrIy6PV69+aiGcJx6NAhREZGYuzYsSgpKcGXX37pdC6a3QeLEIvFQpmZmQSAUlJSyGQy\n2bVl98ESo76+HrNnz8amTZuwefNm5ObmonPnztblzuaiWQSLiNraWoqIiCCFQkGnTp2yaRMQEEAA\nSKFQEBGLYMlQUVGBr7/+GlVVVSgsLER8fLxNO2f7YCawCCgoKMA333yDL774AuXl5YiIiLBrq9Vq\nUV9fz/uZMBNYYPbu3Yvo6GhMnDgRxcXFGDx4sFu3zwQWCIvFAo1Gg6VLlyI1NRX5+fno3r2729vx\n+rhoBlBXV4ekpCTk5+dj+/btWLFihTXH7G6YwF6mpqYGKpUKlZWVyMvLg0ql8mh7TGAvcuvWLcTE\nxEAmk6GoqAhjxozxeJusD/YS+fn5+Pbbb9GnTx+Ul5d7RVyACexxiAg7duyAWq3GtGnTcO3aNfTv\n399r7TOBPUhTUxO+//57pKenIz09HWfOnLE7Z8tTsD7YQ3z48AGzZ8/GpUuXkJOTg6VLlwriBxPY\nA1RVVSEmJgZVVVU4f/48IiMjBfOFCexmrl+/DpVKhYCAAJSUlCAsLExQf1gf7EZOnz6NyZMnY/Dg\nwSgvLxdcXIAJ7BaICFu2bEFCQgKio6Nx9epV0UxuZwK7iMlkwuLFi5GRkYHMzEwcP35cVBPvmMBO\n8Ploirdv30KpVOLIkSM4cOAAsrOzxTdt1kODE+wi5REdrUdTPHr0iEJCQigwMJCuXLkimE9sRIcb\naRlNoVarwXEczGYzSktLMWXKFKFdswu7TXICrVaL8PBwJCcnY/z48dDr9ejVq5fQbv0nvCLYYDBA\npVIhMjISaWlpqKura2ezadMmfPfdd1Cr1VCr1bynVkgFIsL69euRlJSEhIQEhIeHY8CAAfxHNwqF\no3M83zJJiYmJdPPmTZf7DDHS0NBA8+
bNIwD0448/UnNzc7vRjULhch/Mp0ySyWTCvXv3cPDgQcTG\nxuKHH37A8+fPPXRIepdXr15h+vTpOHHiBI4ePYp169ZBJpM5P8NAIBwKzKdM0osXL8BxHDQaDfLy\n8jBq1CgsW7ZM8rWyHj58CI7jcP/+fVy+fLlNCUZnRzcKhUOB+ZRJGjhwIHJzczF06FDIZDKkpKTg\n6dOnePbsmfs89TKJiYkYPnw4nj17ho8fP+Ls2bNCu9QhHArMp0zSgwcPoNfr26xHRG2mXEiJw4cP\n4+TJkwCAxsZGmEwm3rP5xIZDgfmUSZLL5diwYQOqq6sBAMeOHUNISIho8rF8aW5uxpo1a5CcnIzQ\n0FAEBAQgPDxcEn2tXfhcqRkMBlKpVBQVFUWpqan09u1bunv3LsXGxlpt9Ho9RUdHU1RUFC1cuJBq\namo6dNUnBBqNhvz9/WnYsGEEgPz8/Cg9PV1ot3jhaH+yVCUR+fv7EwACQJ06dRLF7Q9fWKrSAb//\n/ru1UvqcOXOQlpZm85Ts9LRNseDlA05UEVxQUEDdu3en0NBQayLHHmJJbHwOi2A77Nu3DzNnzgTH\ncSguLrYmcuwhlcRGO7x8wAkewU1NTaTRaAgALVmyhMxmsyB+uAsWwa349OkT4uPjsX37dmzbtg05\nOTno1Ml9D9RE2U97+YATLIJrampo7Nix1KVLF8rLy/NIG0L00yyCAdy+fRsRERGora1FUVERYmNj\nAbg/4kTZT3vtUPsXb0fwuXPnqGvXrjR69Oh2bYr1ytgZJBnB7ogsIsKuXbsQGxuLqVOnoqioCAMG\nDGhjI8qIczfePd74RXDryPr8PUF8MJvNtHz5cgJAK1eupKamJne4LkokmarUaDRtxIUTp9H379+T\nUqkkuVxOe/bscZfbosXR/hTloLvPXx2Tk5PD6zT69OlTxMTE4MmTJzh//jyioqI86aYkEKXAreH7\nnqDWk76Ki4sxcuRIL3gnfkR5keUsZ86cweTJkzFo0CCUl5czcVshaYHp30lf8fHxmDlzJgwGg+QG\nGXgayQpsNpuRmpqKjIwMZGRk4MSJE6Ka9CUWRN8H2+Ldu3dISEhAYWEhcnNzsWjRIqFdEi2SE/iP\nP/5AdHQ0/vrrL1y8eJG9Rs8BkjpFl5SUgOM4mEwmlJaWMnF5IBmBf/75Z0ydOhXBwcEoKyvDiBEj\nhHZJEoheYCLCTz/9hLlz5yIuLg6XL1+2/756RjtELXBjYyMWLFiArKwsZGVlQafTYc2aNeJ7qC5m\nvJo4Jf6PC1+9ekX9+/cnABQVFWX93Rce8bkTST4urKysBMdxqKmpAfDP/OQW/heP+NyI6AQuLCwE\nx3GQy+VITk5uJ6ZUZvWJBVEJfOTIEcyYMQOjRo1CaWkpDh48iNDQUGzbtg3jxo0T2j1JIgqBm5ub\nsXbtWixcuBBmsxkGgwEbNmwAANy8ebPNJ8M5BBe4vr4ec+fOtQraQst0zfDw8DafDOcQXOBz585B\nr9fj+PHjVhFlMpm1371x4waICDdu3BDSTcnCKxdtMBig1WphMpkQEhKCjRs3titszcfGFnFxcZgx\nYwYCAwORmJjYsX/BsIvDCH7z5g1Wr16N3bt3o6CgAAMHDsTWrVudtrGHn58fAgMDO+Y9wyEOI9hW\nlR21Wm2tNsPXpgWLxQLgn+IuDNdp2Y8t+/VzHAr8X1V2Wk7BfGxaaKn30bpiDcN1jEajzdfiORSY\nT5UdPjYthIWFQafTISgoCH5+fo6aZzjAYrHAaDTaLT7uUOB+/frhzp071u+2quzwsWlBoVCwpIWb\n+a8XWrqlyg4fG4YwyIgcl6MrLCyEVquF2WzGoEGDsHnzZlRXV2Pt2rXIy8uza8OujoWHl8AM6SJo\nJotPmWJGW4gImZmZOHDgAC97wQR2JTnyf+Xx48dYsGABLly4wHsdwQTmU6aY0RadToe4uDgolUre\n6w
g2LtqZ5AjjH7KysgAAZWVlvNcRLIKdSY4wOo5ge5NPmWKG6wgmMEuOeAfB+uCePXsiOzsbaWlp\nbZIjDPfCEh0+Drui8XGYwD4OE9jHYQL7OExgH4cJ7OMwgX0cJrCP8zesO4V8ZRISBQAAAABJRU5E\nrkJggg==\n",
528 | "text/plain": [
529 | ""
530 | ]
531 | },
532 | "metadata": {},
533 | "output_type": "display_data"
534 | }
535 | ],
536 | "source": [
537 | "# select the property of interest\n",
538 | "property_ = 'green_norm'\n",
539 | "\n",
540 | "# format data for property \n",
541 | "log_data, y, seq, y_true_test, seq_test, df_select, df_select_test = data_format(property_, df_input)\n",
542 | "\n",
543 | "# encode sequences\n",
544 | "X, X_true_test = encoding_inputs(df_select, df_select_test, ss, contacts)\n",
545 | "\n",
546 | "# train and CV model\n",
547 | "measured_CV, predicted_CV = cross_validation(X, log_data, property_)\n",
548 | "\n",
549 | "# train model on whole test set\n",
550 | "final_prams = ML_train(X, y)\n",
551 | "# use model to predict on test set and evaluate accuracy\n",
552 | "ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_)"
553 | ]
554 | },
555 | {
556 | "cell_type": "markdown",
557 | "metadata": {
558 | "deletable": true,
559 | "editable": true
560 | },
561 | "source": [
562 | "### Kinetics_off"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": 9,
568 | "metadata": {
569 | "collapsed": false,
570 | "deletable": true,
571 | "editable": true
572 | },
573 | "outputs": [
574 | {
575 | "name": "stdout",
576 | "output_type": "stream",
577 | "text": [
578 | "20-fold corss validation of GP regression model\n",
579 | "R = 0.79\n"
580 | ]
581 | },
582 | {
583 | "data": {
584 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHwAAABvCAYAAAAuXKSLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADLZJREFUeJztnX9oG+Ufx9+Xa5N0lLai3aKr2m7asa11SipWVuxUnA5m\nGU62drWM/YBqslp7cW6gTtD9chh1wzmlc+hkU6bMBhlUELrJQBDEqbWKKJNlbO3SgWJrlzTJ5/vH\nuHwvySW9y13u8uN5QaC5e+55nt77+Xye5z7Pc084IiIwigaL2RVgGAsTvMhgghcZTPAigwleZJQY\nUci1a9cwPDyM6upq8DxvRJEFTSQSQSAQQENDA+x2u6prDRF8eHgYnZ2dRhRVVBw7dgxNTU2qrjFE\n8OrqagDXK+hwOIwosqAZHR1FZ2dn7L6qwRDBRTfucDhQU1NjRJFZx+Px4ODBg3C73fB6vabUIZPu\nkQ3aMuTgwYMIBoN49913za6KKpjgGeJ2u2G32+FyucyuiioMcemFiNfrNc2Va4FZuEY8Hg/sdjs8\nHo/ZVVEEE1wj+daXM8E1km99OevDNZJvfTmz8CKDCW4yRg/6mOAmIx30GSF+UQlutDUpKU866DNk\nxE8G4Pf7qb6+nvx+vxHFpcRmsxEAstvtOVmeIAhkt9tJEIS449FolKampmLftdzPorJwPR+hROtt\nampKacVqy/N6vZiamoob9fv9ftTV1aGsrAyCIGiud1FZuJ6I1it+MvEagiCQzWZLsmii61Z95MgR\nqqioSCqDWbgJiNbrdDoz9hqp+uzLly+jra0NGzduRFtbG5555hn9gjuqm4hMS9y2bRsdPnw4ZZpC\ntHC1yFlzYp8djUbp+PHjdMMNN9Ds2bPp5MmTsnlpuZ+aBP/jjz+oq6uL7rrrLib4DMw0gLty5Qqt\nXr2aANCTTz5JV65cSZmXaS792LFjeOKJJ7BixQrtrqbASTeAO3nyJBYvXoyhoSF88sknOHHiREbL\nlxShuonIwFx6Zly9epXWrVtHAOjxxx+ny5cvK7qODdpMQksg59SpU2hoaMCpU6fw4YcfwufzGbLA\nkwmugUwiY//88w82btyIlStXorGxEcPDw1i/fj04jstiTf8PE1wDagMrX3/9NRobG3HixAm89957\nGBwcNHwVb8EKrjVuruR6uciYHBMTE3C5XHjkkUcwf/58/Pzzz+ju7jbMquNQ3etngBmDNq1xc73i\n7mfOnKF58+ZRWVkZ7d+/nyKRiKb8iNigTRatcfNU1yv1HFNTU+jr68OyZcswZ84ctLe344UXXsDW\nrVszqo9uaG5uCiikxzIllv/tt99SfX09Wa1W2rdvH4XDYUXXpYutS2EWbhAejwfhcBg8z8t6jmAw\niO3bt2Pp0qWoqKjADz/8gK1bt4LneUUeh82H5wBSq0tnpZ2dncRxHFksFnrttdcoFAplVJbcfHgi\npsXSlZLPgktFlhMkGAzSjh07YlOYVqs163ViguuM1KqdTicBIKfTmZTup59+onvuuYd4nqf77rtP\ntv9V2i+ruYYJrjNSq060cJvNRs899xzt3r2bSktLaeHChfTdd98l5SGm5Xle9ePdTAM8JrhClFqb\n1HVL/xaF4DiOOI6j559/Pm6tmRQxLc/zivrlVOXLwQRXSCaLCsUGEg6HqbW1lQBQVVUVnT17dsZr\n1QqtFCa4QuRESGf1YgOxWq3U0tJCAGjLli00MTGRtgy1fbZamOAaSGf1fX19VFJSQhaLJbYSRUt+\ncmTSQJjgKpHeZNHqnU5n3I3fvHlz7FGL4zjFIqp15ZnE7JngM5BoRXI3WTxms9no8OHDcUuQMxl4\nqamb2ryZ4DOQKHC6vryuro4A0KJFi8hisRDP81ntj5WQ2GBNFXxoaIhWrlxJy5cvp56eHvr333+T\n0pgt+ExWFI1G6eOPP6aqqiqaM2cODQ
wM6FauHgO4xAZrmuBXr16l5uZmOn/+PBER7du3j1555ZWk\ndGYLno7R0VFatWoVAaC1a9dSIBDQLW+95tQTG6xps2Vnz55FY2MjamtrAQAdHR348ssvQXnyIwuf\nffYZ6urqMDAwAI7jMHfuXNx000265a/Xu2xKV9YoQZPgo6OjcSstHQ4HJiYmMDk5qbliiShZeODx\neFBSUoKSkpKkdNJzLpcLHR0dWLNmDYLBIACAiHSfltRTKN3Q4moOHTpEL7/8cuz79PQ01dfX0+Tk\nZFw6PVy6EvcofcEvMV3iy3+VlZV09OhR6uvrI57nc2JwphTTXPrNN9+MQCAQ+z42NobKykrMmjVL\nS7ayJLpHOYt3u93geR4cx2F6ejru3KZNm2KLBmtra/HLL7+gq6sLb775JsLhMMLhcG5ZYrbQ0tLG\nx8fp/vvvjw3a3njjDdq+fbuuLTIV6Sw+8dzg4CDNnTuXysvLqb+/n6LRqG71kGJEWJXIRAu/8cYb\nsWfPHjz77LNYsWIFfv/9d2zbtk2fljgD6QZE4rnNmzeju7sbjz32GCwWC0KhEH799desLQ/Oi036\nstAAk8j2Y5mcZQ0NDVFtbS3NmjWL3nnnHbJarUnz2npbYjZnyKQUfaRN6sInJyepp6cnFgPfsGED\nEcWLYfReL3pTcIKrtUBRzPb2drrjjjtmXGlilCVmi4ITXC72naoBCIJAVquVmpqaYla9fv36lLNg\nRpKtrqPgBE+0wHQuuLS0NG5WKzGdme47W2UX5IsIJAnPyo3IQ6EQmpubMT09DQDo6upCb29vLJ34\nnF5ZWQkAWLx4sbH/QIp6m46uTS8FWl16IufOnaMlS5bELNtms8XOJa4WRYrIWz5TcBYuRsyCwWBc\nXDwcDqOlpQV33303Ll68iHXr1sFut8Ptdscsev/+/bH4uNZttQqSLDTAJORapCAIKWPYgiAkWefI\nyAjde++9ccfl0mdzdUqukJeDNqUTHRaLhR544AGy2Wx05513ygouTV/IQovkpUsX3bbcm5jiYGfD\nhg1wOBz45ptvsGjRIpw7dw5OpzOWTvwZRjG9IAhxEyDppksTybcfq8mYLDTAJNS2yEgkQgcOHKCy\nsjJZLwCJlYsrT+Wed9N5kUTyKfqWly49FefPn6cHH3yQAFB3dzdt2bIlqU8Wlw2LAqUK1DidTsVz\n3fkUfSsIwaPRKL3//vtUXl5ONTU19NVXX8WdT1xLzvM8cRxHPM+T0+lUHKgpBPJecL/fT48++mhs\nkOZyuZLSSF/Om+kFfaXWatT8td7kreDRaJQ++ugjqqysJIfDQSUlJUmiiojvaUvfAtHqhvPVE5gq\neCbbZ4sTHvPnzycA1NHRQePj40miQvKYpeX121TkU78txTTBM90+Wzrh8fnnn8fSSUVNHF3nqzjZ\nwLTn8Ey3z167di14nsfTTz+N1atXx46Lz9O9vb0QBCEuJJqTS37zkZlaxOnTp2nhwoVJny+++CKW\nhm2fbSxZtfDW1laMjIwkfVatWpWVBpgY8ZJ+TxUNK5oomR7o0eL0tPDEkbM0WpZq2VK+jrYzJS9j\n6alIXDTgdrvjzstNdebkQoNcRY8Wl+0+nI3Q49FyP3X5/fC9e/fqkU1K8u03unOZnHPpjOzCBC8y\nTBecPVIZi+mC58ULeAWE6YKzRypj0WWUrgU2AjcWQwSPRCIAru8Jw9COeB/F+6oGQwQXtwXp7Ow0\noriiIRAI4Pbbb1d1DUeU/T22rl27huHhYVRXV4Pn+WwXV/BEIhEEAgE0NDTAbrerutYQwRm5g+mj\ndIaxMMGLDCZ4kWH6c3i2OH36NLxeL0KhEBYsWIDdu3ejvLxcdRqz8Pl8+OCDD8BxHMrKyvDiiy+i\nsbExLs3evXsxODgY2/Sgrq4Ob7/9dvqMdZ6qzQmU7PKsdCdoM/jzzz9p6dKlNDY2RkTX1xW2trYm\npV
uzZg19//33qvIuSJeuZJfnXN4J2mq1YufOnZg9ezYAoKGhAePj4wiFQrE0oVAIIyMjOHLkCNra\n2tDT04NLly7NmHdBCq5kl2cjd4JWS01NDZYtWwbg+l43e/bswUMPPQSr1RpLMzY2hubmZgiCAJ/P\nhyVLlsDlcs3YYAtS8Gg0KnvcYrGoSmM2//33H3p7e3HhwgXs3Lkz7tytt96K/v5+zJs3DxzHYdOm\nTbhw4QIuXryYNs/c+e90RMkuz0buBJ0Jly5dQnt7O3iex9GjR1FRURF3/rfffsPAwEDcMSJCaWlp\n2nwLUvCWlhb8+OOP+OuvvwAAn376KR5++GHVaczi77//xlNPPYXly5fjrbfekg2fWiwW7Nq1C36/\nHwBw/PhxLFiwIK6bkqNgQ6tnzpyB1+vF9PQ0brvtNrz++uvw+/146aWX4PP5UqapqqoyuebAoUOH\ncODAAdTX18cdf/XVV7Fjx45Y/X0+H/r7+xGJROBwOLBr1y7ccsstafMuWMEZ8hSkS2ekhgleZDDB\niwwmeJHBBC8ymOBFBhO8yGCCFxn/A332OQXZ67SxAAAAAElFTkSuQmCC\n",
585 | "text/plain": [
586 | ""
587 | ]
588 | },
589 | "metadata": {},
590 | "output_type": "display_data"
591 | },
592 | {
593 | "name": "stdout",
594 | "output_type": "stream",
595 | "text": [
596 | "Full GP regression model\n",
597 | "Hyperparameters: 0.0127667028933 19.179798011\n",
598 | "GP regression model test set\n",
599 | "R = 0.959\n"
600 | ]
601 | },
602 | {
603 | "data": {
604 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAG8AAABvCAYAAADixZ5gAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC11JREFUeJztnX1ME3cYx799sW1UdHNDiYJCNpEtJBAlaiIRrYhijG9j\nk4y5yRKVUHxJi8H4AmoaxWz4EoX4lsHmCGyDZQgJAxRbiYnZYmKUGDFBEIygFdFkUyhtn/0hZaiF\nXtu7a6+9T8IfwO+e35P73vO733O/3z0nISKCiCCRetsBEfcRxRMwongCRhRPwIjiCRg5W4b6+vrQ\n3NyM4OBgyGQytswGLFarFSaTCdHR0VCpVA7bsCZec3Mz0tLS2DInMkhpaSni4uIc/o818YKDg4c6\nCwkJYctswNLd3Y20tLSh8+oI1sSzD5UhISEIDQ1ly2zA8PLlS9y/fx/R0dFv/H20WxBr4om4z4sX\nL7BixQp0dnaio6OD8XGieF6mt7cXy5YtQ0tLC/7880+XjhXF8yJPnz5FUlIS2tvbcfny5REnJiMh\niuclnjx5gsTERHR1daGxsRGxsbEu2xDF8wJdXV1YsmQJenp6cOXKlXcmKUwRxeOZhw8fQq1W459/\n/oHRaERUVJTbtkTxeOTBgwdQq9Uwm80wGo2YOXOmR/bEZ5s80draioULF8Jms+Hq1aseCweI4vHC\nvXv3kJCQgDFjxsBoNCIiIoIVu6J4HHPnzh0kJCRg/PjxMBqNmD59Omu2RfE45NatW1i0aBEmTZoE\ng8GAadOmvdNGp9NBpVIhLi4OKpUKOp2OeQfEEp2dnRQZGUmdnZ1smRQ0N27coEmTJlFMTAw9efJk\nxHZKpZIADP2oVCoiYnY+xcjjgL/++gtLlixBREQEGhsbR10Z0Gg0UKlUmDNnDlQqFTIzM5l3xNaV\nJkbea65du0ZBQUE0b9486u3tdduOGHk8YzQakZSUhJiYGNTX1+O9997jtD9RPJa4dOkSkpOTMXfu\nXNTW1mLChAmc9ymKxwK1tbVYuXIl4uPjUVNTg/Hjx/PSL6PHYz///DPKysogkUgQFhYGvV6PDz74\ngGvfBMHFixfx+eefY+nSpaioqBhxsxAXOI285uZm/PDDDygvL0dNTQ3Cw8Nx4sQJPnzzeSorK/HZ\nZ59hxYoV+P3333kVDmAgXnR0NOrq6hAUFIT+/n48fvyY8xuxECgrK8P69euxdu1a/Prrr1AoFCO2\ntSfiLiXgTGA6dW1oaKC5c+dSfHw8tbW1uTW19RdKSkpIKpVSWloaDQwMOG1vT8TtCTgTmJxPl/O8\nX375hdRqNVmtVpc78wfOnTtHEomENm7cSBaLhdExWq2WVCoVabVaxv2wIl57ezv9/fffQ79bLBaK\nioqiZ8+eudyZ0CksLCQAtHnz5ncuXrZhJUk3mUzQarV49uwZAKC6uhozZ87E+++/z+747eMcP34c\nGo0GWVlZOH36NKRS72dZTlOFuLg4ZGRk4Ouvv4ZMJsPkyZNRWFjIh28+w5EjR7Br1y7odDp89913\nkEgk3nbpNXyGuRA5ePAgAaDdu3eTzWbjrV/x2aYHEBH27duH3Nxc7N+/H3q93ncibpCAF89RDkZE\nyMnJgV6vx6FDh5CXl+dzwgHw72FTq9WSUqkcdYr+dg5ms9lo+/btBIAKCgpcssUmnOR5nnTGN0yS\n4+E5mNVqpYyMDAJAJ0+edNkWmwS8eK4kxxaLhb799luSSCR05swZj2yxQcCLx5SBgQHasGEDSSQS\nKi4u9rY7RMTsfAb8jumBgQFs2LABv/32Gy5cuCCoV7MDdrap0+mgVCrx6aeforKyEuXl5UPCcbYK\nwDZ8hrk3GT5b1Gq1b2y3++OPP95oy/fkxBHiPW+Q4WKpVCpSKBRDv69Zs8Zhez4nJ44QxRtk+MbW\nrVu3UlhYGAGgdevWedu1EREfjw1i39
ialZWFmzdvoqenB42NjaisrPS2ax7ht+INn3QUFBSgu7sb\nN27cwM2bN1FXV4fFixd720WP8VvxCgsL0d/fj6KiIvT29iIxMRF37txBQ0MD4uPjve0eK/hlnqfT\n6WCxWCCTyZCeng61Wo0HDx7g8uXLmDNnjrfdYw2/FK+wsBBWqxVKpRJNTU3o7u7GlStXEBMT423X\nWIXRsFlVVYVVq1Zh9erVSE1Nxe3bt7n2yyM0Gg2USiXGjRsHk8kEg8Hgd8IBcJ6kt7a20oIFC+jx\n48dERGQwGCghIcGtqS1fdHR00Mcff0xTp06lu3fvetsdt2AlVVAoFNDr9Zg8eTKA15twnz59CrPZ\nzPmF5Q7t7e1ISEhAf38/jEYjZs2a5W2XOMPpPS80NHSoih8R4fDhw1Cr1aPuEPYWra2tUKvVkEql\nuHr1KsLDw73tEqcwThVevnyJ7du3o6OjA3q9nnVHPH0Y3NLSgoULF0KhUASEcABD8R49eoTU1FTI\nZDL89NNPnLx7Zs/LTpw44bKI9ooLEyZMgNFoRFhYGOv++SJOxXv+/Dm++uorJCUl4dixY6y9CfN2\npNkfYQEYSq6dHQP8X3Hhww8/hMFgwNSpU1nxTxA4m/UUFRVRVFQUrVq16o0fT7e7j7TsMtoT/beP\nYVpxQYj49KqCO8suw4+5fv06TZw4keLi4qinp8ddt30WnxbPE5qamigoKIjmz59Pz58/57w/b+CX\nS0IGgwHLly9HbGws6uvrMXHiRG+75DW8Jp6rqYFOp4NCocDSpUuHKi4EBQVx7KWPw2eYD8fVfSJy\nuZwAkFQqpX///dcTVwWBTw+b9tTAXq5ptEisqqqCzWaDVCpFVlYWxo4dy7e7vgmfV8po+/3fjkR7\n25UrV5JcLqe1a9dSf38/W+76PD432xxtqHw7dRi+aeiLL74gs9nMlquCwOd2TGs0GhQVFTmsbFdQ\nUAAAQ2/dLlq0CHV1dfjkk09QWloKudwv1409g88rxRn2aJPL5SSRSCg9PZ1xxQV/wycmLK6kBBqN\nBnK5HBaLBVu2bMH58+fFb/GNAufiDd/F5YzQ0FBYLBZs27YNRUVFPlFxwZfh7OzYI87+5UVnFVzz\n8/Oh1WqRnZ2N48eP++ZrxL4GV2M00yTcZrPRgQMHCADt2bOH14oLvoxX73nDk/CR7ntEhL179yIv\nLw8HDx70yYoLPg0fV4qjKLTZbJSdnU0AKD8/ny03/AafmG0C7z4KIyLs2LED33//PY4ePYqcnBw+\n3PA7eMt8iQgAYLPZkJmZiTNnzuDUqVPQaDR8ueB/MAlhm81GOTk5dP78ebfC3D5sKpVKSk9PJ4lE\nQmfPnmXSdcDCyrDZ2tqKb775BrW1tW5fIPbt5xEREfjxxx9RXFyMTZs2uW1P5DVOh83S0lKsW7fO\no11Z+fn5ePjwISorK3HhwgV8+eWXbtsS+R+n4uXm5gIArl+/7lYHZrMZqampqK6uRnl5OVJSUtyy\nI/IunE5Y+vr6kJKSgvr6elRUVGD16tVcdhdwcCbeq1evsGbNGhiNRlRVVSE5OZmrrgIWzsSrrq5G\nU1MTampqkJiYyFU3AQ1n4qWkpCApKUn8BgOHMBYvPz/fJcNSqVQUjmNYizyr1QoA6O7uZstkQGM/\nj/bz6gjWxDOZTAAgqKp5QsBkMmHGjBkO/ychGnzo6CF9fX1obm5GcHCwuHWBBaxWK0wm09BitiNY\nE0+Ef8RNIgJGFE/AiOIJGMGLJ7TqTEy4dOkSZs+e7bwhT2uLnMC0OpOQaGtro8TERIqNjXXaVtCR\nJ7TqTM549eoVdu7ciV27djFqL+i3N4RUnYkJubm5WL9+PeOSW4KOPDtcV2fiA/ubUK4sVgs+SX/0\n6BEyMjLw0Ucf4fDhw7x/7potUlJS0NfXB5lMhoGBAbS1tSEyMhJnz57FlClTHB/E9Q2YS3p7e2nx\n4s
XvfLRJ6HR2djKasAj6nldWVoauri40NDSgoaFh6O8lJSUB8U1bwQ+bgYxfTFgCFVE8ASOKJ2BE\n8QSMKJ6AEcUTMKJ4AkYUT8D8B/eBZzDOS3kWAAAAAElFTkSuQmCC\n",
605 | "text/plain": [
606 | ""
607 | ]
608 | },
609 | "metadata": {},
610 | "output_type": "display_data"
611 | }
612 | ],
613 | "source": [
614 | "# select the property of interest\n",
615 | "property_ = 'kinetics_off'\n",
616 | "colors = sns.color_palette('colorblind')[2]\n",
617 | "\n",
618 | "# format data for property \n",
619 | "log_data, y, seq, y_true_test, seq_test, df_select, df_select_test = data_format(property_, df_input)\n",
620 | "# encode sequences\n",
621 | "X, X_true_test = encoding_inputs(df_select, df_select_test, ss, contacts)\n",
622 | "# train and CV model\n",
623 | "cross_validation(X, log_data, property_)\n",
624 | "\n",
625 | "# train model on the whole training set\n",
626 | "final_prams = ML_train(X, y)\n",
627 | "\n",
628 | "# use model to predict on test set and evaluate accuracy\n",
629 | "ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_)"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": null,
635 | "metadata": {
636 | "collapsed": true
637 | },
638 | "outputs": [],
639 | "source": []
640 | }
641 | ],
642 | "metadata": {
643 | "kernelspec": {
644 | "display_name": "Python 2",
645 | "language": "python",
646 | "name": "python2"
647 | },
648 | "language_info": {
649 | "codemirror_mode": {
650 | "name": "ipython",
651 | "version": 2
652 | },
653 | "file_extension": ".py",
654 | "mimetype": "text/x-python",
655 | "name": "python",
656 | "nbconvert_exporter": "python",
657 | "pygments_lexer": "ipython2",
658 | "version": "2.7.6"
659 | }
660 | },
661 | "nbformat": 4,
662 | "nbformat_minor": 2
663 | }
664 |
--------------------------------------------------------------------------------
/regression/GP_matern_5_2_kernel_LASSO.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "metadata": {
7 | "collapsed": false,
8 | "deletable": true,
9 | "editable": true
10 | },
11 | "outputs": [],
12 | "source": [
13 | "from __future__ import division\n",
14 | "import numpy as np\n",
15 | "import matplotlib.pyplot as plt\n",
16 | "import seaborn as sns\n",
17 | "import pandas as pd\n",
18 | "import pickle\n",
19 | "\n",
20 | "# ML imports\n",
21 | "from sklearn import linear_model\n",
22 | "from scipy import optimize\n",
23 | "import scipy\n",
24 | "\n",
25 | "# custom imports\n",
26 | "import encoding_tools as encoding\n",
27 | "import chimera_tools as chimera\n",
28 | "import GP_tools as GP\n",
29 | "import lasso_tools as lasso_tools\n",
30 | "\n",
31 | "# Plot adjustments:\n",
32 | "sns.set_context(\"paper\")\n",
33 | "sns.set_style(\"white\")\n",
34 | "plt.rcParams.update({'ytick.labelsize': 12})\n",
35 | "plt.rcParams.update({'xtick.labelsize': 12})\n",
36 | "plt.rcParams.update({'axes.labelsize': 14})\n",
37 | "plt.rcParams.update({'legend.fontsize': 12})\n",
38 | "sns.color_palette('colorblind')\n",
39 | "\n",
40 | "plt.close('all')"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {
46 | "deletable": true,
47 | "editable": true
48 | },
49 | "source": [
50 | "## Convert data to usable form"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 8,
56 | "metadata": {
57 | "collapsed": false,
58 | "deletable": true,
59 | "editable": true,
60 | "scrolled": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "# load ephys data\n",
65 | "path_inputs = 'inputs/'\n",
66 | "df_input = pd.read_csv(path_inputs+'Ephys_data_formatted.csv')\n",
67 | "df_input = df_input[df_input.gen != 3]\n",
68 | "\n",
69 | "# load library files\n",
70 | "file_c = path_inputs + 'shmetis_c_10_21_0/chimeras.output'\n",
71 | "file_n = path_inputs + 'shmetis_n_10_21_0/chimeras.output'\n",
72 | "\n",
73 | "# add sequence information to dataframe based on chimera code\n",
74 | "df_input = chimera.chimera_code2seq_convert(file_c,file_n,df_input)\n",
75 | "\n",
76 | "# load contact information\n",
77 | "fname_1 = path_inputs + 'alignment_and_contacts_C1C2.pkl'\n",
78 | "\n",
79 | "# load the contact map\n",
80 | "with open(fname_1, 'rb') as f:\n",
81 | " ss, contacts = pickle.load(f)\n",
82 | " \n",
83 | "# only use the first three parents\n",
84 | "ss = [i[0:3] for i in ss]"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {
90 | "deletable": true,
91 | "editable": true
92 | },
93 | "source": [
94 | "## Spectra"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 9,
100 | "metadata": {
101 | "collapsed": false,
102 | "deletable": true,
103 | "editable": true,
104 | "scrolled": false
105 | },
106 | "outputs": [
107 | {
108 | "name": "stdout",
109 | "output_type": "stream",
110 | "text": [
111 | "20-fold corss validation of GP regression model\n",
112 | "R = 0.92\n"
113 | ]
114 | },
115 | {
116 | "data": {
117 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHUAAABvCAYAAADSSY9BAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADTpJREFUeJztnX9sE2UYx7/tzfVYxBGTQQFRQB2LlEzTmYAoZVNgmg1I\nzMC5YdSJ2daMaRvjDBJMlDl/dBKXMRR/ZUHdP0YWFHEmuJqRGNMlmJThH5phNmdnpwPcj3bb9fGP\n5c7+bq+9XrvjPkn/6N3b93173/d93ud93vfuNEREUFEU2nRXQEV6VFEViCqqAlFFVSCqqAokS45C\nPB4PnE4n8vLywDCMHEUqGo7j4Ha7YTAYwLJsyHlZRHU6naiqqpKjqOuKTz/9FEVFRSHHZRE1Ly9P\nqIRer5ejSEXjcrlQVVUlXNdgZBGVN7l6vR633HKLHEUqCqvVivb2dpjNZthsNuF4pKFMdZQyiKKi\nImg0Gixbtgwsy8JqtQIAWltb4fV60draGlc+qqhpxGq1BojX398PAPjrr78EETUajeh8VVHTSHt7\nO7xeL44dOwYAMBqNAIClS5cmla8qahoxm81gWRb19fUAAIfDASLC6OgoLBZLwvmqosoIb26LioqE\n+eX09DRsNluIKbbZbFi/fn1iBZEMDA0NUX5+Pg0NDclRXEZisVgIQMCHZdmQcwzDkE6no+rq6pD0\nS5cuJaLY11OWKY3K/PjJYzQacfHiRaxfvx4sy2Jubk44x3EcOI7DyZMnkZeXh2vXrsFgMMDpdKK6\nujquslTzKxP8+GmxWOBwOFBfX4/+/n54vd6w6VesWIGKigoAwIULFwIcqpjIYXpU8xuKTqcTzKrR\naCSGYQJM7eTkpJCGYRhiWZYsFgsRxb6eak+VGd4h4oPxFosFTqcTHMcBADQaDRobG5GTkyP07sbG\nRsGhigs5WqXaU/+H7328k0REZDKZhB6q0+li5qH21AzDf25KRNi0aRPsdjuWLVsGnU4Hs9mcdBmq\n9yszNpsNNpsNPp8PjY2N+PHHHwEAV65cgcfjkaQMtacmQXDAIF7m5ubw1FNPoa2tDSUlJWBZVpIe\nKiDVWJHMGLBQCTc+xuKee+4Rxs/Ozk4img8+6HQ6MhqNpNPpBC83ErGupypqElgsloCpRqy02dnZ\nAdMWXkD/6Q3/iZanKmqGoNVqA0TTaDRCL+cbh9FoDAkhhkP1fjMAl8sFn88nfCciPP/884IXbLPZ\nMD09DYfDAYvFApZlhRCi2PGaLyDlLPSeypvIeMxsMIODg3T77bcHBOzjIdp4nfKe2tvbi/LycuzY\nsQMHDhzAxMREsllmHMGL2TyxvN8nn3wSa9aswdjYGJ5++mkhOhQPwWutohDd9Pz4+++/aePGjTQ4\nOEhERG+++SYdPnxYdMvKdCI5RNF6k8PhEBUlEkNKe2pfXx82bNiA1atXAwAqKytx+vRp0AK+OzJS\n7wv3n4J7E//bvXv3ori4WNIokSiSaTHvvfceHTp0SPg+OztL+fn59O+//4pqWZlEuN4X7ljwOBu8\nCL5161a6evVqxLSJjtFEKZ7SdHR0hBV1cnJSVCUyiXCmNtyxYKH955q5ubmk1WqJYZiAeWhwWjFB\nC39SKuqpU6eotrZW+D48PEz33nuv6EosRIKFfuihhwgAFRQUBAQZ/Oeh/j013qBFOFIq6tjYGG3a\ntElwlN5++21qamoSXYmFTktLCwGguro64jiOLBYLMQwj9NRIJGqGUx5R6u3tpfLyciotLaVnn32W\nxsfHRVdioeLz+aipqYkA0EsvvUQ+n0/U7/ndDvHOXXlSvvHMZDLBZDIlm82Cw+fzwWw24/jx42AY\nBl6vN+Ju+kj3wqQMUU0kQRZyTw1nImdmZujxxx8njUZDWVlZMZ2eSI5RomOrGtBPkmBBpqamqKys\njLKysqirqysuYZJ1jIJRRY2DaA6Lvy
BXr14lk8lELMvS119/HVe+wQ5TsnNUIlXUuIhn3uh2u6mo\nqIgWL15Mdrs9Zp7BwQip5qhE6tJbTKxWK+bm5sAwTMTg+R9//IEtW7bg8uXL+P7777Fly5aY+frv\nyPfPO6lAfbwk3FwkbFnpJFbP+fXXX2n16tW0cuVKunTpUtz5Sj2O+qP21ChE6qVWqxVZWVlgGAYF\nBQX4/fffUVpaioKCgrg3m/EL37JMYYKRvBkl0LLShX8v9Xdgwu0Z4u9G4wMG4aYnyTpA8aI6SlHw\nN5H+At95552CmIWFhcSybED0R+zaqtSoosYJL/DOnTvDLm7zm8KMRmPU36s9NcPo7OwkhmHojjvu\nCDGlcvbEWKiOUpy0tbXhiSeewL59+3Dp0iV4PJ4AJ0eWqYhUZELLEoPUDonP56NXX32VAFBjYyNx\nHCdJvqlEceZXSjPo8/nIarUSAHrllVdEL52lC8WZX6nMIMdx2L9/P2w2G44ePYrDhw9HfRBVojdD\npYVMaFly4/V6qaKigrRaLX388cdx/SbYQsg5Lw1GceZXLMEXf2Jignbs2EHZ2dn0xRdfiMrHf8qS\nTm/4uhbVf6WEZVkaHx+nzZs3U05ODvX09CSdt1zz0mAULWosE+gf7qutraW7776blixZQufPn5e0\nHnKjOEfJn0j3uPDwTtUzzzyDc+fOYWRkBL29vbjvvvtkrqm8LDhR/b3QSLc9WK1WYbNXZWUlvv32\nW3g8HvT19aGwsDDN/0AGMsFciCHSykrwOX/Tq9FoaP/+/Wn1WKVEcWNqpJWV4P1Ae/fuDdlOkknT\nkmRQnKj+RBL47NmztGjRInrggQfIbDYLafxvw4+2NprpKFpUf3jBysrKSKvVklarpYaGhrBp+QbA\nP3ch0nJapqJo79cfm82G9vZ2nDlzBsD8DvoTJ06ETcs7WFrt/N+/cOHCwgkBxoFiRG1tbUVNTQ1q\nampw4MCBqPFhfv9QY2MjGIYBx3HiHr0agYyJD2eCuUgGn89Hhw4dIgD0wgsviF5p8feSE3WY+DFd\nrjFa0WMqx3HU0NBAAKi5uTmhpTMpwn2RnsubKhQr6uzsLO3bt48AUHt7u2T5JoLccWBFijo9PU27\nd+8mhmHo5MmTkuS5kEj5/alyMzExgV27duH8+fP48ssvUV5enu4qZRxJidrd3Y0PP/wQGo0GixYt\nwsGDB7Fhwwap6hbCP//8g0ceeQQXL17EN998g+Li4pSVtaBJ1AT89ttvtHnzZhodHSWi+ccEmEym\nhMxFPIyMjJDBYKCbb76Zfvrpp4TzUQIpM7/Z2dl47bXXhPeSGQwGjI2NYWZmBtnZ2ZI1OgAYHBzE\ntm3bMDU1hR9++CHxNytdJ8QU1W63o66uLuR4c3Mzdu/eDWD+aWCvv/46SkpKJBd0YGAA27ZtA8uy\n6Ovrw9q1ayXNX4nEFNVkMmFgYCDi+ampKTQ1NcHlcuGDDz6QtHIOhwOlpaXQ6/Xo6enBihUrJM1f\nqSQVJhwZGcFjjz0GhmHQ2dmJm266KeG8gkNsdrsdJSUlWLt2Lex2uyqoGBIdrMfHx6m4uJja2tqS\nHtiJAhe4v/rqK2JZloqLi+natWuJVpGIFu6aaTRStkrz+eef488//8R3332HXbt2CZ/x8fGE8uNX\nTpYvX46ysjKsXLkSZ86cweLFi4U0iQTMY+1jUiSZ0LJ4Ojo6oj4jN5EnhKVzK2eqyMj11HA9rqWl\nBXV1dSgsLJTsGbn85jP+OfXXDeloWf7jp8/noxdffJEA0MGDB6OutIjtdZl0T6mUZGRAnxfnueee\no9raWgJAb731luTlKtH0EmWoqETzz/errKwkjUZD77//vhzVUAwZuUozPT2NiooK9PT0oKurC3v2\n7ElHNRRLWkQ9ffo0zp07h+7ubjz88MPpqIKiSYuojz76KLZv344lS5ako3jFk5YpDcMwqqApRJae\nyr
9v2+VyyVGc4uGvI39dg5FFVLfbDQCoqqqSo7jrBrfbjdtuuy3kuIYo9a978ng8cDqdyMvLA8Mw\nqS5O8XAcB7fbDYPBAJZlQ87LIqqKvCjmtguV/1FFVSCqqAok7Zu549073NLSgrNnzyI3NxcAsGbN\nGhw9elTu6gbQ29sLm82GmZkZrFu3Ds3NzbjxxhtFp5EcGePQIYjZO7xnzx7q7++XsXbRieeFwPG+\nNFhq0mp+o+0d9mdmZgYDAwP46KOPsHPnTjQ0NGBkZCQdVRaI54XA6XppsCyi2u123HXXXSEfh8OB\nrVu3Aoi+d3h0dBQbN26ExWJBd3c3CgsLUV9fn9Y3KrtcLuj1euG7Xq/HxMQEJicnRaVJBbKMqcnu\nHV61alXArf41NTU4duwYhoeHsWrVqpTUORY+ny/scf6RA/GmSQVp937j2Tv8yy+/4NSpUwHHiAg3\n3HCDXNUMYfny5UL4E5i3Jrm5ucjJyRGVJhWkVdQrV66guroa27dvxzvvvBM25AXMt+wjR45gaGgI\nAPDZZ59h3bp1AaZNbu6//378/PPPuHz5MgCgq6sLDz74oOg0qSCtYcKOjg68++67yM/PDzj+ySef\nYHh4GC+//DK6u7sBzE99Tpw4AY7joNfrceTIkbTv2rfb7bDZbJidncWtt96KN954A0NDQwH1Dpcm\n1cuOauxXgaR9TFWRHlVUBaKKqkBUURWIKqoCUUVVIKqoCkQVVYH8B/xakPtx+JySAAAAAElFTkSu\nQmCC\n",
118 | "text/plain": [
119 | ""
120 | ]
121 | },
122 | "metadata": {},
123 | "output_type": "display_data"
124 | }
125 | ],
126 | "source": [
127 | "# property of interest\n",
128 | "property_ = 'green_norm'\n",
129 | "lasso_alpha = 2.5e-2\n",
130 | "\n",
131 | "# format data for property \n",
132 | "log_data, y, seq, df_select = lasso_tools.data_format_all(property_, df_input)\n",
133 | "\n",
134 | "# encode sequences\n",
135 | "X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n",
136 | "X = np.array(X)\n",
137 | "\n",
138 | "# use lasso to limit the input \n",
139 | "coeffs = lasso_tools.lasso_(lasso_alpha, X, y)\n",
140 | "\n",
141 | "# reformat X with only lasso-limited set for GP model\n",
142 | "X_lasso = lasso_tools.lasso_reformat_X(coeffs, X)\n",
143 | "\n",
144 | "# evaluate cross-validation performance of GP model with lasso-limited set\n",
145 | "measured_CV, predicted_CV = lasso_tools.cross_validation(X_lasso, log_data, property_)\n",
146 | "\n",
147 | "# Bayesian ridge regression to find weights\n",
148 | "clf_ff = linear_model.BayesianRidge()\n",
149 | "clf_ff.fit(X_lasso, y)\n",
150 | "weights = clf_ff.coef_\n",
151 | "\n",
152 | "# find features \n",
153 | "df_features = lasso_tools.find_features(df_select, ss, contacts, coeffs, X, weights)\n",
154 | "\n",
155 | "# find the correct numbering for each feature\n",
156 | "df_features_reformat = lasso_tools.refromat_feature_numbering(df_features, df_select, property_, lasso_alpha)"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {
162 | "deletable": true,
163 | "editable": true
164 | },
165 | "source": [
166 | "## Peak photocurrent"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 10,
172 | "metadata": {
173 | "collapsed": false,
174 | "deletable": true,
175 | "editable": true
176 | },
177 | "outputs": [
178 | {
179 | "name": "stdout",
180 | "output_type": "stream",
181 | "text": [
182 | "20-fold corss validation of GP regression model\n",
183 | "R = 0.80\n"
184 | ]
185 | },
186 | {
187 | "data": {
188 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHgAAABvCAYAAAAntwTxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADIxJREFUeJztnW1sFEUYx/93e3ddtKFgaHOaWl+IbSIF0x5iEaTRoNWY\nGtLEWKxRSJBQTmi6NQREYjCkVetpIhRigoaQEP1govUbKYkH+kEkGiDV+AWIKUKxVTC8XXvtPX7A\n2Wz39vZ29/btjvklF3p7szPD/Od5ZubZ2d0AERE4JUvQ6wpwnIULXOJwgUscLnCJwwUucUJuFJJK\npTA8PIzKykoIguBGkSXN9PQ0xsbGUF9fD1EUddO6IvDw8DA6OjrcKOq24tChQ1i8eLFuGlcErqys\nlCsUjUbdKLKkGR0dRUdHh9yuergiMHPL0WgU1dXVbhRZMvT09GBgYADxeByJRGLGb0aGOz7J8jkD\nAwOYmJjA3r17LZ3PBfY58Xgcoihi48aNls53xUVzrJNIJLJcsxm4BZc4XGAX6enpQSgUQigUwuLF\niyGKInp6evL+VhDkAiMjI1RbW0sjIyNuFOdbysrKCMCMjyiKeX9TY6Y9uQW7SDwehyAIEAQBsVhs\nxuSJ/aaE/dbT02Pdom3tojngFmwMSZJk6xUEQT7OrJtZNLdgmynIgkwwMDCQVS5Q4FLJse6ooNgt\nWG1BTiFJEomiSIIg8DHYTaxYkBWrTyQSuHnzJrq6uuTyJiYmsH37djz99NMgK9vnbO6EmhS7BVuB\nWb0gCFRWVkaSJJnO45dffqGFCxdSKBSi/v5++Ti3YB/ArB6A6VhyOp3Gzp07sWTJEgDATz/9hDff\nfNNSPbjADqHlbo0wPDyMpqYmvPvuu9iyZQtOnDiBhoYG6xUx7TcscDu6aLNMTU3Re++9R5FIhOrq\n6ujHH3/MmZa7aB+jnHyxv9euXYvly5dj27Zt2LRpE5555hk0NzfzUGUxwpZAgiBQJBKRAxvz58+n\n77//nojyL8u4BXuMkSUSEclbbh555BGcOnUKy5cvB1D4NWB1QY7jdwuWJMnyUkYLPQvs7u6mUChE\n4XCYampq6MiRI6bzN9OeXGCyP1LFIlLqDrNu3ToKBAIEgNatW0f//vuvpfy5wCbJJYhdMKvF/+Nt\nOBwuKD8usANIkkSCIJAgCCRJkmG3/ueff1IwGCQAFAgEbBkKuMAGUApkRCzlBXlRFHOGIlle3d3d\n9NxzzxEAecy1y0NwgQ2gHHeVSxc1TLBYLDbDgquqqmbsvhAEQR5f1R/WAVg+hVoxF9gAynFXKbDa\nmnNNwNQisjy0Psoy1MetwAU2iVJspStWW6pSkFgsJh+vqqqizs5OzT1V7Bwtga3O2rnABaDcNiOK\noiFBvv32W4pGo1RRUUH3338/AaBYLJaVL3PxsViMRFHMcvvKtHpunAusgZlghlKMWbNm5XSpV65c\noTVr1hAAamlpMf3/U0/c1MftCFXeNgKrXa+Z9FqNffjwYSovLycAtHLlSspkMqbrpF56KY/rrcuL\nXmC7Q4csT7VYeuWwRmbulC2nIpEILVq0SF7XaonvRP2VFL3ATm1yU1tGvnLUQoXDYbmT7N69m7q7\nu7MsTd2RnBC76AV2OnTIYDNh9YSIwTpAWVkZbd68WbbatWvXyvVUB0uUa2HlrNzOzlr0AruFEQuO\nRCI0Z84cKisroxUrVlAkEsnyAFpLIBY0ydeJrODq9eBkMonW1la0tLRg8+bNuHbtmuW8tK6jOrnp\nXO+6ayqVgiAImJqawkMPPYT29nYcO3YMk5OT8gY65cY6BrslpaurC8CtPVYA8Ouvv9pef0MU0pP+\n/vtvampqonPnzhER0QcffEDvvPOO5R6nZVFubTpX8vLLL8uWuGzZMkqn07qzcL0hxYnhxjUXPTg4\nSK+//vqMghsbG7OWDEYrpNUYbo3HRE
QTExP09ttvay6PnHC1VnHNRY+Ojs54ak40GsW1a9dw/fp1\ny3mSavc+235ayF3uRjh9+jSWLFmCvr4+PPbYYwgGgxAEQXbfJ0+enPGvW/crFUwhPWnfvn20Y8cO\n+Xs6naba2lq6fv26pR7nxM6KfEuUdDpNu3btonA4TA8//DCdOHFCM536ggQKjCcXgmsu+ptvvqEN\nGzbI38+fP0+PPvqo5QrZ7Y71rhIREb322msUCAQoEAjQli1b6ObNm5p1Ul8uVM6a2RJJKyLlFK4J\nPD4+TkuXLpUnWR9++CFt3bq1oArZiVJgpXeYmpqi/v5+WaRIJJIzD63rxuzDxmPlBEzZCUoikpVM\nJqm1tZWeffZZWr9+PV2+fLmgCpnFSLhRkiR5krRgwQJ6/PHHCQA1NDRknavOT+u6sVJMlkbLup1y\n30UT6LAjjGd03FZuMn/ggQcomUxmpVFe49XKj4nNolW5doCUlAUbIVeFjIpj1EpzpTt37hzde++9\nBIAWLVpEV69e1SxHy/3mqo9bSzctikZgow2VqyPk216TyWTo008/pfLycqqurqbDhw9r5s/yUe7e\n8GJ2bJSiEVgPdSBfqyOoBVWmGxkZoZaWFgJAa9as0ZwbsHOUouYqy+lLgGbwvcBGdhgacd9aYmQy\nGTpw4ABVVFRQNBqlVatW6QqTKwRpdPOdF/he4Hy7JYisjXMXL16k1tZWAkCrV6+m8fFxXWH0JkR6\n3sFrfC+w1m6JQvnyyy/prrvuonnz5tFXX30lH9cTJp/4fhFUje8FtpO//vqLXnzxRQJAbW1tdOnS\nJcPn+llEPW4bgb/++muqqqqiuXPn0qFDh+SrWH6aEDlBSQqsFO2ff/6hV155hQDQ888/T+vXr/ft\nhMgJil5gLQtkooXDYbrnnnto9uzZ9Pnnn1Mmk8k5IbLjPiA/UvQCa1lgPB6X47wrV66kP/74Q/7N\n6DpZTbG68qIUWC+wceTIEaqpqaE777yT9u7da3iTuTofP69tzVCUAms19tWrV2njxo0EgFasWEFn\nzpwhIuuW5+e1rRmKUmB1Yx87dowefPBBEkWRPv74Y5qenpbT6t3Pq0exCqqmKAVm3LhxgyRJokAg\nQE1NTfT7779npbEqcKng6r5oOzl+/DgaGhqwZ88e9PX14YcffkBdXV1WOvb8R7b3mKODCx0ub49L\npVK0bds2CgaD1NjYSK+++qrpMdbuGbGfZ9hF5aKVz0TeuXMndXV1Wboma/eM2M8z7KJw0epnIre3\nt6O3txd79uyR05h5lJ+tj/9zID/PcKHDZfW48+fPU2NjIwWDQXrrrbcolUrNuJHLyZmun12vUXzv\nooeGhuiJJ56Y8UxkM0sYrds2jQrmZ9drFN8LrMbsTkSlSFrBCz3B83UkP92DlIuiEli5J4oJZUYk\ntWCFWqiyLn6lqARW3xXAxLUqUqHRKm7BFshnwWpBiqGRvcT3yyTlrZdat4eyu+J//vln/9+e6XM8\nETjfe+nj8bj890cffcRFLgBPBM4XREgkEpAkSf5u5qVSHBUuDBmm7g/OdWdfvrS3E74fg3Oh5bop\nxwsZ87l5zi18JbDadeuJWDKxYqdx3qFouxQjLrZUdmDYjavP6GhtbaUXXniBXnrpJTp9+rThCpVC\nTNgrXBmDz549i/7+fuzfvx+Dg4Po7OzEpk2bDJ/PXaw7hKyeGIlEsGvXLlRVVQEA6uvrMT4+jsnJ\nSUQikbznJxIJx599xTEg8NGjR9HZ2Zl1vLe3F6tWrQJwa6bb19eHp556SlPc6elpALcenMYpHNaO\nrF31yCtwc3Mzfvvtt5y/37hxA1u3bsXo6Cj279+vmWZsbAwA0NHRkbdCHOOMjY3hvvvu000TIMqx\n0DTAhQsXsGHDBsyfPx99fX1ZT15lpFIpDA8Po7KyEoIgWC2O8z/T09MYGxtDfX19zjZnWBb4ypUr\naG
trQ1tbG9544w1LFeU4j+VJ1hdffIGLFy9iaGgIQ0ND8vEDBw5g7ty5tlSOUzgFuWiO//FVqJJj\nP5ZdtB0MDg7is88+QyAQwKxZs7B9+3YsXLjQyyohmUwikUhgcnISdXV16O3tRXl5uad1UmK6zZwM\nqelx5swZWrZsmfzQlGQySc3NzV5Vh4iMv6LAK6y0mWcCj4yM0HfffSd/Hx8fpwULFtDExIRXVTL8\nigKvsNJmjrtoOyJhbqH3igI/uOnq6mpUV1cDMN5mjgtsRyTMLTKZjObxYNBfc1EzbeZpzS9cuID2\n9nYIgoCDBw9i9uzZXlYHd999txxWBYBLly6hoqICd9xxh4e1monpNnN84MjB5cuX6cknn6Tdu3d7\nVYUsjL6iwCustJlngY59+/bhk08+QW1t7YzjXkfCjh49ikQigXQ6jZqaGrz//vuYM2eOZ/VRYqXN\neCSrxPHX7IFjO1zgEocLXOJwgUscLnCJwwUucbjAJQ4XuMT5D6iJKNYPmN+3AAAAAElFTkSuQmCC\n",
189 | "text/plain": [
190 | ""
191 | ]
192 | },
193 | "metadata": {},
194 | "output_type": "display_data"
195 | }
196 | ],
197 | "source": [
198 | "# property of interest\n",
199 | "property_ = 'max_peak'\n",
200 | "lasso_alpha = 5e-2\n",
201 | "\n",
202 | "# format data for property \n",
203 | "log_data, y, seq, df_select = lasso_tools.data_format_all(property_, df_input)\n",
204 | "\n",
205 | "# encode sequences\n",
206 | "X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n",
207 | "X = np.array(X)\n",
208 | "\n",
209 | "# use lasso to limit the input \n",
210 | "coeffs = lasso_tools.lasso_(lasso_alpha, X, y)\n",
211 | "\n",
212 | "# reformat X with only lasso-limited set for GP model\n",
213 | "X_lasso = lasso_tools.lasso_reformat_X(coeffs, X)\n",
214 | "\n",
215 | "# evaluate cross-validation performance of GP model with lasso-limited set\n",
216 | "measured_CV, predicted_CV = lasso_tools.cross_validation(X_lasso, log_data, property_)\n",
217 | "\n",
218 | "# Bayesian ridge regression to find weights\n",
219 | "clf_ff = linear_model.BayesianRidge()\n",
220 | "clf_ff.fit(X_lasso, y)\n",
221 | "weights = clf_ff.coef_\n",
222 | "\n",
223 | "# find features \n",
224 | "df_features = lasso_tools.find_features(df_select, ss, contacts, coeffs, X, weights)\n",
225 | "\n",
226 | "# find the correct numbering for each feature\n",
227 | "df_features_reformat = lasso_tools.refromat_feature_numbering(df_features, df_select, property_, lasso_alpha)"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {
233 | "deletable": true,
234 | "editable": true
235 | },
236 | "source": [
237 | "## Off-kinetics"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 11,
243 | "metadata": {
244 | "collapsed": false,
245 | "deletable": true,
246 | "editable": true
247 | },
248 | "outputs": [
249 | {
250 | "name": "stdout",
251 | "output_type": "stream",
252 | "text": [
253 | "20-fold corss validation of GP regression model\n",
254 | "R = 0.92\n"
255 | ]
256 | },
257 | {
258 | "data": {
259 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHEAAABvCAYAAADboi87AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADSZJREFUeJztnX9oW1UUx78vL2syW9f6o6PTDaeok6VlY6nQH2PVKiVW\n24rOUtb5g41NbVZjU9HhEIfoKqOZFteCKyKUzTmG0CCT0iGNukWE1k1bYRaqo9H+sJ112C1rzcvx\nj+09XtL8eEleXn69DwTSvJt7b9+559xzzr3vhiEigkpKo0l0B1RiRxViGqAKMQ1QhZgGqEJMA7Ry\nVXTt2jWMjIwgPz8fLMvKVW3GwnEcZmZmUFhYCL1eH7KsbEIcGRlBY2OjXNWp3ODYsWMoLi4OWUY2\nIebn5wuNFhQUyFVtxjI1NYXGxkbhvoZCNiHyJrSgoACrV6+Wq9q0pbW1FZ2dnTCbzbDZbEHLSZma\nVMcmQXR2dmJhYQFdXV0x16UKMUGYzWbo9Xo0NTUBuK6Zer0era2tkVdGMuFyuej+++8nl8slV5UZ\nhU6nIwCk1+uJKLL7qWpikuCvmZGgCjFB+JtPm80Gt9sd0skJhirEOBNsrvN3bH777TccPXo0qjYk\nCdFut6O2thZ1dXVoaGjA8PBwVI2lEjE5GiKCeaFi8zk4OIiSkhLYbDZQNMu74SbNsbExKi8vp+np\naSIicjgcVFFRsaRcujk2/o5GtFitVtLr9WS1WgNe7+vro+zsbHrwwQeFe0wU2f0MK0SXy0UDAwPC\n37Ozs2QwGGhhYWFJuXQSYribLwc9PT2k1WrJZDLRv//+63NNViGK8Xq91NraSs3NzUuupZsQ5cRq\ntZJOpxMGhNfrpYMHDxIAeu6552hxcXHJd+IixCtXrlBzczM988wzdPny5ZgazTTEppnjOHr11VcJ\nAO3du5e8Xm/A78geJ05MTKChoQEsy6KnpwcrVqyIfPLNYHgnZvfu3di2bRs6OjrQ0dGBtrY2MAwT\newPhpDw3N0cPP/wwffTRRyHLpaIm+pu5eHL58mWqrKykrKwsOnHiRNjysmri8ePHMTk5idOnT6Ou\nrk54zc3NxT6CEoycSehQTE5OYsuWLRgcHERfXx/q6+vlbUCOURbpyEkWlPBAL1y4QGvXrqVVq1bR\n+fPnJX9PzZ1KJJZUVyj4RMG2bdtQXl4OnU4Hp9OJDRs2yNoOT0YLMV7wZvr48eO49957cebMGaxd\nuzZu7alCjAMVFRUAgLvvvhtff/01br/99ri2pwoxBvzzq1arFVqtFv39/dixYwdGR0eRnZ0d/47E\nMmlHOxGnC+Ig3uPxEMMwBIAYhgkaxEtFdWwUQhzE19fXCysQGo0Gr732mqClcq2IBCWm4RLlyElm\npCQAxGXm5uZoy5YtpNPpqKamRghZxFoqfi81wRC3BLhcjSYzUpag+DI6nY4KCwspLy+Pvv32W58y\n4hhU/F7qEpcqxBgwGo0EgIxGY9AyVquVsrKy6Oabb6Y777yThoeHJdcvNcGgCjEGeE0BEPRGnzlz\nhm655RZav349jY+Px6UfqmMTA2azWXgfKKdqt9vx6KOPwmAw4LvvvsOaNWuU7F5AVCH6YbPZYLVa\nA24fPHLkCJ566imYTCb09/fj1ltvTVAv/UiE+icDkSxDeb1e2r9/PwGgF198kTweT9z7p86JEpDi\nJfIOTFFREQGgd955J+YgXirqnCgBKTuuDx8+jMXFRQwPD6O7uxtvvfWWPCvxcpOIkZMIIjGfvAZm\nZ2cTAKqrq4uqnlhQzWkAwplPXjh8nMi/zp49G7AehAhB5EA1pwEIZz75NcChoSHhs+effx5ffPGF\nT94zXAiSEBIxcpINq9VKLMuSRqMhlmWJYRjatWsXEQXWYCW2dWSEJkpZGZC6etDZ2QmO4+D1erF5\n82b8/fffOHLkCABfDebrAxCXbR1Rk4
iRIwf+GhLI4Qg2D/qXraysJAB03333kdvtltxmPMkIx8bf\npEVi9sSrEG+++SYBILPZHDaIV8KM8mSEEP3hb7DRaBS0LFg4wH9uMBgIAB04cECxIF4qGSlEHl7L\nWJb1CRXEgpyfn6fq6mpiWZY+/fTTxHU2BBnh2ASDd0T8OXToELRaLTZu3IicnBx89dVXICIcPnxY\nNgcpYSRi5CiB1Wr10UQpr2BzXTTbK2Ilo82pGPE8qdFoCABlZWWRRqMhhmGEmJAXYqhsTqTbK2Il\no82pP//99x/OnTsnmNKLFy8KMaHH40FLSwtYlgXLsiGT4XRjJ1ssR5XEjUSMHKUQ5zk1Gk3Ah2Mj\nqUeJ+JBH1cQblJaWAgAYhkFzc3PUD8cmpfaJkTIqBgYG6IknnqCqqipqbm5eckhApCNHbvydDY7j\n6PXXXycA1NLSQhzHKd6nWJHVsbl06RKVlJTQ77//TkREBw8epLfffjumRuVGbDYtFgs9++yzBIDa\n29sV74tcyCpEu90uZPT5yjdt2rQkw5FoTYRo7tNqtXT06NGQ5ZV6zDtaZJ0Tp6amfE4SLigowPz8\nPK5cuRInAx85NpsNL730EhiGAcuyqK2txc6dO5cE53zQ3tHRochj3koRVoherzfwFzXJ4xONjY3h\n9OnTyM/Px/fff49Tp04FFBK/8AsguR2VCAkriVWrVmFmZkb4e3p6Grm5ubjpppvi2jExvAYVFxcv\nSX8NDQ2hrKwMAOB0OmE0GoN6k/znFosl6vXApEzBhbO3s7OzVFpaKjg27e3ttHfv3phseKSIHReI\n4rX+/n7Kyckho9Hocy5aPEnJjM1tt92GtrY2vPLKK3jssccwOjqKN954I95jywdeg4xGo6Bhx44d\nQ3V1NcrKyuBwOLBy5UpF+5JUpjgRIydW2tvbCQBt376dLBZL0nua0ZA2CXD/x8w4jqNNmzYRACou\nLiaO4xKSElOCtEm78dsHh4aGoNPpsH79evz4448Arv8ijkajSU7zpjBJLUSj0Si8X1xcxK+//orq\n6mro9XoYDAZotVp0dHRgxYoVOHToUNif40lXklqIg4ODmJycFH5qZ+vWrTh16hTcbjdGRkbAcRw4\njsNff/0FAD4bfzOJpBbi6OgoSktLwbIszp07h5MnTwrXzGazsA7Ie6Zizc0kEiJEKQHzDz/8gPLy\ncixbtgxOpxMbN270uW6z2eDxeODxeDA9PQ0iQkVFRfIF4koQb28qULKZ34nGsuySB1i0Wq3P3xqN\nRtiGuHz5cp9rK1eu9Klf6mbhVCCpQoxAN1YsRLFQYnnxe2BCbRZOpTAkqUKMQCGAxWIRcpjh5jGW\nZSU92NnU1BT06Mu0D0PiOXLC7cDmP/d4PLRnzx4CQPv27aOWlhaf68E0TMlt9UqTNOY02PMREJlA\nt9tNW7duJYZhqLKy0sfEsiwrV/dSjqQxp4HMWGdnp/B+x44dMJlM+PLLL3Hy5EmcPXsWHMcJ1zmO\nU+aAu1RH6ZHDm8Bdu3ZRUVER5ebm0jfffCNcQwCHJRUdk1iJRBNl+01hqdhsNly6dAnd3d3IycmB\n0+lEUVGRcA24/hi1wWDAL7/8ImhxV1dX+jomsaL0yHE6nYKW6XQ6uZpPO5JiThTPY8XFxWAYBnl5\neSgrK0N2djZ0Op3PIQYqMRCvkSOexxAmUOczMeLjKVMxyyInSaGJvGf68ssv44477ghZ1u12A/Bd\nhVDq12PSgbgJ0WazYX5+Hm63GxMTE9i/fz9aWlpCZl/E19I+yyIn8VL/q1ev0pNPPkkajYY+/vjj\nkN9N58xLtCRFxubEiROk1+upt7dXriYyiqSIE59++mlUVVUhLy8vXk2o3CBucyLLsqoAFUI2TeRz\nnlNTU3JVmdHw91GcSw6GbELkn9dobGyUq0oVXL+vd911V8gyDNGNEwVi5Nq1axgZGUF+fj5YlpWj\nyo
yG4zjMzMygsLAw4Lk8YmQTokriSOotiyrSUIWYBqhCTAMUXxSWA4fDAZvNhsXFRaxbtw4HDhxA\nTk5OxGWUxm6345NPPgHDMFi+fDn27dsnLIjzvP/+++jr60Nubi6A6z9p++GHH4auOM7ZI9mRciSL\n1GNblGRsbIzKy8uFJ5odDgdVVFQsKVdfX09DQ0MR1Z1yQpRyJIvUY1uUxOVy0cDAgPD37OwsGQwG\nWlhYED5bWFigwsJCMpvNVFNTQ3v27KE///wzbN0pNydKOZIlGY9tWb16NR566CEA1w/7a2trQ2Vl\nJbKysoQy09PTKCkpgdVqhd1ux4YNG9DU1CQcDhiMlBOilCNZkvnYlqtXr8JisWB8fBzvvvuuz7U1\na9agu7sb99xzDxiGwc6dOzE+Po4//vgjZJ2J/68iRMqRLMlwbEsgJiYm0NDQAJZl0dPTs+TAwAsX\nLqC3t9fnMyLCsmXLQtabckLcvHkzfvrpJ1y8eBEA8Pnnn+ORRx6JuIzS/PPPP9i+fTuqqqrwwQcf\nBEylaTQavPfee3C5XACAzz77DOvWrfOZGgIi+wyuAA6Hg2pqashkMtHu3btpbm6Ofv75Z6qtrQ1Z\nJpF0dXXRAw88QLW1tT6v8+fP+/S7t7eXHn/8cTKZTPTCCy9IcmzU3GkakHLmVGUpqhDTAFWIaYAq\nxDRAFWIaoAoxDVCFmAaoQkwD/geodBJbOZ0P5wAAAABJRU5ErkJggg==\n",
260 | "text/plain": [
261 | ""
262 | ]
263 | },
264 | "metadata": {},
265 | "output_type": "display_data"
266 | }
267 | ],
268 | "source": [
269 | "# property of interest\n",
270 | "property_ = 'kinetics_off'\n",
271 | "lasso_alpha = 3e-2\n",
272 | "\n",
273 | "# format data for property \n",
274 | "log_data, y, seq, df_select = lasso_tools.data_format_all(property_, df_input)\n",
275 | "\n",
276 | "# encode sequences\n",
277 | "X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n",
278 | "X = np.array(X)\n",
279 | "\n",
280 | "# use lasso to limit the input \n",
281 | "coeffs = lasso_tools.lasso_(lasso_alpha, X, y)\n",
282 | "\n",
283 | "# reformat X with only lasso-limited set for GP model\n",
284 | "X_lasso = lasso_tools.lasso_reformat_X(coeffs, X)\n",
285 | "\n",
286 | "# evaluate cross-validation performance of GP model with lasso-limited set\n",
287 | "measured_CV, predicted_CV = lasso_tools.cross_validation(X_lasso, log_data, property_)\n",
288 | "\n",
289 | "# Bayesian ridge regression to find weights\n",
290 | "clf_ff = linear_model.BayesianRidge()\n",
291 | "clf_ff.fit(X_lasso, y)\n",
292 | "weights = clf_ff.coef_\n",
293 | "\n",
294 | "# find features \n",
295 | "df_features = lasso_tools.find_features(df_select, ss, contacts, coeffs, X, weights)\n",
296 | "\n",
297 | "# find the correct numbering for each feature\n",
298 | "df_features_reformat = lasso_tools.refromat_feature_numbering(df_features, df_select, property_, lasso_alpha)"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "collapsed": true,
306 | "deletable": true,
307 | "editable": true
308 | },
309 | "outputs": [],
310 | "source": []
311 | }
312 | ],
313 | "metadata": {
314 | "kernelspec": {
315 | "display_name": "Python 2",
316 | "language": "python",
317 | "name": "python2"
318 | },
319 | "language_info": {
320 | "codemirror_mode": {
321 | "name": "ipython",
322 | "version": 2
323 | },
324 | "file_extension": ".py",
325 | "mimetype": "text/x-python",
326 | "name": "python",
327 | "nbconvert_exporter": "python",
328 | "pygments_lexer": "ipython2",
329 | "version": "2.7.6"
330 | }
331 | },
332 | "nbformat": 4,
333 | "nbformat_minor": 2
334 | }
335 |
--------------------------------------------------------------------------------
/regression/GP_tools.py:
--------------------------------------------------------------------------------
1 | ## Tools for GP
2 |
3 | from __future__ import division
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 | import seaborn as sns
7 | import os
8 | import pandas as pd
9 | import pickle
10 | from scipy.linalg import cho_solve
11 |
12 |
13 | # ML imports
14 | from sklearn import linear_model
15 | from sklearn.cross_validation import train_test_split
16 | from sklearn.model_selection import LeaveOneOut
17 | from sklearn.metrics.pairwise import euclidean_distances
18 | from scipy.spatial import distance
19 | from scipy import optimize, linalg
20 | import scipy
21 |
def matern_5_2_kernel(X, X_, hypers):
    """ Evaluate the Matern 5/2 kernel between the rows of X and X_.

    With r the Euclidean distance between two rows divided by the length
    scale, each entry is (1 + sqrt(5)*r + 5*r**2/3) * exp(-sqrt(5)*r).

    Parameters:
        X (np.ndarray): first set of points, one per row
        X_ (np.ndarray): second set of points, one per row
        hypers (iterable): kernel hyper-parameters; only hypers[0], the
            length scale, is read. There is no default value.
    Returns:
        K (np.ndarray): kernel values, laid out like the distance matrix
            returned by euclidean_distances(X, X_)
    """
    root_five = np.sqrt(5.0)

    # pairwise distances expressed in units of the length scale
    scaled = euclidean_distances(X, X_) / hypers[0]

    polynomial = (1.0 + root_five * scaled) + 5.0 * scaled ** 2 / 3.0
    decay = np.exp(-root_five * scaled)
    return polynomial * decay
39 |
def predict_GP(X_train, y_train, X_test, prams):
    """ Gaussian process regression predictions with the Matern 5/2 kernel.

    Parameters:
        X_train (np.ndarray): n x d training inputs
        y_train (np.ndarray): n training observations
        X_test (np.ndarray): m x d points to predict
        prams (sequence): hyper-parameters. prams[0] is the noise term
            added to the diagonal of the training kernel; prams[1:] is
            passed unchanged to matern_5_2_kernel.
    Returns:
        mu (np.ndarray): m predicted means
        var (np.ndarray): m predictive variances (the noise term is not
            added back to these)
    """
    kernel_hypers = prams[1:]

    K = matern_5_2_kernel(X_train, X_train, kernel_hypers)
    Ky = K + np.identity(len(K)) * prams[0]

    # Ky is never inverted explicitly: with Ky = L L^T every product with
    # Ky^-1 becomes a pair of triangular solves.
    L = np.linalg.cholesky(Ky)

    # alpha = Ky^-1 y
    z = linalg.solve_triangular(L, y_train, lower=True)
    alpha = linalg.solve_triangular(L.T, z, lower=False)

    K_star = matern_5_2_kernel(X_train, X_test, kernel_hypers)
    mu = np.matmul(K_star.T, alpha)

    # Variance at test point i is k(x_i, x_i) - k_i^T Ky^-1 k_i.
    # With V = L^-1 K_star, k_i^T Ky^-1 k_i is the squared norm of column i
    # of V, so only column sums are needed -- no m x m intermediate matrix
    # is built just to read off its diagonal.
    V = linalg.solve_triangular(L, K_star, lower=True)
    K_star_star = matern_5_2_kernel(X_test, X_test, kernel_hypers)
    var = np.diag(K_star_star) - np.sum(V * V, axis=0)
    return mu, var
72 |
def neg_log_marg_likelihood(log_prams, X, y):
    """ Negative log marginal likelihood of the GP (Matern 5/2 kernel).

    The hyper-parameters are supplied as natural logs, which the original
    author notes makes the optimization more stable, and are exponentiated
    here before use.

    Parameters:
        log_prams (np.ndarray): natural log of the hyper-parameters. After
            exponentiation, entry 0 is the noise term added to the kernel
            diagonal and the remaining entries go to matern_5_2_kernel.
        X (np.ndarray): training inputs
        y (np.ndarray): training observations
    Returns:
        (float) 0.5 * y^T Ky^-1 y + sum(log(diag(L))) + 0.5 * n * log(2*pi),
        where Ky = L L^T is the noisy kernel matrix and n its size.
    """
    prams = np.exp(log_prams)
    noise, kernel_hypers = prams[0], prams[1:]

    # kernel on the training data, with the noise term on the diagonal
    gram = matern_5_2_kernel(X, X, kernel_hypers)
    n_points = len(gram)
    noisy_gram = gram + np.identity(n_points) * noise

    # Cholesky factor stands in for the inverse: two triangular solves
    # give weights = Ky^-1 y
    chol = np.linalg.cholesky(noisy_gram)
    forward = linalg.solve_triangular(chol, y, lower=True)
    weights = linalg.solve_triangular(chol.T, forward, lower=False)

    data_fit = 0.5 * np.dot(y, weights)
    # sum(log(diag(L))) equals 0.5 * log(det(Ky))
    complexity = np.sum(np.log(np.diag(chol)))
    constant = 0.5 * n_points * np.log(2 * np.pi)

    return data_fit + complexity + constant
--------------------------------------------------------------------------------
/regression/__pycache__/.gitignore:
--------------------------------------------------------------------------------
1 | *.cpython-36.pyc
2 |
--------------------------------------------------------------------------------
/regression/chimera_tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import numpy as np
3 | import matplotlib.pyplot as pl
4 | import seaborn as sns
5 | import os
6 | import pandas as pd
7 | import pickle
8 |
9 |
10 |
def chimera_code2seq_convert(file_c, file_n, df_data):
    """ Add a 'seq' column to df_data holding the sequence of each chimera.

    Both library files are read as space-separated tables with the columns
    chimera, E, m, seq. Each code in df_data.block_k is looked up in the
    'c' library when it starts with 'c' and in the 'n' library when it
    starts with 'n'.

    Parameters:
        file_c: path (or file-like object) of the 'c' library file
        file_n: path (or file-like object) of the 'n' library file
        df_data (pd.DataFrame): must provide a block_k column of chimera
            codes. It is modified in place.
    Returns:
        df_data, the same object, with the new 'seq' column.
    Raises:
        ValueError: a code starts with neither 'c' nor 'n'.
        IndexError: a code is absent from its library file (the exception
            type the original positional lookup produced).
    """
    columns = ['chimera', 'E', 'm', 'seq']

    def _load_library(path):
        # Map chimera code -> sequence for one library file.
        table = pd.read_csv(path, sep=' ', names=columns)
        # When a code is listed more than once, the first row wins.
        table = table.drop_duplicates(subset='chimera', keep='first')
        return dict(zip(table['chimera'], table['seq']))

    libraries = {'c': _load_library(file_c), 'n': _load_library(file_n)}

    seq_input = []
    for code in df_data.block_k:
        prefix = code[0]
        if prefix not in libraries:
            # Skipping the row silently would misalign seq_input with df_data
            # and only surface later as a length-mismatch error.
            raise ValueError(
                "chimera code %r must start with 'c' or 'n'" % (code,))
        lookup = libraries[prefix]
        if code not in lookup:
            raise IndexError(
                "chimera code %r not found in the %r library file"
                % (code, prefix))
        seq_input.append(lookup[code])

    df_data['seq'] = seq_input

    return df_data
28 |
def normalize_(data):
    """
    Standardize data: subtract its mean, then divide by its standard deviation.

    The result has zero mean and unit standard deviation (it is NOT shifted
    to be positive). np.mean / np.std are taken over all elements of data.
    """
    centered = data - np.mean(data)
    return centered / np.std(data)
35 |
def un_normalize_(norm_data, data):
    """
    Undo the standardization applied by normalize_.

    norm_data is scaled by the standard deviation of data and shifted by the
    mean of data, i.e. it is mapped back onto the scale of the reference
    array data.
    """
    scale = np.std(data)
    offset = np.mean(data)
    return norm_data*scale + offset
--------------------------------------------------------------------------------
/regression/encoding_tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import matplotlib.pyplot as plt
3 | import os
4 | import numpy as np
5 | import pandas as pd
6 | import pickle
7 |
8 |
def one_hot_seq(seq_input):
    """
    One-hot encode aligned sequences.

    Every position is expanded into 21 columns (gap '-' followed by the 20
    amino-acid letters in alphabetical order). The width of the result is
    fixed by the length of the first sequence.

    Returns an array of shape (len(seq_input), 21 * len(seq_input[0])).
    """
    # symbol -> offset inside a position's block of columns
    alphabet = '-ACDEFGHIKLMNPQRSTVWY'
    offset_of = {symbol: offset for offset, symbol in enumerate(alphabet)}
    block = len(offset_of)

    seq_len = len(seq_input[0])
    n_seqs = len(seq_input)

    X = np.zeros((n_seqs, block*seq_len))

    # switch on exactly one column per (sequence, position)
    for row, seq in enumerate(seq_input):
        for pos, symbol in enumerate(seq):
            X[row, block*pos + offset_of[symbol]] = 1

    return X
29 |
def one_hot_contacts(seq_input, ss, contacts):
    """
    One-hot encode the residue pair found at every contact.

    contacts is a sequence of (j, k) position pairs. For each sequence and
    each contact, the ordered pair (seq[j], seq[k]) selects one of 21*21
    columns inside that contact's block. ss is accepted but not used here.

    Returns an array of shape (len(seq_input), 441 * len(contacts)).
    """
    AAs = ['-','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S',
           'T','V','W','Y']
    # ordered residue pair -> offset inside a contact's block (first residue varies slowest)
    pair_offset = {(first, second): a*len(AAs) + b
                   for a, first in enumerate(AAs)
                   for b, second in enumerate(AAs)}
    block = len(pair_offset)

    X = np.zeros((len(seq_input), block*len(contacts)))

    for row, seq in enumerate(seq_input):
        for slot, (j, k) in enumerate(contacts):
            X[row, block*slot + pair_offset[(seq[j], seq[k])]] = 1

    return X
53 |
def one_hot_(seq_input, ss, contacts):
    """
    Build the combined sequence + contact encoding.

    Each returned row is the one_hot_seq row of a sequence followed by its
    one_hot_contacts row. The result is a plain list of 1-D arrays, one per
    input sequence.
    """
    seq_rows = one_hot_seq(seq_input)
    contact_rows = one_hot_contacts(seq_input, ss, contacts)

    return [np.concatenate((seq_row, contact_rows[row]))
            for row, seq_row in enumerate(seq_rows)]
64 |
65 |
--------------------------------------------------------------------------------
/regression/inputs/lit_alignment_and_contacts_pro2.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:39f67bf7044c78b7884aafe40df15ce515adc9302772f3cd927767f9efe3b5b9
3 | size 19286
4 |
--------------------------------------------------------------------------------
/regression/lasso_tools.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import seaborn as sns
3 | import os
4 | import pandas as pd
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | import pickle
8 |
9 |
10 | # ML imports
11 | from sklearn import linear_model
12 | from scipy import optimize
13 | import scipy
14 | from sklearn.model_selection import KFold
15 |
16 | # custom imports
17 | import encoding_tools as encoding
18 | import GP_tools as GP
19 |
20 |
def data_format_all(property_, df):
    """
    Select one measured property and prepare it for regression.

    Args:
        property_: name of the column of df holding the property.
        df: DataFrame with at least the columns 'chimera', 'seq', 'block_k'
            and the property column.

    Returns:
        (log_data, y, seq, df_select) where df_select holds the columns
        'prop', 'seq', 'block_k', 'chimera' with incomplete rows dropped,
        log_data is the natural log of df_select.prop, y is log_data
        standardized to zero mean / unit std, and seq is df_select.seq as an
        array.
    """
    # remove ChR_29_10 & ChR_30_10 for kinetics and spectra because currents
    # too low for accurate measurements
    if property_ == 'green_norm' or property_ == 'kinetics_off':
        keep = (df.chimera != 'ChR_29_10') & (df.chimera != 'ChR_30_10')
        df = df[keep]

    # separate frame for the selected property; rows with any missing value are dropped
    df_select = pd.DataFrame({
        'prop': df[str(property_)],
        'seq': df['seq'],
        'block_k': df['block_k'],
        'chimera': df['chimera'],
    }).dropna()

    # log-transform, then standardize
    log_data = np.log(df_select.prop.values)
    centered = log_data - np.mean(log_data)
    y = centered/np.std(log_data)

    return log_data, y, df_select.seq.values, df_select
41 |
def cross_validation(X, log_data, property_):
    """
    20-fold cross-validation of the GP regression model.

    For every fold the log data are standardized with the mean/std of the
    training part, the GP hyper-parameters are fitted by minimizing
    GP.neg_log_marg_likelihood (L-BFGS-B, started from log([0.1, 10])), and
    the held-out points are predicted with GP.predict_GP.

    The pooled measured-vs-predicted values are plotted with a least-squares
    line, the figure is saved to
    'outputs/<property_>_matern_kernel_LASSO_CV.pdf' and shown, and the
    correlation of the fit is printed.

    Returns:
        (measured, predicted): flat lists of the standardized held-out
        values and the corresponding predictions, in fold order.
    """
    path_outputs = 'outputs/'

    splitter = KFold(n_splits=20)
    splitter.get_n_splits(X)

    fold_means = []
    fold_vars = []
    fold_targets = []

    for train_idx, test_idx in splitter.split(X):
        X_train = X[train_idx]
        X_test = X[test_idx]
        train_log = log_data[train_idx]
        test_log = log_data[test_idx]

        # both parts are scaled with the statistics of the training part only
        center = np.mean(train_log)
        spread = np.std(train_log)
        y_train = (train_log - center)/spread
        y_test = (test_log - center)/spread

        # the optimizer works on the log of the hyper-parameters
        start_log = np.log([0.1,10])
        fit = scipy.optimize.minimize(GP.neg_log_marg_likelihood, start_log,
                                      args=(X_train,y_train), method='L-BFGS-B')

        # back-transform the optimum; the first entry is additionally squared
        hypers = [np.exp(fit.x[0])**2, np.exp(fit.x[1])]

        mu, var = GP.predict_GP(X_train, y_train, X_test, hypers)

        fold_means.append(mu)
        fold_vars.append(var)
        fold_targets.append(y_test)

    # pool the folds into flat lists
    measured = [value for fold in fold_targets for value in fold]
    predicted = [value for fold in fold_means for value in fold]

    plt.figure('GP test set', figsize=(1.5, 1.5))
    plt.plot(measured, predicted, 'o', color='k', ms=3)

    # straight-line fit of predicted against measured
    fit_result = np.polyfit(measured, predicted, 1, full=True)
    slope = fit_result[0][0]
    intercept = fit_result[0][1]

    # fraction of the variance of the predictions explained by the line
    variance = np.var(predicted)
    residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(measured, predicted)])
    Rsqr = np.round(1-residuals/variance, decimals=2)

    print('20-fold corss validation of GP regression model')
    print('R = %0.2f'% np.sqrt(Rsqr))

    x_hi = np.max(measured)
    x_lo = np.min(measured)

    plt.plot([x_lo, x_hi], [slope*x_lo+intercept, slope*x_hi+intercept], '-', color='k')
    plt.savefig(path_outputs + str(property_)+'_matern_kernel_LASSO_CV.pdf', bbox_inches='tight', transparent=True)
    plt.show()
    return measured, predicted
110 |
def lasso_(alpha_, X, y):
    """
    Fit a LASSO model and return its coefficient vector.

    Args:
        alpha_: regularization strength handed to linear_model.Lasso.
        X: full feature matrix.
        y: target values.

    Returns:
        The fitted coefficients (coef_), one per column of X; the non-zero
        entries define the limited feature set used by the next model.
    """
    model = linear_model.Lasso(alpha=alpha_)
    model.fit(X, y)
    return model.coef_
121 |
def lasso_reformat_X(lasso_coeff, X):
    """
    Keep only the features with a non-zero LASSO coefficient.

    Every row of X is masked with `lasso_coeff != 0`; the reduced rows are
    returned stacked in a new array.
    """
    return np.array([row[lasso_coeff != 0] for row in X])
127 |
128 |
def id_sequence_features(index_seq, seqs):
    """
    Translate a column index of the one-hot sequence encoding back into
    (alignment position, residue).

    The encoding is assumed to use 21 columns per position in the order
    '-', 'A', 'C', ..., 'Y'; the number of positions is taken from seqs[0].

    Returns:
        (aa_numb, aa): the position as a float (result of np.floor) and the
        residue as read back from a one-character np.chararray.
    """
    alphabet = '-ACDEFGHIKLMNPQRSTVWY'
    block = len(alphabet)
    seq_len = len(seqs[0])

    # residue letter for every column of the sequence encoding
    column_residue = np.chararray(block*seq_len)
    for pos in range(seq_len):
        for offset, letter in enumerate(alphabet):
            column_residue[pos*block + offset] = letter

    aa_numb = np.floor(index_seq / block)
    aa = column_residue[index_seq]
    return aa_numb, aa
151 |
152 |
def id_contact_features(index_contacts, contacts):
    """
    Translate a column index of the one-hot contact encoding back into
    (residue pair, contact).

    The encoding is assumed to use 21*21 columns per contact, ordered by
    first residue then second residue over '-', 'A', 'C', ..., 'Y'.

    Returns:
        ([aa_first, aa_second], contact): the two residues as read back from
        a one-character np.chararray, and the matching entry of contacts.
    """
    AAs = ['-','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
    n_pairs = len(AAs)*len(AAs)

    # row r holds the two residues encoded by offset r inside a contact's block
    pair_table = np.chararray((n_pairs, 2))
    for a, first in enumerate(AAs):
        for b, second in enumerate(AAs):
            pair_table[a*len(AAs) + b][0] = first
            pair_table[a*len(AAs) + b][1] = second

    which_contact = int(np.floor(index_contacts / n_pairs))
    pair = pair_table[index_contacts % n_pairs]
    return [pair[0], pair[1]], contacts[which_contact]
177 |
def unique_columns2(data):
    """
    Identify co-varying (identical) columns of a 2-D array.

    Every column is packed into a single opaque np.void item so that
    np.unique can compare whole columns at once (bytewise comparison).

    Args:
        data (np.ndarray): 2-D array, one feature per column.

    Returns:
        (u, uind):
            u    - array of shape (data.shape[0], n_unique) holding the
                   distinct columns of data;
            uind - 1-D integer array of length data.shape[1]; uind[c] is the
                   column of u equal to column c of data, so columns sharing
                   a value in uind are identical and u[:, uind] == data.
    """
    dt = np.dtype((np.void, data.dtype.itemsize * data.shape[0]))
    # A C-contiguous copy of the transpose has the same memory layout as the
    # Fortran-ordered original, but can legally be viewed with a wider dtype:
    # NumPy >= 1.23 refuses to change the itemsize of an F-contiguous array.
    packed = np.ascontiguousarray(data.T).view(dt)
    u, uind = np.unique(packed, return_inverse=True)
    u = u.view(data.dtype).reshape(-1, data.shape[0]).T
    # NumPy 2.x returns the inverse with the shape of the input; callers
    # index and compare it as a flat per-column vector.
    return (u, np.ravel(uind))
187 |
def find_features(df, ss, contacts, coeffs, X, weights):
    """
    List the sequence and contact features selected by LASSO, expanded to
    every feature that co-varies with a selected one.

    Args:
        df: DataFrame with a 'seq' column (aligned sequences).
        ss: unused here; kept so existing callers keep working.
        contacts: sequence of (j, k) position pairs used for the contact
            part of the encoding.
        coeffs: LASSO coefficient vector, one entry per column of X.
        X: full feature matrix (sequence columns first, then contact
            columns); must be an ndarray.
        weights: one weight per non-zero entry of coeffs, in the same order.

    Returns:
        DataFrame with the columns 'weights', 'feature' (position for a
        sequence feature, position pair for a contact feature), 'type'
        ('seq' or 'contact'), 'aa' and 'feature_group' (features sharing a
        weight share a group number).
    """
    seqs = df['seq'].values

    # width of the sequence part of X; columns at or beyond it are contact features
    n_seq_features = np.shape(encoding.one_hot_seq(seqs))[1]

    # find the non-zero features
    index_not_zero = np.where(coeffs != 0)[0]

    # find the co-varying columns: identical columns of X share a value in uind
    u, uind = unique_columns2(X)
    vects_covary = uind[index_not_zero]
    lim_set = list(set(vects_covary))

    # lasso-selected features with their weights
    df_lasso_features = pd.DataFrame()
    df_lasso_features['index_not_zero'] = index_not_zero
    df_lasso_features['vects_covary'] = vects_covary
    df_lasso_features['weights'] = weights

    # for each distinct selected column pattern: all columns of X sharing it,
    # and the weight of the first selected feature with that pattern
    co_vary_lim_set = []
    co_vary_lim_set_weights = []
    for pattern in lim_set:
        members, = np.where(uind == pattern)
        co_vary_lim_set.append(members)
        first_weight = df_lasso_features[df_lasso_features.vects_covary == pattern].weights.values[0]
        co_vary_lim_set_weights.append(first_weight)

    # build list of all features, separating sequence from contact features
    all_features = []
    feature_type = []
    weights_ = []
    aa_ = []
    for members, weight in zip(co_vary_lim_set, co_vary_lim_set_weights):
        for j in members:
            if j < n_seq_features:
                aa_number, aa = id_sequence_features(j, seqs)
                all_features.append(aa_number)
                aa_.append(aa)
                feature_type.append('seq')
            else:
                # j == n_seq_features is the FIRST contact column; the former
                # strict '>' comparison silently dropped that feature
                contact_aas, contact_pos = id_contact_features(j - n_seq_features, contacts)
                all_features.append(contact_pos)
                aa_.append(contact_aas)
                feature_type.append('contact')
            weights_.append(weight)

    df_features = pd.DataFrame(dtype=object)
    df_features['weights'] = weights_
    df_features['feature'] = all_features
    df_features['type'] = feature_type
    df_features['aa'] = aa_

    # define different co-varying groups: one group number per distinct weight
    groups_ = list(set(df_features.weights))
    group_of = {weight: number for number, weight in enumerate(groups_)}
    df_features['feature_group'] = [group_of[weight] for weight in df_features.weights]
    return df_features
255 |
def _parent_numbering(df_select, parent, forced_positions, n_pad):
    """Gap-stripped numbering of one parent: returns (alignment indices, residues), both left-padded by n_pad."""
    aligned = df_select[df_select.chimera == parent].seq.values[0]

    kept_index = []
    kept_residue = []
    for pos, residue in enumerate(aligned):
        # gaps are dropped, except at the positions that are always kept
        if pos in forced_positions or residue != '-':
            kept_index.append(pos)
            kept_residue.append(residue)

    return [-1]*n_pad + kept_index, ''.join(['-']*n_pad + kept_residue)


def _renumber_features(df_features, numbering, residues):
    """Map every feature onto a parent's numbering: returns (positions, residues found there)."""
    positions = []
    amino_acids = []
    for row, feature in enumerate(df_features.feature):
        if df_features.type[row] == 'seq':
            where = numbering.index(feature)
            positions.append(where)
            amino_acids.append(residues[where])
        else:
            first = numbering.index(feature[0])
            second = numbering.index(feature[1])
            positions.append([first, second])
            amino_acids.append([residues[first], residues[second]])
    return positions, amino_acids


def refromat_feature_numbering(df_features, df_select, property_, lasso_alpha):
    """
    Re-express alignment-based feature positions in the numbering of the
    three parents C1C2, CheRiff and CsChrim.

    For each parent the gaps of its aligned sequence are dropped (CheRiff
    always keeps alignment positions 22-24, CsChrim position 22) and the
    result is left-padded by a hard-coded number of entries (49, 74 and 47
    respectively; presumably the offset to each parent's own residue
    numbering - TODO confirm). The index of a feature's alignment position
    in that padded list is its adjusted number; the C1C2 numbering is the
    one used for plotting on 3ug9.pdb.

    Adds the columns '<parent>_features_adjust' and '<parent>_aa_adjust' to
    df_features in place, writes the table to
    'outputs/matern_<property_>_<lasso_alpha>_LASSO.csv' and returns
    df_features. A feature that falls on a dropped gap of a parent raises
    ValueError (list.index).
    """
    # (parent name, alignment positions kept even when gapped, left padding)
    parents = (
        ('C1C2', (), 49),
        ('CheRiff', (22, 23, 24), 74),
        ('CsChrim', (22,), 47),
    )

    # all numberings are derived before df_features is touched
    numberings = [_parent_numbering(df_select, name, forced, n_pad)
                  for name, forced, n_pad in parents]

    path_outputs = 'outputs/'

    for (name, _forced, _n_pad), (numbering, residues) in zip(parents, numberings):
        positions, amino_acids = _renumber_features(df_features, numbering, residues)
        df_features[name + '_features_adjust'] = positions
        df_features[name + '_aa_adjust'] = amino_acids

    df_features.to_csv(path_outputs+'matern_'+ str(property_) +'_' + str(lasso_alpha) + '_LASSO.csv')
    return df_features
--------------------------------------------------------------------------------
/regression/outputs/green_norm_matern_kernel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/green_norm_matern_kernel.pdf
--------------------------------------------------------------------------------
/regression/outputs/green_norm_matern_kernel_CV_fig1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/green_norm_matern_kernel_CV_fig1.pdf
--------------------------------------------------------------------------------
/regression/outputs/green_norm_matern_kernel_LASSO_CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/green_norm_matern_kernel_LASSO_CV.pdf
--------------------------------------------------------------------------------
/regression/outputs/kinetics_off_matern_kernel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/kinetics_off_matern_kernel.pdf
--------------------------------------------------------------------------------
/regression/outputs/kinetics_off_matern_kernel_CV_fig1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/kinetics_off_matern_kernel_CV_fig1.pdf
--------------------------------------------------------------------------------
/regression/outputs/kinetics_off_matern_kernel_LASSO_CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/kinetics_off_matern_kernel_LASSO_CV.pdf
--------------------------------------------------------------------------------
/regression/outputs/matern_green_norm_0.025_LASSO.csv:
--------------------------------------------------------------------------------
1 | ,weights,feature,type,aa,feature_group,C1C2_features_adjust,C1C2_aa_adjust,CheRiff_features_adjust,CheRiff_aa_adjust,CsChrim_features_adjust,CsChrim_aa_adjust
2 | 0,-0.0477001478388,"(161, 197)",contact,"['D', 'T']",3,"[195, 227]","['D', 'T']","[220, 252]","['D', 'T']","[193, 225]","['C', 'M']"
3 | 1,-0.0477001478388,"(164, 197)",contact,"['T', 'T']",3,"[198, 227]","['T', 'T']","[223, 252]","['T', 'T']","[196, 225]","['M', 'M']"
4 | 2,-0.0999688919254,"(164, 190)",contact,"['T', 'G']",14,"[198, 220]","['T', 'G']","[223, 245]","['T', 'G']","[196, 218]","['M', 'S']"
5 | 3,-0.0999688919254,"(170, 186)",contact,"['T', 'F']",14,"[204, 216]","['T', 'F']","[229, 241]","['T', 'F']","[202, 214]","['A', 'L']"
6 | 4,-0.225551909895,"(172, 247)",contact,"['A', 'F']",16,"[206, 269]","['A', 'F']","[231, 294]","['A', 'F']","[204, 267]","['G', 'W']"
7 | 5,-0.106229081439,"(158, 161)",contact,"['L', 'D']",2,"[192, 195]","['L', 'D']","[217, 220]","['L', 'D']","[190, 193]","['I', 'C']"
8 | 6,-0.0140522194844,158.0,seq,L,11,192,L,217,L,190,I
9 | 7,-0.0140522194844,"(134, 158)",contact,"['P', 'L']",11,"[168, 192]","['P', 'L']","[193, 217]","['P', 'L']","[166, 190]","['P', 'I']"
10 | 8,-0.0140522194844,"(137, 158)",contact,"['L', 'L']",11,"[171, 192]","['L', 'L']","[196, 217]","['L', 'L']","[169, 190]","['L', 'I']"
11 | 9,-0.0140522194844,"(138, 158)",contact,"['I', 'L']",11,"[172, 192]","['I', 'L']","[197, 217]","['I', 'L']","[170, 190]","['I', 'I']"
12 | 10,-0.0140522194844,"(154, 158)",contact,"['T', 'L']",11,"[188, 192]","['T', 'L']","[213, 217]","['T', 'L']","[186, 190]","['T', 'I']"
13 | 11,-0.0140522194844,"(155, 158)",contact,"['M', 'L']",11,"[189, 192]","['M', 'L']","[214, 217]","['M', 'L']","[187, 190]","['M', 'I']"
14 | 12,-0.0140522194844,"(157, 158)",contact,"['L', 'L']",11,"[191, 192]","['L', 'L']","[216, 217]","['L', 'L']","[189, 190]","['L', 'I']"
15 | 13,-0.0140522194844,"(158, 159)",contact,"['L', 'V']",11,"[192, 193]","['L', 'V']","[217, 218]","['L', 'V']","[190, 191]","['I', 'V']"
16 | 14,-0.0140522194844,"(158, 160)",contact,"['L', 'S']",11,"[192, 194]","['L', 'S']","[217, 219]","['L', 'S']","[190, 192]","['I', 'S']"
17 | 15,-0.00115578746512,177.0,seq,G,9,210,G,235,G,208,D
18 | 16,-0.00115578746512,179.0,seq,V,9,212,V,237,V,210,L
19 | 17,-0.00115578746512,186.0,seq,F,9,216,F,241,F,214,L
20 | 18,-0.00115578746512,190.0,seq,G,9,220,G,245,G,218,S
21 | 19,-0.00115578746512,191.0,seq,L,9,221,L,246,L,219,C
22 | 20,-0.00115578746512,"(171, 186)",contact,"['A', 'F']",9,"[205, 216]","['A', 'F']","[230, 241]","['A', 'F']","[203, 214]","['A', 'L']"
23 | 21,-0.00115578746512,"(177, 179)",contact,"['G', 'V']",9,"[210, 212]","['G', 'V']","[235, 237]","['G', 'V']","[208, 210]","['D', 'L']"
24 | 22,-0.00115578746512,"(179, 186)",contact,"['V', 'F']",9,"[212, 216]","['V', 'F']","[237, 241]","['V', 'F']","[210, 214]","['L', 'L']"
25 | 23,-0.00115578746512,"(186, 190)",contact,"['F', 'G']",9,"[216, 220]","['F', 'G']","[241, 245]","['F', 'G']","[214, 218]","['L', 'S']"
26 | 24,-0.00115578746512,"(190, 191)",contact,"['G', 'L']",9,"[220, 221]","['G', 'L']","[245, 246]","['G', 'L']","[218, 219]","['S', 'C']"
27 | 25,-0.00115578746512,"(190, 193)",contact,"['G', 'Y']",9,"[220, 223]","['G', 'Y']","[245, 248]","['G', 'Y']","[218, 221]","['S', 'Y']"
28 | 26,-0.00115578746512,"(190, 194)",contact,"['G', 'G']",9,"[220, 224]","['G', 'G']","[245, 249]","['G', 'G']","[218, 222]","['S', 'G']"
29 | 27,-0.00115578746512,"(191, 193)",contact,"['L', 'Y']",9,"[221, 223]","['L', 'Y']","[246, 248]","['L', 'Y']","[219, 221]","['C', 'Y']"
30 | 28,-0.00115578746512,"(191, 194)",contact,"['L', 'G']",9,"[221, 224]","['L', 'G']","[246, 249]","['L', 'G']","[219, 222]","['C', 'G']"
31 | 29,0.00115578746512,177.0,seq,D,19,210,G,235,G,208,D
32 | 30,0.00115578746512,179.0,seq,L,19,212,V,237,V,210,L
33 | 31,0.00115578746512,186.0,seq,L,19,216,F,241,F,214,L
34 | 32,0.00115578746512,188.0,seq,I,19,218,L,243,C,216,I
35 | 33,0.00115578746512,189.0,seq,V,19,219,M,244,I,217,V
36 | 34,0.00115578746512,190.0,seq,S,19,220,G,245,G,218,S
37 | 35,0.00115578746512,191.0,seq,C,19,221,L,246,L,219,C
38 | 36,0.00115578746512,192.0,seq,I,19,222,C,247,V,220,I
39 | 37,0.00115578746512,"(171, 186)",contact,"['A', 'L']",19,"[205, 216]","['A', 'F']","[230, 241]","['A', 'F']","[203, 214]","['A', 'L']"
40 | 38,0.00115578746512,"(174, 179)",contact,"['A', 'L']",19,"[208, 212]","['S', 'V']","[233, 237]","['A', 'V']","[206, 210]","['A', 'L']"
41 | 39,0.00115578746512,"(174, 186)",contact,"['A', 'L']",19,"[208, 216]","['S', 'F']","[233, 241]","['A', 'F']","[206, 214]","['A', 'L']"
42 | 40,0.00115578746512,"(176, 177)",contact,"['T', 'D']",19,"[209, 210]","['K', 'G']","[234, 235]","['T', 'G']","[207, 208]","['T', 'D']"
43 | 41,0.00115578746512,"(176, 179)",contact,"['T', 'L']",19,"[209, 212]","['K', 'V']","[234, 237]","['T', 'V']","[207, 210]","['T', 'L']"
44 | 42,0.00115578746512,"(177, 178)",contact,"['D', 'W']",19,"[210, 211]","['G', 'Y']","[235, 236]","['G', 'W']","[208, 209]","['D', 'W']"
45 | 43,0.00115578746512,"(177, 179)",contact,"['D', 'L']",19,"[210, 212]","['G', 'V']","[235, 237]","['G', 'V']","[208, 210]","['D', 'L']"
46 | 44,0.00115578746512,"(177, 180)",contact,"['D', 'K']",19,"[210, 213]","['G', 'R']","[235, 238]","['G', 'K']","[208, 211]","['D', 'K']"
47 | 45,0.00115578746512,"(177, 184)",contact,"['D', 'W']",19,"[210, 214]","['G', 'V']","[235, 239]","['G', 'W']","[208, 212]","['D', 'W']"
48 | 46,0.00115578746512,"(178, 179)",contact,"['W', 'L']",19,"[211, 212]","['Y', 'V']","[236, 237]","['W', 'V']","[209, 210]","['W', 'L']"
49 | 47,0.00115578746512,"(179, 180)",contact,"['L', 'K']",19,"[212, 213]","['V', 'R']","[237, 238]","['V', 'K']","[210, 211]","['L', 'K']"
50 | 48,0.00115578746512,"(179, 184)",contact,"['L', 'W']",19,"[212, 214]","['V', 'V']","[237, 239]","['V', 'W']","[210, 212]","['L', 'W']"
51 | 49,0.00115578746512,"(179, 185)",contact,"['L', 'L']",19,"[212, 215]","['V', 'I']","[237, 240]","['V', 'L']","[210, 213]","['L', 'L']"
52 | 50,0.00115578746512,"(179, 186)",contact,"['L', 'L']",19,"[212, 216]","['V', 'F']","[237, 241]","['V', 'F']","[210, 214]","['L', 'L']"
53 | 51,0.00115578746512,"(180, 186)",contact,"['K', 'L']",19,"[213, 216]","['R', 'F']","[238, 241]","['K', 'F']","[211, 214]","['K', 'L']"
54 | 52,0.00115578746512,"(184, 186)",contact,"['W', 'L']",19,"[214, 216]","['V', 'F']","[239, 241]","['W', 'F']","[212, 214]","['W', 'L']"
55 | 53,0.00115578746512,"(184, 188)",contact,"['W', 'I']",19,"[214, 218]","['V', 'L']","[239, 243]","['W', 'C']","[212, 216]","['W', 'I']"
56 | 54,0.00115578746512,"(185, 186)",contact,"['L', 'L']",19,"[215, 216]","['I', 'F']","[240, 241]","['L', 'F']","[213, 214]","['L', 'L']"
57 | 55,0.00115578746512,"(185, 188)",contact,"['L', 'I']",19,"[215, 218]","['I', 'L']","[240, 243]","['L', 'C']","[213, 216]","['L', 'I']"
58 | 56,0.00115578746512,"(185, 189)",contact,"['L', 'V']",19,"[215, 219]","['I', 'M']","[240, 244]","['L', 'I']","[213, 217]","['L', 'V']"
59 | 57,0.00115578746512,"(186, 187)",contact,"['L', 'Y']",19,"[216, 217]","['F', 'F']","[241, 242]","['F', 'Y']","[214, 215]","['L', 'Y']"
60 | 58,0.00115578746512,"(186, 188)",contact,"['L', 'I']",19,"[216, 218]","['F', 'L']","[241, 243]","['F', 'C']","[214, 216]","['L', 'I']"
61 | 59,0.00115578746512,"(186, 189)",contact,"['L', 'V']",19,"[216, 219]","['F', 'M']","[241, 244]","['F', 'I']","[214, 217]","['L', 'V']"
62 | 60,0.00115578746512,"(186, 190)",contact,"['L', 'S']",19,"[216, 220]","['F', 'G']","[241, 245]","['F', 'G']","[214, 218]","['L', 'S']"
63 | 61,0.00115578746512,"(187, 188)",contact,"['Y', 'I']",19,"[217, 218]","['F', 'L']","[242, 243]","['Y', 'C']","[215, 216]","['Y', 'I']"
64 | 62,0.00115578746512,"(187, 189)",contact,"['Y', 'V']",19,"[217, 219]","['F', 'M']","[242, 244]","['Y', 'I']","[215, 217]","['Y', 'V']"
65 | 63,0.00115578746512,"(187, 190)",contact,"['Y', 'S']",19,"[217, 220]","['F', 'G']","[242, 245]","['Y', 'G']","[215, 218]","['Y', 'S']"
66 | 64,0.00115578746512,"(187, 191)",contact,"['Y', 'C']",19,"[217, 221]","['F', 'L']","[242, 246]","['Y', 'L']","[215, 219]","['Y', 'C']"
67 | 65,0.00115578746512,"(188, 189)",contact,"['I', 'V']",19,"[218, 219]","['L', 'M']","[243, 244]","['C', 'I']","[216, 217]","['I', 'V']"
68 | 66,0.00115578746512,"(188, 190)",contact,"['I', 'S']",19,"[218, 220]","['L', 'G']","[243, 245]","['C', 'G']","[216, 218]","['I', 'S']"
69 | 67,0.00115578746512,"(188, 191)",contact,"['I', 'C']",19,"[218, 221]","['L', 'L']","[243, 246]","['C', 'L']","[216, 219]","['I', 'C']"
70 | 68,0.00115578746512,"(188, 192)",contact,"['I', 'I']",19,"[218, 222]","['L', 'C']","[243, 247]","['C', 'V']","[216, 220]","['I', 'I']"
71 | 69,0.00115578746512,"(189, 190)",contact,"['V', 'S']",19,"[219, 220]","['M', 'G']","[244, 245]","['I', 'G']","[217, 218]","['V', 'S']"
72 | 70,0.00115578746512,"(189, 191)",contact,"['V', 'C']",19,"[219, 221]","['M', 'L']","[244, 246]","['I', 'L']","[217, 219]","['V', 'C']"
73 | 71,0.00115578746512,"(189, 192)",contact,"['V', 'I']",19,"[219, 222]","['M', 'C']","[244, 247]","['I', 'V']","[217, 220]","['V', 'I']"
74 | 72,0.00115578746512,"(189, 193)",contact,"['V', 'Y']",19,"[219, 223]","['M', 'Y']","[244, 248]","['I', 'Y']","[217, 221]","['V', 'Y']"
75 | 73,0.00115578746512,"(190, 191)",contact,"['S', 'C']",19,"[220, 221]","['G', 'L']","[245, 246]","['G', 'L']","[218, 219]","['S', 'C']"
76 | 74,0.00115578746512,"(190, 192)",contact,"['S', 'I']",19,"[220, 222]","['G', 'C']","[245, 247]","['G', 'V']","[218, 220]","['S', 'I']"
77 | 75,0.00115578746512,"(190, 193)",contact,"['S', 'Y']",19,"[220, 223]","['G', 'Y']","[245, 248]","['G', 'Y']","[218, 221]","['S', 'Y']"
78 | 76,0.00115578746512,"(190, 194)",contact,"['S', 'G']",19,"[220, 224]","['G', 'G']","[245, 249]","['G', 'G']","[218, 222]","['S', 'G']"
79 | 77,0.00115578746512,"(191, 192)",contact,"['C', 'I']",19,"[221, 222]","['L', 'C']","[246, 247]","['L', 'V']","[219, 220]","['C', 'I']"
80 | 78,0.00115578746512,"(191, 193)",contact,"['C', 'Y']",19,"[221, 223]","['L', 'Y']","[246, 248]","['L', 'Y']","[219, 221]","['C', 'Y']"
81 | 79,0.00115578746512,"(191, 194)",contact,"['C', 'G']",19,"[221, 224]","['L', 'G']","[246, 249]","['L', 'G']","[219, 222]","['C', 'G']"
82 | 80,0.00115578746512,"(192, 193)",contact,"['I', 'Y']",19,"[222, 223]","['C', 'Y']","[247, 248]","['V', 'Y']","[220, 221]","['I', 'Y']"
83 | 81,0.00115578746512,"(192, 194)",contact,"['I', 'G']",19,"[222, 224]","['C', 'G']","[247, 249]","['V', 'G']","[220, 222]","['I', 'G']"
84 | 82,0.0837512338557,"(167, 187)",contact,"['F', 'Y']",10,"[201, 217]","['W', 'F']","[226, 242]","['M', 'Y']","[199, 215]","['F', 'Y']"
85 | 83,0.0837512338557,"(170, 174)",contact,"['A', 'A']",10,"[204, 208]","['T', 'S']","[229, 233]","['T', 'A']","[202, 206]","['A', 'A']"
86 | 84,0.0837512338557,"(172, 174)",contact,"['G', 'A']",10,"[206, 208]","['A', 'S']","[231, 233]","['A', 'A']","[204, 206]","['G', 'A']"
87 | 85,0.0837512338557,"(172, 180)",contact,"['G', 'K']",10,"[206, 213]","['A', 'R']","[231, 238]","['A', 'K']","[204, 211]","['G', 'K']"
88 | 86,-0.456801277898,"(202, 237)",contact,"['G', 'Y']",17,"[232, 259]","['A', 'F']","[257, 284]","['G', 'Y']","[230, 257]","['A', 'F']"
89 | 87,-0.339700102915,"(192, 195)",contact,"['C', 'I']",0,"[222, 225]","['C', 'I']","[247, 250]","['V', 'T']","[220, 223]","['I', 'G']"
90 | 88,0.00260809419317,36.0,seq,P,8,85,N,110,L,83,P
91 | 89,0.00260809419317,37.0,seq,G,8,86,A,111,W,84,G
92 | 90,0.00260809419317,40.0,seq,I,8,89,L,114,E,87,I
93 | 91,0.00260809419317,41.0,seq,G,8,90,A,115,T,88,G
94 | 92,0.00260809419317,43.0,seq,Q,8,92,N,117,R,90,Q
95 | 93,0.00260809419317,44.0,seq,V,8,93,I,118,G,91,V
96 | 94,0.00260809419317,45.0,seq,C,8,94,L,119,F,92,C
97 | 95,0.00260809419317,"(34, 36)",contact,"['G', 'P']",8,"[83, 85]","['G', 'N']","[108, 110]","['G', 'L']","[81, 83]","['G', 'P']"
98 | 96,0.00260809419317,"(35, 36)",contact,"['T', 'P']",8,"[84, 85]","['T', 'N']","[109, 110]","['A', 'L']","[82, 83]","['T', 'P']"
99 | 97,0.00260809419317,"(35, 37)",contact,"['T', 'G']",8,"[84, 86]","['T', 'A']","[109, 111]","['A', 'W']","[82, 84]","['T', 'G']"
100 | 98,0.00260809419317,"(36, 37)",contact,"['P', 'G']",8,"[85, 86]","['N', 'A']","[110, 111]","['L', 'W']","[83, 84]","['P', 'G']"
101 | 99,0.00260809419317,"(36, 38)",contact,"['P', 'E']",8,"[85, 87]","['N', 'E']","[110, 112]","['L', 'E']","[83, 85]","['P', 'E']"
102 | 100,0.00260809419317,"(36, 39)",contact,"['P', 'K']",8,"[85, 88]","['N', 'K']","[110, 113]","['L', 'Q']","[83, 86]","['P', 'K']"
103 | 101,0.00260809419317,"(36, 40)",contact,"['P', 'I']",8,"[85, 89]","['N', 'L']","[110, 114]","['L', 'E']","[83, 87]","['P', 'I']"
104 | 102,0.00260809419317,"(37, 38)",contact,"['G', 'E']",8,"[86, 87]","['A', 'E']","[111, 112]","['W', 'E']","[84, 85]","['G', 'E']"
105 | 103,0.00260809419317,"(37, 39)",contact,"['G', 'K']",8,"[86, 88]","['A', 'K']","[111, 113]","['W', 'Q']","[84, 86]","['G', 'K']"
106 | 104,0.00260809419317,"(37, 40)",contact,"['G', 'I']",8,"[86, 89]","['A', 'L']","[111, 114]","['W', 'E']","[84, 87]","['G', 'I']"
107 | 105,0.00260809419317,"(37, 41)",contact,"['G', 'G']",8,"[86, 90]","['A', 'A']","[111, 115]","['W', 'T']","[84, 88]","['G', 'G']"
108 | 106,0.00260809419317,"(38, 40)",contact,"['E', 'I']",8,"[87, 89]","['E', 'L']","[112, 114]","['E', 'E']","[85, 87]","['E', 'I']"
109 | 107,0.00260809419317,"(38, 41)",contact,"['E', 'G']",8,"[87, 90]","['E', 'A']","[112, 115]","['E', 'T']","[85, 88]","['E', 'G']"
110 | 108,0.00260809419317,"(39, 40)",contact,"['K', 'I']",8,"[88, 89]","['K', 'L']","[113, 114]","['Q', 'E']","[86, 87]","['K', 'I']"
111 | 109,0.00260809419317,"(39, 41)",contact,"['K', 'G']",8,"[88, 90]","['K', 'A']","[113, 115]","['Q', 'T']","[86, 88]","['K', 'G']"
112 | 110,0.00260809419317,"(39, 43)",contact,"['K', 'Q']",8,"[88, 92]","['K', 'N']","[113, 117]","['Q', 'R']","[86, 90]","['K', 'Q']"
113 | 111,0.00260809419317,"(40, 41)",contact,"['I', 'G']",8,"[89, 90]","['L', 'A']","[114, 115]","['E', 'T']","[87, 88]","['I', 'G']"
114 | 112,0.00260809419317,"(40, 42)",contact,"['I', 'A']",8,"[89, 91]","['L', 'A']","[114, 116]","['E', 'A']","[87, 89]","['I', 'A']"
115 | 113,0.00260809419317,"(40, 43)",contact,"['I', 'Q']",8,"[89, 92]","['L', 'N']","[114, 117]","['E', 'R']","[87, 90]","['I', 'Q']"
116 | 114,0.00260809419317,"(40, 44)",contact,"['I', 'V']",8,"[89, 93]","['L', 'I']","[114, 118]","['E', 'G']","[87, 91]","['I', 'V']"
117 | 115,0.00260809419317,"(41, 42)",contact,"['G', 'A']",8,"[90, 91]","['A', 'A']","[115, 116]","['T', 'A']","[88, 89]","['G', 'A']"
118 | 116,0.00260809419317,"(41, 43)",contact,"['G', 'Q']",8,"[90, 92]","['A', 'N']","[115, 117]","['T', 'R']","[88, 90]","['G', 'Q']"
119 | 117,0.00260809419317,"(41, 44)",contact,"['G', 'V']",8,"[90, 93]","['A', 'I']","[115, 118]","['T', 'G']","[88, 91]","['G', 'V']"
120 | 118,0.00260809419317,"(41, 45)",contact,"['G', 'C']",8,"[90, 94]","['A', 'L']","[115, 119]","['T', 'F']","[88, 92]","['G', 'C']"
121 | 119,0.00260809419317,"(42, 43)",contact,"['A', 'Q']",8,"[91, 92]","['A', 'N']","[116, 117]","['A', 'R']","[89, 90]","['A', 'Q']"
122 | 120,0.00260809419317,"(42, 44)",contact,"['A', 'V']",8,"[91, 93]","['A', 'I']","[116, 118]","['A', 'G']","[89, 91]","['A', 'V']"
123 | 121,0.00260809419317,"(42, 45)",contact,"['A', 'C']",8,"[91, 94]","['A', 'L']","[116, 119]","['A', 'F']","[89, 92]","['A', 'C']"
124 | 122,0.00260809419317,"(43, 44)",contact,"['Q', 'V']",8,"[92, 93]","['N', 'I']","[117, 118]","['R', 'G']","[90, 91]","['Q', 'V']"
125 | 123,0.00260809419317,"(43, 45)",contact,"['Q', 'C']",8,"[92, 94]","['N', 'L']","[117, 119]","['R', 'F']","[90, 92]","['Q', 'C']"
126 | 124,0.00260809419317,"(43, 46)",contact,"['Q', 'Q']",8,"[92, 95]","['N', 'Q']","[117, 120]","['R', 'Q']","[90, 93]","['Q', 'Q']"
127 | 125,0.00260809419317,"(43, 47)",contact,"['Q', 'W']",8,"[92, 96]","['N', 'W']","[117, 121]","['R', 'W']","[90, 94]","['Q', 'W']"
128 | 126,0.00260809419317,"(44, 45)",contact,"['V', 'C']",8,"[93, 94]","['I', 'L']","[118, 119]","['G', 'F']","[91, 92]","['V', 'C']"
129 | 127,0.00260809419317,"(44, 46)",contact,"['V', 'Q']",8,"[93, 95]","['I', 'Q']","[118, 120]","['G', 'Q']","[91, 93]","['V', 'Q']"
130 | 128,0.00260809419317,"(44, 47)",contact,"['V', 'W']",8,"[93, 96]","['I', 'W']","[118, 121]","['G', 'W']","[91, 94]","['V', 'W']"
131 | 129,0.00260809419317,"(44, 48)",contact,"['V', 'I']",8,"[93, 97]","['I', 'I']","[118, 122]","['G', 'F']","[91, 95]","['V', 'I']"
132 | 130,0.00260809419317,"(45, 46)",contact,"['C', 'Q']",8,"[94, 95]","['L', 'Q']","[119, 120]","['F', 'Q']","[92, 93]","['C', 'Q']"
133 | 131,0.00260809419317,"(45, 47)",contact,"['C', 'W']",8,"[94, 96]","['L', 'W']","[119, 121]","['F', 'W']","[92, 94]","['C', 'W']"
134 | 132,0.00260809419317,"(45, 48)",contact,"['C', 'I']",8,"[94, 97]","['L', 'I']","[119, 122]","['F', 'F']","[92, 95]","['C', 'I']"
135 | 133,0.00260809419317,"(45, 268)",contact,"['C', 'I']",8,"[94, 290]","['L', 'I']","[119, 315]","['F', 'I']","[92, 288]","['C', 'I']"
136 | 134,-0.0852051821975,"(242, 245)",contact,"['M', 'I']",1,"[264, 267]","['M', 'I']","[289, 292]","['M', 'G']","[262, 265]","['S', 'I']"
137 | 135,-0.0852051821975,"(243, 245)",contact,"['F', 'I']",1,"[265, 267]","['F', 'I']","[290, 292]","['F', 'G']","[263, 265]","['Y', 'I']"
138 | 136,-0.146294383456,"(48, 50)",contact,"['F', 'V']",5,"[97, 99]","['I', 'F']","[122, 124]","['F', 'V']","[95, 97]","['I', 'F']"
139 | 137,0.0140522194844,158.0,seq,I,20,192,L,217,L,190,I
140 | 138,0.0140522194844,"(134, 158)",contact,"['P', 'I']",20,"[168, 192]","['P', 'L']","[193, 217]","['P', 'L']","[166, 190]","['P', 'I']"
141 | 139,0.0140522194844,"(137, 158)",contact,"['L', 'I']",20,"[171, 192]","['L', 'L']","[196, 217]","['L', 'L']","[169, 190]","['L', 'I']"
142 | 140,0.0140522194844,"(138, 158)",contact,"['I', 'I']",20,"[172, 192]","['I', 'L']","[197, 217]","['I', 'L']","[170, 190]","['I', 'I']"
143 | 141,0.0140522194844,"(154, 158)",contact,"['T', 'I']",20,"[188, 192]","['T', 'L']","[213, 217]","['T', 'L']","[186, 190]","['T', 'I']"
144 | 142,0.0140522194844,"(155, 158)",contact,"['M', 'I']",20,"[189, 192]","['M', 'L']","[214, 217]","['M', 'L']","[187, 190]","['M', 'I']"
145 | 143,0.0140522194844,"(157, 158)",contact,"['L', 'I']",20,"[191, 192]","['L', 'L']","[216, 217]","['L', 'L']","[189, 190]","['L', 'I']"
146 | 144,0.0140522194844,"(158, 159)",contact,"['I', 'V']",20,"[192, 193]","['L', 'V']","[217, 218]","['L', 'V']","[190, 191]","['I', 'V']"
147 | 145,0.0140522194844,"(158, 160)",contact,"['I', 'S']",20,"[192, 194]","['L', 'S']","[217, 219]","['L', 'S']","[190, 192]","['I', 'S']"
148 | 146,-0.0903677077125,"(50, 52)",contact,"['V', 'L']",18,"[99, 101]","['F', 'L']","[124, 126]","['V', 'L']","[97, 99]","['F', 'I']"
149 | 147,-0.0903677077125,"(50, 53)",contact,"['V', 'S']",18,"[99, 102]","['F', 'S']","[124, 127]","['V', 'S']","[97, 100]","['F', 'A']"
150 | 148,-0.0903677077125,"(50, 54)",contact,"['V', 'A']",18,"[99, 103]","['F', 'A']","[124, 128]","['V', 'A']","[97, 101]","['F', 'I']"
151 | 149,-0.0903677077125,"(53, 91)",contact,"['S', 'L']",18,"[102, 130]","['S', 'M']","[127, 155]","['S', 'L']","[100, 128]","['A', 'V']"
152 | 150,-0.0903677077125,"(54, 91)",contact,"['A', 'L']",18,"[103, 130]","['A', 'M']","[128, 155]","['A', 'L']","[101, 128]","['I', 'V']"
153 | 151,-0.0325495757327,50.0,seq,V,21,99,F,124,V,97,F
154 | 152,-0.0325495757327,62.0,seq,W,21,111,Y,136,W,109,F
155 | 153,-0.0325495757327,63.0,seq,H,21,112,Q,137,H,110,S
156 | 154,-0.0325495757327,65.0,seq,Y,21,114,W,139,Y,112,W
157 | 155,-0.0325495757327,68.0,seq,S,21,117,T,142,S,115,T
158 | 156,-0.0325495757327,69.0,seq,V,21,118,C,143,V,116,C
159 | 157,-0.0325495757327,88.0,seq,S,21,127,T,152,S,125,C
160 | 158,-0.0325495757327,91.0,seq,L,21,130,M,155,L,128,V
161 | 159,-0.0325495757327,99.0,seq,Y,21,138,F,163,Y,136,F
162 | 160,-0.0325495757327,100.0,seq,F,21,139,H,164,F,137,K
163 | 161,-0.0325495757327,105.0,seq,T,21,142,D,167,T,140,S
164 | 162,-0.0325495757327,"(46, 50)",contact,"['Q', 'V']",21,"[95, 99]","['Q', 'F']","[120, 124]","['Q', 'V']","[93, 97]","['Q', 'F']"
165 | 163,-0.0325495757327,"(47, 50)",contact,"['W', 'V']",21,"[96, 99]","['W', 'F']","[121, 124]","['W', 'V']","[94, 97]","['W', 'F']"
166 | 164,-0.0325495757327,"(50, 91)",contact,"['V', 'L']",21,"[99, 130]","['F', 'M']","[124, 155]","['V', 'L']","[97, 128]","['F', 'V']"
167 | 165,-0.0325495757327,"(50, 94)",contact,"['V', 'V']",21,"[99, 133]","['F', 'F']","[124, 158]","['V', 'V']","[97, 131]","['F', 'V']"
168 | 166,-0.0325495757327,"(50, 95)",contact,"['V', 'I']",21,"[99, 134]","['F', 'I']","[124, 159]","['V', 'I']","[97, 132]","['F', 'T']"
169 | 167,-0.0325495757327,"(57, 88)",contact,"['L', 'S']",21,"[106, 127]","['L', 'T']","[131, 152]","['L', 'S']","[104, 125]","['L', 'C']"
170 | 168,-0.0325495757327,"(57, 91)",contact,"['L', 'L']",21,"[106, 130]","['L', 'M']","[131, 155]","['L', 'L']","[104, 128]","['L', 'V']"
171 | 169,-0.0325495757327,"(69, 79)",contact,"['V', 'G']",21,"[118, 119]","['C', 'G']","[143, 144]","['V', 'G']","[116, 117]","['C', 'G']"
172 | 170,-0.0325495757327,"(69, 80)",contact,"['V', 'W']",21,"[118, 120]","['C', 'W']","[143, 145]","['V', 'W']","[116, 118]","['C', 'W']"
173 | 171,-0.0325495757327,"(69, 82)",contact,"['V', 'E']",21,"[118, 121]","['C', 'E']","[143, 146]","['V', 'E']","[116, 119]","['C', 'E']"
174 | 172,-0.0325495757327,"(69, 285)",contact,"['V', 'R']",21,"[118, 307]","['C', 'R']","[143, 332]","['V', 'R']","[116, 305]","['C', 'R']"
175 | 173,-0.0325495757327,"(84, 88)",contact,"['V', 'S']",21,"[123, 127]","['I', 'T']","[148, 152]","['V', 'S']","[121, 125]","['V', 'C']"
176 | 174,-0.0325495757327,"(85, 88)",contact,"['Y', 'S']",21,"[124, 127]","['Y', 'T']","[149, 152]","['Y', 'S']","[122, 125]","['Y', 'C']"
177 | 175,-0.0325495757327,"(86, 88)",contact,"['V', 'S']",21,"[125, 127]","['V', 'T']","[150, 152]","['V', 'S']","[123, 125]","['V', 'C']"
178 | 176,-0.0325495757327,"(87, 88)",contact,"['C', 'S']",21,"[126, 127]","['A', 'T']","[151, 152]","['C', 'S']","[124, 125]","['C', 'C']"
179 | 177,-0.0325495757327,"(87, 91)",contact,"['C', 'L']",21,"[126, 130]","['A', 'M']","[151, 155]","['C', 'L']","[124, 128]","['C', 'V']"
180 | 178,-0.0325495757327,"(88, 89)",contact,"['S', 'V']",21,"[127, 128]","['T', 'I']","[152, 153]","['S', 'V']","[125, 126]","['C', 'V']"
181 | 179,-0.0325495757327,"(88, 90)",contact,"['S', 'E']",21,"[127, 129]","['T', 'E']","[152, 154]","['S', 'E']","[125, 127]","['C', 'E']"
182 | 180,-0.0325495757327,"(88, 91)",contact,"['S', 'L']",21,"[127, 130]","['T', 'M']","[152, 155]","['S', 'L']","[125, 128]","['C', 'V']"
183 | 181,-0.0325495757327,"(88, 92)",contact,"['S', 'I']",21,"[127, 131]","['T', 'I']","[152, 156]","['S', 'I']","[125, 129]","['C', 'L']"
184 | 182,-0.0325495757327,"(89, 91)",contact,"['V', 'L']",21,"[128, 130]","['I', 'M']","[153, 155]","['V', 'L']","[126, 128]","['V', 'V']"
185 | 183,-0.0325495757327,"(89, 92)",contact,"['V', 'I']",21,"[128, 131]","['I', 'I']","[153, 156]","['V', 'I']","[126, 129]","['V', 'L']"
186 | 184,-0.0325495757327,"(89, 93)",contact,"['V', 'K']",21,"[128, 132]","['I', 'K']","[153, 157]","['V', 'K']","[126, 130]","['V', 'F']"
187 | 185,-0.0325495757327,"(90, 91)",contact,"['E', 'L']",21,"[129, 130]","['E', 'M']","[154, 155]","['E', 'L']","[127, 128]","['E', 'V']"
188 | 186,-0.0325495757327,"(91, 92)",contact,"['L', 'I']",21,"[130, 131]","['M', 'I']","[155, 156]","['L', 'I']","[128, 129]","['V', 'L']"
189 | 187,-0.0325495757327,"(91, 93)",contact,"['L', 'K']",21,"[130, 132]","['M', 'K']","[155, 157]","['L', 'K']","[128, 130]","['V', 'F']"
190 | 188,-0.0325495757327,"(91, 94)",contact,"['L', 'V']",21,"[130, 133]","['M', 'F']","[155, 158]","['L', 'V']","[128, 131]","['V', 'V']"
191 | 189,-0.0325495757327,"(91, 95)",contact,"['L', 'I']",21,"[130, 134]","['M', 'I']","[155, 159]","['L', 'I']","[128, 132]","['V', 'T']"
192 | 190,-0.0325495757327,"(92, 94)",contact,"['I', 'V']",21,"[131, 133]","['I', 'F']","[156, 158]","['I', 'V']","[129, 131]","['L', 'V']"
193 | 191,-0.0325495757327,"(92, 96)",contact,"['I', 'L']",21,"[131, 135]","['I', 'I']","[156, 160]","['I', 'L']","[129, 133]","['L', 'L']"
194 | 192,-0.0325495757327,"(93, 94)",contact,"['K', 'V']",21,"[132, 133]","['K', 'F']","[157, 158]","['K', 'V']","[130, 131]","['F', 'V']"
195 | 193,-0.0325495757327,"(93, 96)",contact,"['K', 'L']",21,"[132, 135]","['K', 'I']","[157, 160]","['K', 'L']","[130, 133]","['F', 'L']"
196 | 194,-0.0325495757327,"(94, 95)",contact,"['V', 'I']",21,"[133, 134]","['F', 'I']","[158, 159]","['V', 'I']","[131, 132]","['V', 'T']"
197 | 195,-0.0325495757327,"(95, 96)",contact,"['I', 'L']",21,"[134, 135]","['I', 'I']","[159, 160]","['I', 'L']","[132, 133]","['T', 'L']"
198 | 196,-0.0325495757327,"(95, 98)",contact,"['I', 'I']",21,"[134, 137]","['I', 'Y']","[159, 162]","['I', 'I']","[132, 135]","['T', 'I']"
199 | 197,-0.0325495757327,"(95, 99)",contact,"['I', 'Y']",21,"[134, 138]","['I', 'F']","[159, 163]","['I', 'Y']","[132, 136]","['T', 'F']"
200 | 198,-0.0325495757327,"(96, 99)",contact,"['L', 'Y']",21,"[135, 138]","['I', 'F']","[160, 163]","['L', 'Y']","[133, 136]","['L', 'F']"
201 | 199,-0.0325495757327,"(96, 100)",contact,"['L', 'F']",21,"[135, 139]","['I', 'H']","[160, 164]","['L', 'F']","[133, 137]","['L', 'K']"
202 | 200,-0.0325495757327,"(97, 99)",contact,"['E', 'Y']",21,"[136, 138]","['E', 'F']","[161, 163]","['E', 'Y']","[134, 136]","['E', 'F']"
203 | 201,-0.0325495757327,"(97, 100)",contact,"['E', 'F']",21,"[136, 139]","['E', 'H']","[161, 164]","['E', 'F']","[134, 137]","['E', 'K']"
204 | 202,-0.0325495757327,"(98, 99)",contact,"['I', 'Y']",21,"[137, 138]","['Y', 'F']","[162, 163]","['I', 'Y']","[135, 136]","['I', 'F']"
205 | 203,-0.0325495757327,"(98, 100)",contact,"['I', 'F']",21,"[137, 139]","['Y', 'H']","[162, 164]","['I', 'F']","[135, 137]","['I', 'K']"
206 | 204,-0.0325495757327,"(99, 100)",contact,"['Y', 'F']",21,"[138, 139]","['F', 'H']","[163, 164]","['Y', 'F']","[136, 137]","['F', 'K']"
207 | 205,-0.0325495757327,"(99, 101)",contact,"['Y', 'E']",21,"[138, 140]","['F', 'E']","[163, 165]","['Y', 'E']","[136, 138]","['F', 'E']"
208 | 206,-0.0325495757327,"(99, 105)",contact,"['Y', 'T']",21,"[138, 142]","['F', 'D']","[163, 167]","['Y', 'T']","[136, 140]","['F', 'S']"
209 | 207,-0.0325495757327,"(100, 101)",contact,"['F', 'E']",21,"[139, 140]","['H', 'E']","[164, 165]","['F', 'E']","[137, 138]","['K', 'E']"
210 | 208,-0.0325495757327,"(100, 102)",contact,"['F', 'F']",21,"[139, 141]","['H', 'F']","[164, 166]","['F', 'F']","[137, 139]","['K', 'F']"
211 | 209,-0.0325495757327,"(100, 105)",contact,"['F', 'T']",21,"[139, 142]","['H', 'D']","[164, 167]","['F', 'T']","[137, 140]","['K', 'S']"
212 | 210,-0.0325495757327,"(100, 108)",contact,"['F', 'A']",21,"[139, 145]","['H', 'A']","[164, 170]","['F', 'A']","[137, 143]","['K', 'A']"
213 | 211,-0.0325495757327,"(101, 105)",contact,"['E', 'T']",21,"[140, 142]","['E', 'D']","[165, 167]","['E', 'T']","[138, 140]","['E', 'S']"
214 | 212,-0.0325495757327,"(102, 105)",contact,"['F', 'T']",21,"[141, 142]","['F', 'D']","[166, 167]","['F', 'T']","[139, 140]","['F', 'S']"
215 | 213,-0.0325495757327,"(105, 107)",contact,"['T', 'P']",21,"[142, 144]","['D', 'P']","[167, 169]","['T', 'P']","[140, 142]","['S', 'P']"
216 | 214,-0.0325495757327,"(105, 108)",contact,"['T', 'A']",21,"[142, 145]","['D', 'A']","[167, 170]","['T', 'A']","[140, 143]","['S', 'A']"
217 | 215,-0.00632980766847,"(89, 132)",contact,"['V', 'T']",15,"[128, 166]","['I', 'T']","[153, 191]","['V', 'T']","[126, 164]","['V', 'S']"
218 | 216,-0.106774499838,"(112, 114)",contact,"['L', 'G']",7,"[149, 151]","['S', 'N']","[174, 176]","['L', 'G']","[147, 149]","['L', 'T']"
219 | 217,0.0423640139123,"(228, 287)",contact,"['V', 'K']",12,"[250, 309]","['V', 'L']","[275, 334]","['L', 'K']","[248, 307]","['V', 'K']"
220 | 218,0.0651927695805,"(206, 208)",contact,"['V', 'A']",13,"[236, 238]","['I', 'A']","[261, 263]","['V', 'S']","[234, 236]","['V', 'A']"
221 | 219,0.0651927695805,"(206, 217)",contact,"['V', 'H']",13,"[236, 240]","['I', 'H']","[261, 265]","['V', 'Y']","[234, 238]","['V', 'H']"
222 | 220,-0.010932852442,"(49, 50)",contact,"['A', 'F']",4,"[98, 99]","['T', 'F']","[123, 124]","['A', 'V']","[96, 97]","['A', 'F']"
223 | 221,0.0325495757327,50.0,seq,F,6,99,F,124,V,97,F
224 | 222,0.0325495757327,65.0,seq,W,6,114,W,139,Y,112,W
225 | 223,0.0325495757327,68.0,seq,T,6,117,T,142,S,115,T
226 | 224,0.0325495757327,69.0,seq,C,6,118,C,143,V,116,C
227 | 225,0.0325495757327,99.0,seq,F,6,138,F,163,Y,136,F
228 | 226,0.0325495757327,"(46, 50)",contact,"['Q', 'F']",6,"[95, 99]","['Q', 'F']","[120, 124]","['Q', 'V']","[93, 97]","['Q', 'F']"
229 | 227,0.0325495757327,"(47, 50)",contact,"['W', 'F']",6,"[96, 99]","['W', 'F']","[121, 124]","['W', 'V']","[94, 97]","['W', 'F']"
230 | 228,0.0325495757327,"(69, 79)",contact,"['C', 'G']",6,"[118, 119]","['C', 'G']","[143, 144]","['V', 'G']","[116, 117]","['C', 'G']"
231 | 229,0.0325495757327,"(69, 80)",contact,"['C', 'W']",6,"[118, 120]","['C', 'W']","[143, 145]","['V', 'W']","[116, 118]","['C', 'W']"
232 | 230,0.0325495757327,"(69, 82)",contact,"['C', 'E']",6,"[118, 121]","['C', 'E']","[143, 146]","['V', 'E']","[116, 119]","['C', 'E']"
233 | 231,0.0325495757327,"(69, 285)",contact,"['C', 'R']",6,"[118, 307]","['C', 'R']","[143, 332]","['V', 'R']","[116, 305]","['C', 'R']"
234 | 232,0.0325495757327,"(97, 99)",contact,"['E', 'F']",6,"[136, 138]","['E', 'F']","[161, 163]","['E', 'Y']","[134, 136]","['E', 'F']"
235 | 233,0.0325495757327,"(99, 101)",contact,"['F', 'E']",6,"[138, 140]","['F', 'E']","[163, 165]","['Y', 'E']","[136, 138]","['F', 'E']"
236 |
--------------------------------------------------------------------------------
/regression/outputs/matern_kernel_gen10_green_norm.csv:
--------------------------------------------------------------------------------
1 | ,prop,seq,block_k,chimera,y,mu,y_real,mu_real
2 | 0,1.0,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1222221222,ChR_20_10,1.02271618013,0.941918382512,1.0,0.931513986392
3 | 2,0.089599974,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000012000,ChR_4_10,-1.72475257563,-1.36366794838,0.089599974,0.123026952317
4 | 6,0.661021095,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111100,ChR_14_10,0.551248670309,-0.151029742925,0.661021095,0.356792345662
5 | 14,0.563438909,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1211001101,ChR_18_10,0.369336692916,0.398130050637,0.563438909,0.577865283452
6 | 18,0.041108704,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000002000,ChR_3_10,-2.61210511773,-2.00641530807,0.041108704,0.0699683203893
7 | 25,0.060136195,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001011002,ChR_6_10,-2.1788788232,-0.306443744649,0.060136195,0.311280213508
8 | 26,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2022222220,ChR_24_10,1.02271618013,0.976088888203,1.0,0.959885910951
9 | 29,0.602216508,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111102,ChR_15_10,0.445139346208,0.295835534249,0.602216508,0.528224622577
10 | 43,1.0,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1122221222,ChR_17_10,1.02271618013,0.781325681647,1.0,0.809003822973
11 | 47,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2122222220,ChR_27_10,1.02271618013,0.922198703383,1.0,0.915523878307
12 | 48,0.095908825,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSTGNHAYCL---RYFEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2002000100,ChR_22_10,-1.64725882791,-1.70056941884,0.095908825,0.091522873194
13 | 58,0.429018765,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFTFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1010001101,ChR_8_10,0.0589216655269,0.0925431737285,0.429018765,0.441872705821
14 | 62,0.03504343,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2000000100,ChR_21_10,-2.79390877129,-1.95258034556,0.03504343,0.0733551031453
15 | 65,0.096970698,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLKNDYSKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1002011002,ChR_7_10,-1.63471863447,-0.372566386263,0.096970698,0.293722335116
16 | 68,0.47181211,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1110001101,ChR_11_10,0.167207929544,0.0648253122398,0.47181211,0.431248428662
17 | 72,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2022221222,ChR_23_10,1.02271618013,0.939255490174,1.0,0.92933852205
18 | 85,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLTGLANDYNKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGH-CRMVVKLMAYAYFASWGSYPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1221122222,ChR_19_10,1.02271618013,1.04215023866,1.0,1.01721039238
19 | 90,0.153510737,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000001100,ChR_2_10,-1.11155428205,-1.24322365882,0.153510737,0.136750612502
20 | 92,0.517913171,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111100,ChR_10_10,0.273383361603,0.184983461237,0.517913171,0.479233783856
21 | 102,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2022222221,ChR_25_10,1.02271618013,0.991534410348,1.0,0.972992420924
22 | 105,0.102952425,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001002000,ChR_5_10,-1.56654654578,-1.7489922083,0.102952425,0.0877131253377
23 | 114,0.056003285,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000000100,ChR_1_10,-2.25996986632,-1.91137246332,0.056003285,0.0760578652421
24 | 116,0.402343781,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLKNDYSKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1112111100,ChR_16_10,-0.0141882375507,-0.217789697629,0.402343781,0.336478995455
25 | 118,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2122222221,ChR_28_10,1.02271618013,0.907852220524,1.0,0.904063492935
26 | 123,0.543114006,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1111001101,ChR_12_10,0.327494105173,0.0107699724399,0.543114006,0.411258233522
27 | 126,0.498502553,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111000,ChR_9_10,0.229878861244,0.179842908138,0.498502553,0.477075571754
28 | 130,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2122221222,ChR_26_10,1.02271618013,0.863813252773,1.0,0.869772265278
29 |
--------------------------------------------------------------------------------
/regression/outputs/matern_kernel_gen10_kinetics_off.csv:
--------------------------------------------------------------------------------
1 | ,prop,seq,block_k,chimera,y,mu,y_real,mu_real
2 | 0,29.8,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1222221222,ChR_20_10,-0.755347722203,-0.742174445599,29.8,30.4865002826
3 | 2,169.7333333,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000012000,ChR_4_10,0.250897421852,0.314322993012,169.7333333,189.404794518
4 | 6,607.3916667,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111100,ChR_14_10,0.988319154052,0.576867733954,607.3916667,298.21243049
5 | 14,389.5,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1211001101,ChR_18_10,0.731332570021,0.856499057671,389.5,483.604324564
6 | 18,9.616666667,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000002000,ChR_3_10,-1.40951844698,-0.941908310311,9.616666667,21.5841562125
7 | 25,8491.4,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001011002,ChR_6_10,2.51391376936,1.74859381358,8491.4,2261.15276527
8 | 26,61.75,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2022222220,ChR_24_10,-0.333937595146,-0.244556081469,61.75,72.0692681014
9 | 29,3544.2,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111102,ChR_15_10,2.00854625622,1.2881798356,3544.2,1020.05293881
10 | 43,28.66,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1122221222,ChR_17_10,-0.77790857387,-0.601035249998,28.66,38.9120023574
11 | 47,19.64,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2122222220,ChR_27_10,-0.996503711137,-0.207834506392,19.64,76.7932297794
12 | 48,14.11666667,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSTGNHAYCL---RYFEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2002000100,ChR_22_10,-1.18749672014,-0.62859173715,14.11666667,37.1015856156
13 | 58,217.2,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFTFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1010001101,ChR_8_10,0.39352380661,0.42969986086,217.2,231.218741072
14 | 62,11.1,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2000000100,ChR_21_10,-1.32654921146,-0.865548082173,11.1,24.6303759824
15 | 65,6723.0,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLKNDYSKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1002011002,ChR_7_10,2.37884735606,1.76019034112,6723.0,2306.94527442
16 | 68,310.94,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1110001101,ChR_11_10,0.601041098251,0.521387186806,310.94,270.936507276
17 | 72,56.31666667,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2022221222,ChR_23_10,-0.387209754147,-0.487852780348,56.31666667,47.3225298679
18 | 85,12.81,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLTGLANDYNKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGH-CRMVVKLMAYAYFASWGSYPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1221122222,ChR_19_10,-1.24367621877,-1.05464775675,12.81,17.7616517302
19 | 90,146.35,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000001100,ChR_2_10,0.165163366157,0.189255167907,146.35,152.574633873
20 | 92,228.7,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111100,ChR_10_10,0.423364648898,0.506905195001,228.7,264.236954877
21 | 102,33.16,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2022222221,ChR_25_10,-0.69355436972,-0.167098783557,33.16,82.3967012581
22 | 105,37.63333333,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001002000,ChR_5_10,-0.620360904267,-0.645813123384,37.63333333,36.0131899965
23 | 114,12.81,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000000100,ChR_1_10,-1.24367621877,-1.20036116763,12.81,13.8061548443
24 | 116,322.7666667,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLKNDYSKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1112111100,ChR_16_10,0.622632403696,0.574998449291,322.7666667,297.250208691
25 | 118,27.82857143,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2122222221,ChR_28_10,-0.794936043221,-0.152354285467,27.82857143,84.5241677159
26 | 123,179.26,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1111001101,ChR_12_10,0.28248280422,0.795286857215,179.26,435.039090984
27 | 126,940.5,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111000,ChR_9_10,1.24121524215,0.806265333155,940.5,443.375406037
28 | 130,18.9,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2122221222,ChR_26_10,-1.01871776291,-0.468718805493,18.9,48.914196253
29 |
--------------------------------------------------------------------------------
/regression/outputs/matern_kernel_gen10_max_peak.csv:
--------------------------------------------------------------------------------
1 | ,prop,seq,block_k,chimera,y,mu,y_real,mu_real
2 | 0,0.408825104,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1222221222,ChR_20_10,0.639787095292,0.444058543315,0.408825104,0.304164705421
3 | 2,0.712525792,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000012000,ChR_4_10,1.00747788655,1.08943456963,0.712525792,0.806449308107
4 | 6,2.106051626,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111100,ChR_14_10,1.72478812836,1.65014706839,2.106051626,1.8814502068
5 | 14,2.560177881,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1211001101,ChR_18_10,1.85402713135,1.5918989079,2.560177881,1.72295036936
6 | 18,0.722079305,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000002000,ChR_3_10,1.01629331501,0.792040809852,0.722079305,0.514564661896
7 | 25,0.461560568,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001011002,ChR_6_10,0.720089692223,0.792045333536,0.461560568,0.514568178774
8 | 26,0.328829052,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2022222220,ChR_24_10,0.495664091484,0.502572079309,0.328829052,0.332279009925
9 | 29,1.608608124,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111102,ChR_15_10,1.5464486702,1.25021262662,1.608608124,1.0281886028
10 | 39,0.025428131,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1210021101,ChR_29_10,-1.19852657072,0.224774494202,0.025428131,0.218385584035
11 | 43,0.495936951,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1122221222,ChR_17_10,0.767635858036,0.556701201507,0.495936951,0.360595358134
12 | 47,0.651422528,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2122222220,ChR_27_10,0.948135698315,0.646168922392,0.651422528,0.412786075646
13 | 48,0.399744553,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSTGNHAYCL---RYFEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2002000100,ChR_22_10,0.62492023374,0.486512601336,0.399744553,0.32431375037
14 | 58,2.398501892,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFTFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1010001101,ChR_8_10,1.81085139985,1.59043849564,2.398501892,1.71915291192
15 | 62,0.848262695,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2000000100,ChR_21_10,1.12289192958,0.897126412923,0.848262695,0.603104617465
16 | 65,0.314668112,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLKNDYSKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1002011002,ChR_7_10,0.466528634001,0.769472200629,0.314668112,0.497314815463
17 | 66,0.026046454,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1210001121,ChR_30_10,-1.18262464527,0.388215662507,0.026046454,0.279554893367
18 | 68,3.472616895,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1110001101,ChR_11_10,2.05578776486,1.6211742775,3.472616895,1.80086856743
19 | 69,0.768058679,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111002,ChR_13_10,1.05715165607,1.09203574513,0.768058679,0.809624895854
20 | 72,0.239768735,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2022221222,ChR_23_10,0.28660201521,0.463819433189,0.239768735,0.313382739147
21 | 85,0.550266779,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLTGLANDYNKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGH-CRMVVKLMAYAYFASWGSYPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1221122222,ChR_19_10,0.836440730862,0.373304299162,0.550266779,0.273327228192
22 | 90,2.118434326,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000001100,ChR_2_10,1.7286682827,1.27146856584,2.118434326,1.06174451804
23 | 92,1.995635986,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111100,ChR_10_10,1.6891447199,1.54053211299,1.995635986,1.59429272704
24 | 102,0.432389432,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2022222221,ChR_25_10,0.676878066064,0.549789259576,0.432389432,0.356849267807
25 | 105,1.415689748,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001002000,ChR_5_10,1.46189247888,0.784853465205,1.415689748,0.509007202901
26 | 114,0.460355225,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000000100,ChR_1_10,0.718358975785,0.216981459528,0.460355225,0.215829353167
27 | 116,2.626104167,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLKNDYSKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1112111100,ChR_16_10,1.87085511758,1.51395541011,2.626104167,1.53154430469
28 | 118,1.221857893,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2122222221,ChR_28_10,1.36443508774,0.702031245763,1.221857893,0.44913770535
29 | 123,3.323136548,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1111001101,ChR_12_10,2.02666565635,1.56239449688,3.323136548,1.64783312887
30 | 126,2.564170288,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111000,ChR_9_10,1.85505847296,1.54973774044,2.564170288,1.6166217053
31 | 130,0.332408124,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2122221222,ChR_26_10,0.502829215139,0.584138672734,0.332408124,0.375857682512
32 |
--------------------------------------------------------------------------------
/regression/outputs/matern_kinetics_off_0.03_LASSO.csv:
--------------------------------------------------------------------------------
1 | ,weights,feature,type,aa,feature_group,C1C2_features_adjust,C1C2_aa_adjust,CheRiff_features_adjust,CheRiff_aa_adjust,CsChrim_features_adjust,CsChrim_aa_adjust
2 | 0,-0.469988726749,"(144, 147)",contact,"['T', 'S']",14,"[178, 181]","['T', 'A']","[203, 206]","['T', 'S']","[176, 179]","['S', 'K']"
3 | 1,0.139320184843,188.0,seq,C,5,218,L,243,C,216,I
4 | 2,0.139320184843,189.0,seq,I,5,219,M,244,I,217,V
5 | 3,0.139320184843,192.0,seq,V,5,222,C,247,V,220,I
6 | 4,0.139320184843,"(174, 179)",contact,"['A', 'V']",5,"[208, 212]","['S', 'V']","[233, 237]","['A', 'V']","[206, 210]","['A', 'L']"
7 | 5,0.139320184843,"(174, 186)",contact,"['A', 'F']",5,"[208, 216]","['S', 'F']","[233, 241]","['A', 'F']","[206, 214]","['A', 'L']"
8 | 6,0.139320184843,"(176, 177)",contact,"['T', 'G']",5,"[209, 210]","['K', 'G']","[234, 235]","['T', 'G']","[207, 208]","['T', 'D']"
9 | 7,0.139320184843,"(176, 179)",contact,"['T', 'V']",5,"[209, 212]","['K', 'V']","[234, 237]","['T', 'V']","[207, 210]","['T', 'L']"
10 | 8,0.139320184843,"(177, 178)",contact,"['G', 'W']",5,"[210, 211]","['G', 'Y']","[235, 236]","['G', 'W']","[208, 209]","['D', 'W']"
11 | 9,0.139320184843,"(177, 180)",contact,"['G', 'K']",5,"[210, 213]","['G', 'R']","[235, 238]","['G', 'K']","[208, 211]","['D', 'K']"
12 | 10,0.139320184843,"(177, 184)",contact,"['G', 'W']",5,"[210, 214]","['G', 'V']","[235, 239]","['G', 'W']","[208, 212]","['D', 'W']"
13 | 11,0.139320184843,"(178, 179)",contact,"['W', 'V']",5,"[211, 212]","['Y', 'V']","[236, 237]","['W', 'V']","[209, 210]","['W', 'L']"
14 | 12,0.139320184843,"(179, 180)",contact,"['V', 'K']",5,"[212, 213]","['V', 'R']","[237, 238]","['V', 'K']","[210, 211]","['L', 'K']"
15 | 13,0.139320184843,"(179, 184)",contact,"['V', 'W']",5,"[212, 214]","['V', 'V']","[237, 239]","['V', 'W']","[210, 212]","['L', 'W']"
16 | 14,0.139320184843,"(179, 185)",contact,"['V', 'L']",5,"[212, 215]","['V', 'I']","[237, 240]","['V', 'L']","[210, 213]","['L', 'L']"
17 | 15,0.139320184843,"(180, 186)",contact,"['K', 'F']",5,"[213, 216]","['R', 'F']","[238, 241]","['K', 'F']","[211, 214]","['K', 'L']"
18 | 16,0.139320184843,"(184, 186)",contact,"['W', 'F']",5,"[214, 216]","['V', 'F']","[239, 241]","['W', 'F']","[212, 214]","['W', 'L']"
19 | 17,0.139320184843,"(184, 188)",contact,"['W', 'C']",5,"[214, 218]","['V', 'L']","[239, 243]","['W', 'C']","[212, 216]","['W', 'I']"
20 | 18,0.139320184843,"(185, 186)",contact,"['L', 'F']",5,"[215, 216]","['I', 'F']","[240, 241]","['L', 'F']","[213, 214]","['L', 'L']"
21 | 19,0.139320184843,"(185, 188)",contact,"['L', 'C']",5,"[215, 218]","['I', 'L']","[240, 243]","['L', 'C']","[213, 216]","['L', 'I']"
22 | 20,0.139320184843,"(185, 189)",contact,"['L', 'I']",5,"[215, 219]","['I', 'M']","[240, 244]","['L', 'I']","[213, 217]","['L', 'V']"
23 | 21,0.139320184843,"(186, 187)",contact,"['F', 'Y']",5,"[216, 217]","['F', 'F']","[241, 242]","['F', 'Y']","[214, 215]","['L', 'Y']"
24 | 22,0.139320184843,"(186, 188)",contact,"['F', 'C']",5,"[216, 218]","['F', 'L']","[241, 243]","['F', 'C']","[214, 216]","['L', 'I']"
25 | 23,0.139320184843,"(186, 189)",contact,"['F', 'I']",5,"[216, 219]","['F', 'M']","[241, 244]","['F', 'I']","[214, 217]","['L', 'V']"
26 | 24,0.139320184843,"(187, 188)",contact,"['Y', 'C']",5,"[217, 218]","['F', 'L']","[242, 243]","['Y', 'C']","[215, 216]","['Y', 'I']"
27 | 25,0.139320184843,"(187, 189)",contact,"['Y', 'I']",5,"[217, 219]","['F', 'M']","[242, 244]","['Y', 'I']","[215, 217]","['Y', 'V']"
28 | 26,0.139320184843,"(187, 190)",contact,"['Y', 'G']",5,"[217, 220]","['F', 'G']","[242, 245]","['Y', 'G']","[215, 218]","['Y', 'S']"
29 | 27,0.139320184843,"(187, 191)",contact,"['Y', 'L']",5,"[217, 221]","['F', 'L']","[242, 246]","['Y', 'L']","[215, 219]","['Y', 'C']"
30 | 28,0.139320184843,"(188, 189)",contact,"['C', 'I']",5,"[218, 219]","['L', 'M']","[243, 244]","['C', 'I']","[216, 217]","['I', 'V']"
31 | 29,0.139320184843,"(188, 190)",contact,"['C', 'G']",5,"[218, 220]","['L', 'G']","[243, 245]","['C', 'G']","[216, 218]","['I', 'S']"
32 | 30,0.139320184843,"(188, 191)",contact,"['C', 'L']",5,"[218, 221]","['L', 'L']","[243, 246]","['C', 'L']","[216, 219]","['I', 'C']"
33 | 31,0.139320184843,"(188, 192)",contact,"['C', 'V']",5,"[218, 222]","['L', 'C']","[243, 247]","['C', 'V']","[216, 220]","['I', 'I']"
34 | 32,0.139320184843,"(189, 190)",contact,"['I', 'G']",5,"[219, 220]","['M', 'G']","[244, 245]","['I', 'G']","[217, 218]","['V', 'S']"
35 | 33,0.139320184843,"(189, 191)",contact,"['I', 'L']",5,"[219, 221]","['M', 'L']","[244, 246]","['I', 'L']","[217, 219]","['V', 'C']"
36 | 34,0.139320184843,"(189, 192)",contact,"['I', 'V']",5,"[219, 222]","['M', 'C']","[244, 247]","['I', 'V']","[217, 220]","['V', 'I']"
37 | 35,0.139320184843,"(189, 193)",contact,"['I', 'Y']",5,"[219, 223]","['M', 'Y']","[244, 248]","['I', 'Y']","[217, 221]","['V', 'Y']"
38 | 36,0.139320184843,"(190, 192)",contact,"['G', 'V']",5,"[220, 222]","['G', 'C']","[245, 247]","['G', 'V']","[218, 220]","['S', 'I']"
39 | 37,0.139320184843,"(191, 192)",contact,"['L', 'V']",5,"[221, 222]","['L', 'C']","[246, 247]","['L', 'V']","[219, 220]","['C', 'I']"
40 | 38,0.139320184843,"(192, 193)",contact,"['V', 'Y']",5,"[222, 223]","['C', 'Y']","[247, 248]","['V', 'Y']","[220, 221]","['I', 'Y']"
41 | 39,0.139320184843,"(192, 194)",contact,"['V', 'G']",5,"[222, 224]","['C', 'G']","[247, 249]","['V', 'G']","[220, 222]","['I', 'G']"
42 | 40,-1.48382127775,"(161, 197)",contact,"['D', 'T']",13,"[195, 227]","['D', 'T']","[220, 252]","['D', 'T']","[193, 225]","['C', 'M']"
43 | 41,-1.48382127775,"(164, 197)",contact,"['T', 'T']",13,"[198, 227]","['T', 'T']","[223, 252]","['T', 'T']","[196, 225]","['M', 'M']"
44 | 42,0.142100116668,"(191, 195)",contact,"['L', 'G']",9,"[221, 225]","['L', 'I']","[246, 250]","['L', 'T']","[219, 223]","['C', 'G']"
45 | 43,0.0592470722772,"(164, 190)",contact,"['M', 'G']",6,"[198, 220]","['T', 'G']","[223, 245]","['T', 'G']","[196, 218]","['M', 'S']"
46 | 44,0.0592470722772,"(167, 186)",contact,"['F', 'F']",6,"[201, 216]","['W', 'F']","[226, 241]","['M', 'F']","[199, 214]","['F', 'L']"
47 | 45,0.0592470722772,"(167, 190)",contact,"['F', 'G']",6,"[201, 220]","['W', 'G']","[226, 245]","['M', 'G']","[199, 218]","['F', 'S']"
48 | 46,0.0592470722772,"(170, 186)",contact,"['A', 'F']",6,"[204, 216]","['T', 'F']","[229, 241]","['T', 'F']","[202, 214]","['A', 'L']"
49 | 47,-0.248666834914,"(167, 187)",contact,"['W', 'F']",12,"[201, 217]","['W', 'F']","[226, 242]","['M', 'Y']","[199, 215]","['F', 'Y']"
50 | 48,-0.248666834914,"(167, 189)",contact,"['W', 'M']",12,"[201, 219]","['W', 'M']","[226, 244]","['M', 'I']","[199, 217]","['F', 'V']"
51 | 49,-0.225257299268,"(53, 91)",contact,"['A', 'V']",2,"[102, 130]","['S', 'M']","[127, 155]","['S', 'L']","[100, 128]","['A', 'V']"
52 | 50,-0.225257299268,"(54, 91)",contact,"['I', 'V']",2,"[103, 130]","['A', 'M']","[128, 155]","['A', 'L']","[101, 128]","['I', 'V']"
53 | 51,0.0326238176403,"(235, 272)",contact,"['L', 'L']",10,"[257, 294]","['L', 'M']","[282, 319]","['V', 'L']","[255, 292]","['A', 'I']"
54 | 52,0.0326238176403,"(235, 276)",contact,"['L', 'I']",10,"[257, 298]","['L', 'C']","[282, 323]","['V', 'I']","[255, 296]","['A', 'F']"
55 | 53,0.123927605457,"(202, 237)",contact,"['G', 'F']",8,"[232, 259]","['A', 'F']","[257, 284]","['G', 'Y']","[230, 257]","['A', 'F']"
56 | 54,0.22516450092,"(238, 242)",contact,"['V', 'M']",7,"[260, 264]","['V', 'M']","[285, 289]","['S', 'M']","[258, 262]","['A', 'S']"
57 | 55,-0.56035946208,"(238, 242)",contact,"['S', 'M']",1,"[260, 264]","['V', 'M']","[285, 289]","['S', 'M']","[258, 262]","['A', 'S']"
58 | 56,-0.467384269913,"(105, 109)",contact,"['S', 'T']",4,"[142, 146]","['D', 'V']","[167, 171]","['T', 'M']","[140, 144]","['S', 'T']"
59 | 57,-0.387740792677,"(105, 106)",contact,"['S', 'S']",11,"[142, 143]","['D', 'E']","[167, 168]","['T', 'S']","[140, 141]","['S', 'S']"
60 | 58,-0.357155396295,"(89, 132)",contact,"['V', 'S']",3,"[128, 166]","['I', 'T']","[153, 191]","['V', 'T']","[126, 164]","['V', 'S']"
61 | 59,0.350950624649,"(204, 208)",contact,"['I', 'A']",0,"[234, 238]","['V', 'A']","[259, 263]","['I', 'S']","[232, 236]","['C', 'A']"
62 |
--------------------------------------------------------------------------------
/regression/outputs/matern_max_peak_0.05_LASSO.csv:
--------------------------------------------------------------------------------
1 | ,weights,feature,type,aa,feature_group,C1C2_features_adjust,C1C2_aa_adjust,CheRiff_features_adjust,CheRiff_aa_adjust,CsChrim_features_adjust,CsChrim_aa_adjust
2 | 0,0.136551668956,"(52, 271)",contact,"['L', 'L']",7,"[101, 293]","['L', 'L']","[126, 318]","['L', 'L']","[99, 291]","['I', 'I']"
3 | 1,0.136551668956,"(52, 275)",contact,"['L', 'N']",7,"[101, 297]","['L', 'N']","[126, 322]","['L', 'N']","[99, 295]","['I', 'E']"
4 | 2,0.136551668956,"(53, 275)",contact,"['S', 'N']",7,"[102, 297]","['S', 'N']","[127, 322]","['S', 'N']","[100, 295]","['A', 'E']"
5 | 3,0.487954323853,"(105, 109)",contact,"['S', 'T']",11,"[142, 146]","['D', 'V']","[167, 171]","['T', 'M']","[140, 144]","['S', 'T']"
6 | 4,-0.0659944556613,114.0,seq,N,6,151,N,176,G,149,T
7 | 5,-0.0659944556613,"(114, 115)",contact,"['N', 'G']",6,"[151, 152]","['N', 'G']","[176, 177]","['G', 'G']","[149, 150]","['T', 'G']"
8 | 6,-0.0659944556613,"(114, 116)",contact,"['N', 'N']",6,"[151, 153]","['N', 'N']","[176, 178]","['G', 'N']","[149, 151]","['T', 'N']"
9 | 7,-0.0659944556613,"(114, 173)",contact,"['N', 'L']",6,"[151, 207]","['N', 'L']","[176, 232]","['G', 'L']","[149, 205]","['T', 'L']"
10 | 8,-0.0213551669437,"(139, 278)",contact,"['H', 'G']",4,"[173, 300]","['H', 'G']","[198, 325]","['H', 'G']","[171, 298]","['R', 'T']"
11 | 9,-0.0213551669437,"(139, 281)",contact,"['H', 'G']",4,"[173, 303]","['H', 'G']","[198, 328]","['H', 'G']","[171, 301]","['R', 'A']"
12 | 10,-0.199600712935,"(118, 169)",contact,"['T', 'M']",10,"[155, 203]","['T', 'T']","[180, 228]","['T', 'V']","[153, 201]","['A', 'M']"
13 | 11,-0.199600712935,"(118, 172)",contact,"['T', 'G']",10,"[155, 206]","['T', 'A']","[180, 231]","['T', 'A']","[153, 204]","['A', 'G']"
14 | 12,0.326253727293,"(206, 208)",contact,"['V', 'A']",0,"[236, 238]","['I', 'A']","[261, 263]","['V', 'S']","[234, 236]","['V', 'A']"
15 | 13,0.326253727293,"(206, 217)",contact,"['V', 'H']",0,"[236, 240]","['I', 'H']","[261, 265]","['V', 'Y']","[234, 238]","['V', 'H']"
16 | 14,-0.191853161447,"(235, 272)",contact,"['L', 'I']",2,"[257, 294]","['L', 'M']","[282, 319]","['V', 'L']","[255, 292]","['A', 'I']"
17 | 15,-0.191853161447,"(235, 276)",contact,"['L', 'F']",2,"[257, 298]","['L', 'C']","[282, 323]","['V', 'I']","[255, 296]","['A', 'F']"
18 | 16,0.112963236909,"(235, 272)",contact,"['L', 'L']",1,"[257, 294]","['L', 'M']","[282, 319]","['V', 'L']","[255, 292]","['A', 'I']"
19 | 17,0.112963236909,"(235, 276)",contact,"['L', 'I']",1,"[257, 298]","['L', 'C']","[282, 323]","['V', 'I']","[255, 296]","['A', 'F']"
20 | 18,-0.756058535511,"(242, 269)",contact,"['M', 'C']",9,"[264, 291]","['M', 'I']","[289, 316]","['M', 'A']","[262, 289]","['S', 'C']"
21 | 19,-0.756058535511,"(243, 269)",contact,"['F', 'C']",9,"[265, 291]","['F', 'I']","[290, 316]","['F', 'A']","[263, 289]","['Y', 'C']"
22 | 20,-0.756058535511,"(243, 273)",contact,"['F', 'A']",9,"[265, 295]","['F', 'S']","[290, 320]","['F', 'S']","[263, 293]","['Y', 'A']"
23 | 21,0.245774599494,"(170, 174)",contact,"['T', 'A']",3,"[204, 208]","['T', 'S']","[229, 233]","['T', 'A']","[202, 206]","['A', 'A']"
24 | 22,0.245774599494,"(172, 174)",contact,"['A', 'A']",3,"[206, 208]","['A', 'S']","[231, 233]","['A', 'A']","[204, 206]","['G', 'A']"
25 | 23,0.245774599494,"(172, 180)",contact,"['A', 'K']",3,"[206, 213]","['A', 'R']","[231, 238]","['A', 'K']","[204, 211]","['G', 'K']"
26 | 24,-0.349941733312,"(41, 264)",contact,"['G', 'I']",12,"[90, 286]","['A', 'V']","[115, 311]","['T', 'I']","[88, 284]","['G', 'I']"
27 | 25,-0.349941733312,"(45, 264)",contact,"['C', 'I']",12,"[94, 286]","['L', 'V']","[119, 311]","['F', 'I']","[92, 284]","['C', 'I']"
28 | 26,0.155622833297,"(156, 158)",contact,"['A', 'L']",5,"[190, 192]","['G', 'L']","[215, 217]","['A', 'L']","[188, 190]","['G', 'I']"
29 | 27,0.425036524061,"(158, 161)",contact,"['L', 'D']",8,"[192, 195]","['L', 'D']","[217, 220]","['L', 'D']","[190, 193]","['I', 'C']"
30 | 28,-0.37731282473,"(172, 247)",contact,"['G', 'F']",13,"[206, 269]","['A', 'F']","[231, 294]","['A', 'F']","[204, 267]","['G', 'W']"
31 |
--------------------------------------------------------------------------------
/regression/outputs/max_peak_matern_kernel.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/max_peak_matern_kernel.pdf
--------------------------------------------------------------------------------
/regression/outputs/max_peak_matern_kernel_CV_fig1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/max_peak_matern_kernel_CV_fig1.pdf
--------------------------------------------------------------------------------
/regression/outputs/max_peak_matern_kernel_LASSO_CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/max_peak_matern_kernel_LASSO_CV.pdf
--------------------------------------------------------------------------------