├── .gitattributes ├── README.md ├── classification ├── code │ ├── .ipynb_checkpoints │ │ └── .gitignore │ └── train_and_predict.ipynb ├── inputs │ ├── 2GFP_above_parent123457.pkl │ ├── GFP_data.pkl │ ├── X_and_terms.pkl │ ├── all_lit_chimeras_gaps.txt │ ├── gfp_props.pkl │ ├── lit_alignment_and_contacts.pkl │ └── props.pkl └── outputs │ ├── 2GFP_above_parent.pkl │ ├── 2GFP_above_parent.pkl.txt │ ├── matern52_bin0.1_max_peak_False.pkl │ └── matern52_bin0.1_max_peak_False.txt └── regression ├── .ipynb_checkpoints └── .gitignore ├── GP_matern_5_2_kernel.ipynb ├── GP_matern_5_2_kernel_LASSO.ipynb ├── GP_tools.py ├── __pycache__ └── .gitignore ├── chimera_tools.py ├── encoding_tools.py ├── inputs ├── Ephys_data_formatted.csv ├── alignment_and_contacts_C1C2.pkl ├── lit_alignment_and_contacts_pro2.pkl ├── shmetis_c_10_21_0 │ └── chimeras.output └── shmetis_n_10_21_0 │ └── chimeras.output ├── lasso_tools.py └── outputs ├── green_norm_matern_kernel.pdf ├── green_norm_matern_kernel_CV_fig1.pdf ├── green_norm_matern_kernel_LASSO_CV.pdf ├── kinetics_off_matern_kernel.pdf ├── kinetics_off_matern_kernel_CV_fig1.pdf ├── kinetics_off_matern_kernel_LASSO_CV.pdf ├── matern_green_norm_0.025_LASSO.csv ├── matern_kernel_gen10_green_norm.csv ├── matern_kernel_gen10_kinetics_off.csv ├── matern_kernel_gen10_max_peak.csv ├── matern_kinetics_off_0.03_LASSO.csv ├── matern_max_peak_0.05_LASSO.csv ├── max_peak_matern_kernel.pdf ├── max_peak_matern_kernel_CV_fig1.pdf └── max_peak_matern_kernel_LASSO_CV.pdf /.gitattributes: -------------------------------------------------------------------------------- 1 | *pkl filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # channels 2 | ## Code to reproduce the paper *Machine learning-guided channelrhodopsin engineering enables minimally-invasive optogenetics*. Gaussian process models for optimizing channelrhodopsin properties. 3 | 4 | ### Computing Environment: 5 | 6 | This was originally developed using Anaconda Python 3.6 and the following packages and versions: 7 | 8 | 1. numpy 1.13.3 9 | 2. pandas 0.20.3 10 | 3. scipy 0.19.1 11 | 4. sklearn 0.19.0 12 | 5. gpmodel (https://github.com/yangkky/gpmodel) 13 | 14 | ### File structure 15 | 16 | The repository is divided into two self-contained directories containing all the code and inputs for the regression and classification models, respectively. For regression, the GP code is here. For classification, the GP code is in the gpmodel repository (https://github.com/yangkky/gpmodel) -------------------------------------------------------------------------------- /classification/code/.ipynb_checkpoints/.gitignore: -------------------------------------------------------------------------------- 1 | *checkpoint.ipynb 2 | -------------------------------------------------------------------------------- /classification/code/train_and_predict.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pickle\n", 10 | "import os\n", 11 | "\n", 12 | "import pandas as pd\n", 13 | "import numpy as np\n", 14 | "from scipy import stats\n", 15 | "from sklearn import metrics\n", 16 | "from sklearn import model_selection\n", 17 | "from gpmodel import gpmodel, gpkernel, chimera_tools" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/html": [ 28 | "
\n", 29 | "\n", 42 | "\n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | "
block_kcyan_normcyan_peakcyan_ssgenerationgreen_normgreen_peakgreen_sskinetics_offm...log_cyan_sslog_green_normlog_green_peaklog_green_sslog_max_peaklog_max_sslog_red_peaklog_red_sslog_kinetics_offbin0.1_max_peak
0n0221012201NaN0.0144520.0031704NaN0.0112250.002416NaN59.0...-5.754181NaN-4.489609-6.025555-4.236934-5.754181-4.513885-6.321817NaN-1
1n00100000001.00.2608710.17361810.1840760.0480200.03631011.52519.0...-1.750895-1.692407-3.036137-3.315654-1.343730-1.750895-4.623666-7.5914072.4445191
2n00010000001.01.0346040.92589310.3594320.3718700.35322970.5503.0...-0.076997-1.023229-0.989210-1.0406390.034019-0.076997-4.643232-6.4340304.2563221
3c11120011011.01.2343270.92058590.3732560.4607200.441420269.12544.0...-0.082746-0.985491-0.774965-0.8177590.210526-0.082746-4.235193-5.5911615.5951761
4c2202121120NaN0.0093880.0000004NaN0.0086620.000000NaN62.0...NaNNaN-4.748757NaN-4.563973-8.729060-4.563973-8.729060NaN-1
\n", 192 | "

5 rows × 32 columns

\n", 193 | "
" 194 | ], 195 | "text/plain": [ 196 | " block_k cyan_norm cyan_peak cyan_ss generation green_norm \\\n", 197 | "0 n0221012201 NaN 0.014452 0.003170 4 NaN \n", 198 | "1 n0010000000 1.0 0.260871 0.173618 1 0.184076 \n", 199 | "2 n0001000000 1.0 1.034604 0.925893 1 0.359432 \n", 200 | "3 c1112001101 1.0 1.234327 0.920585 9 0.373256 \n", 201 | "4 c2202121120 NaN 0.009388 0.000000 4 NaN \n", 202 | "\n", 203 | " green_peak green_ss kinetics_off m ... log_cyan_ss \\\n", 204 | "0 0.011225 0.002416 NaN 59.0 ... -5.754181 \n", 205 | "1 0.048020 0.036310 11.525 19.0 ... -1.750895 \n", 206 | "2 0.371870 0.353229 70.550 3.0 ... -0.076997 \n", 207 | "3 0.460720 0.441420 269.125 44.0 ... -0.082746 \n", 208 | "4 0.008662 0.000000 NaN 62.0 ... NaN \n", 209 | "\n", 210 | " log_green_norm log_green_peak log_green_ss log_max_peak log_max_ss \\\n", 211 | "0 NaN -4.489609 -6.025555 -4.236934 -5.754181 \n", 212 | "1 -1.692407 -3.036137 -3.315654 -1.343730 -1.750895 \n", 213 | "2 -1.023229 -0.989210 -1.040639 0.034019 -0.076997 \n", 214 | "3 -0.985491 -0.774965 -0.817759 0.210526 -0.082746 \n", 215 | "4 NaN -4.748757 NaN -4.563973 -8.729060 \n", 216 | "\n", 217 | " log_red_peak log_red_ss log_kinetics_off bin0.1_max_peak \n", 218 | "0 -4.513885 -6.321817 NaN -1 \n", 219 | "1 -4.623666 -7.591407 2.444519 1 \n", 220 | "2 -4.643232 -6.434030 4.256322 1 \n", 221 | "3 -4.235193 -5.591161 5.595176 1 \n", 222 | "4 -4.563973 -8.729060 NaN -1 \n", 223 | "\n", 224 | "[5 rows x 32 columns]" 225 | ] 226 | }, 227 | "execution_count": 2, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "with open('../inputs/props.pkl', 'rb') as f:\n", 234 | " df = pickle.load(f)\n", 235 | "df.head()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 3, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "def select_X_and_Y(df, all_X, y_column):\n", 245 | " not_dropped = ~pd.isnull(df[y_column])\n", 246 | " not_dropped = pd.Series(not_dropped, index=df.index)\n", 247 | " Ys = df[not_dropped][y_column]\n", 248 | " gens = df[not_dropped]['generation']\n", 249 | " Ys.index = df[not_dropped]['name']\n", 250 | " Xs = all_X.loc[Ys.index]\n", 251 | " return Xs, Ys, gens" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 4, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "tasks = ['bin0.1_max_peak', '']\n", 261 | "lits = [False]\n", 262 | "mtypes = [gpmodel.GPClassifier]" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 5, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "with open('../inputs/X_and_terms.pkl', 'rb') as f:\n", 272 | " X_all, terms = pickle.load(f)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 6, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "../outputs/matern52_bin0.1_max_peak_False.pkl\n", 285 | "[ 20.88092844] [ 76.59353427]\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "def train_and_save(df, task, fname, mtype, guesses=None):\n", 291 | " X, y, _ = select_X_and_Y(df, X_all, task)\n", 292 | " X = X.values\n", 293 | " y = y.values \n", 294 | " k = gpkernel.MaternKernel('5/2')\n", 295 | " clf = mtype(k, guesses=guesses)\n", 296 | " clf.fit(X, y)\n", 297 | " clf.dump(fname)\n", 298 | " return clf\n", 299 | "\n", 300 | "for task, lit, mtype in zip(tasks, lits, mtypes):\n", 301 | " fname = '../outputs/matern52_' + task + '_' + str(lit) + '.pkl'\n", 302 | " if lit:\n", 303 | " clf = train_and_save(df, task, fname, mtype)\n", 304 | " else:\n", 305 | " clf = train_and_save(df[df['generation'] != 8], task, fname, mtype)\n", 306 | " print(fname)\n", 307 | " print(clf.hypers, clf.ML)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 7, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "['matern52_bin0.1_max_peak_False']" 319 | ] 320 | }, 321 | "execution_count": 7, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "cls_dict = {True:gpmodel.GPClassifier.load, False: gpmodel.GPRegressor.load}\n", 328 | "clfs = [cls_dict['bin' in path]('../outputs/' + path) for path in os.listdir('../outputs/') if path != '.DS_Store']\n", 329 | "fnames = ['.'.join(path.split('.')[:-1]) for path in os.listdir('../outputs/') if path != '.DS_Store']\n", 330 | "fnames" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 8, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "for fname in fnames:\n", 340 | " with open('../outputs/' + fname + '.txt', 'w') as f:\n", 341 | " if 'bin' in fname:\n", 342 | " f.write('name,p,mu,var\\n')\n", 343 | " else:\n", 344 | " f.write('name,mu,var\\n')" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 9, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "0\n", 357 | "100\n", 358 | "200\n", 359 | "300\n", 360 | "400\n", 361 | "500\n", 362 | "600\n", 363 | "700\n", 364 | "800\n", 365 | "900\n", 366 | "1000\n", 367 | "CPU times: user 7h 41min 48s, sys: 15min 36s, total: 7h 57min 24s\n", 368 | "Wall time: 8h 6min\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "%%time\n", 374 | "df = pd.read_csv('../inputs/all_lit_chimeras_gaps.txt', index_col=0)\n", 375 | "with open('../inputs/lit_alignment_and_contacts.pkl', 'rb') as f:\n", 376 | " ss, contacts = pickle.load(f)\n", 377 | "amino_acids = ('G', 'A', 'L', 'M', 'F', 'W', 'K', 'Q', 'E', 'S',\n", 378 | " 'P', 'V', 'I', 'C', 'Y', 'H', 'R', 'N', 'D', 'T', '-')\n", 379 | "sample_space = [amino_acids for _ in ss]\n", 380 | "n_splits = 1000\n", 381 | "n_per = len(df.index) // n_splits\n", 382 | "inds = [df.index[n * n_per: (n+1) * n_per]\n", 383 | " for n in range(n_splits)]\n", 384 | "inds.append(df.index[n_splits * n_per::])\n", 385 | "seq_terms = chimera_tools.make_sequence_terms(sample_space)\n", 386 | "struct_terms = chimera_tools.contacting_terms(sample_space, contacts)\n", 387 | "all_terms = seq_terms + struct_terms\n", 388 | "\n", 389 | "for i, ind in enumerate(inds):\n", 390 | " seqs = df.loc[ind]['sequence'].values\n", 391 | " if len(seqs) == 0:\n", 392 | " continue\n", 393 | " if i % (n_splits // 10) == 0:\n", 394 | " print(i)\n", 395 | " struct_X, _ = chimera_tools.make_contact_X(seqs, sample_space, contacts, contact_terms=struct_terms)\n", 396 | " seq_X, _ = chimera_tools.make_sequence_X(seqs,\n", 397 | " sample_space=sample_space,\n", 398 | " sequence_terms=seq_terms)\n", 399 | " all_X = np.concatenate([seq_X, struct_X], axis=1)\n", 400 | " for clf, fname in zip(clfs, fnames):\n", 401 | " preds = pd.DataFrame(index=df.loc[ind]['name'].values)\n", 402 | " if 'bin' in fname:\n", 403 | " pi, mu, var = clf.predict(all_X)\n", 404 | " preds['pi'] = pi\n", 405 | " else:\n", 406 | " mu, var = clf.predict(all_X)\n", 407 | " var = np.diag(var)\n", 408 | " preds['mu'] = mu\n", 409 | " preds['var'] = var\n", 410 | " with open('../outputs/' + fname + '.txt', 'a') as f:\n", 411 | " preds.to_csv(f, header=False)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 15, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "../outputs/2GFP_above_parent.pkl\n", 424 | "[ 0.36061643 0.04737465] [ 117.78582814]\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "with open('../inputs/GFP_data.pkl', 'rb') as f:\n", 430 | " X, y = pickle.load(f)\n", 431 | "\n", 432 | "def train_and_save(X, y, fname, mtype):\n", 433 | " k = gpkernel.PolynomialKernel(2)\n", 434 | " clf = mtype(k, guesses=None)\n", 435 | " clf.fit(X, y)\n", 436 | " clf.dump(fname)\n", 437 | " return clf\n", 438 | "\n", 439 | "task = 'GFP_above_parent'\n", 440 | "mtype = gpmodel.GPClassifier\n", 441 | "fname = '../outputs/2' + task + '.pkl'\n", 442 | "\n", 443 | "clf = train_and_save(X, y, fname, mtype)\n", 444 | "print(fname)\n", 445 | "print(clf.hypers, clf.ML)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 21, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "0\n", 458 | "100\n", 459 | "200\n", 460 | "300\n", 461 | "400\n", 462 | "500\n", 463 | "600\n", 464 | "700\n", 465 | "800\n", 466 | "900\n", 467 | "1000\n", 468 | "CPU times: user 1h, sys: 16min 42s, total: 1h 16min 42s\n", 469 | "Wall time: 1h 7min 57s\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "%%time\n", 475 | "with open('../outputs/' + fname + '.txt', 'w') as f:\n", 476 | " f.write('name,p,mu,var\\n')\n", 477 | " \n", 478 | "df = pd.read_csv('../inputs/all_lit_chimeras_gaps.txt', index_col=0)\n", 479 | "with open('../inputs/lit_alignment_and_contacts.pkl', 'rb') as f:\n", 480 | " ss, contacts = pickle.load(f)\n", 481 | "amino_acids = ('G', 'A', 'L', 'M', 'F', 'W', 'K', 'Q', 'E', 'S',\n", 482 | " 'P', 'V', 'I', 'C', 'Y', 'H', 'R', 'N', 'D', 'T', '-')\n", 483 | "sample_space = [amino_acids for _ in ss]\n", 484 | "n_splits = 1000\n", 485 | "n_per = len(df.index) // n_splits\n", 486 | "inds = [df.index[n * n_per: (n+1) * n_per]\n", 487 | " for n in range(n_splits)]\n", 488 | "inds.append(df.index[n_splits * n_per::])\n", 489 | "seq_terms = chimera_tools.make_sequence_terms(sample_space)\n", 490 | "struct_terms = chimera_tools.contacting_terms(sample_space, contacts)\n", 491 | "all_terms = seq_terms + struct_terms\n", 492 | "\n", 493 | "\n", 494 | "for i, ind in enumerate(inds):\n", 495 | " seqs = df.loc[ind]['sequence'].values\n", 496 | " if len(seqs) == 0:\n", 497 | " continue\n", 498 | " if i % (n_splits // 10) == 0:\n", 499 | " print(i)\n", 500 | " struct_X, _ = chimera_tools.make_contact_X(seqs, sample_space, contacts, contact_terms=struct_terms)\n", 501 | " seq_X, _ = chimera_tools.make_sequence_X(seqs,\n", 502 | " sample_space=sample_space,\n", 503 | " sequence_terms=seq_terms)\n", 504 | " all_X = np.concatenate([seq_X, struct_X], axis=1)\n", 505 | " \n", 506 | " \n", 507 | " preds = pd.DataFrame(index=df.loc[ind]['name'].values)\n", 508 | " pi, mu, var = clf.predict(all_X)\n", 509 | " preds['pi'] = pi\n", 510 | " var = np.diag(var)\n", 511 | " preds['mu'] = mu\n", 512 | " preds['var'] = var\n", 513 | " with open('../outputs/' + fname + '.txt', 'a') as f:\n", 514 | " preds.to_csv(f, header=False)" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [] 523 | } 524 | ], 525 | "metadata": { 526 | "kernelspec": { 527 | "display_name": "Python [conda env:python36]", 528 | "language": "python", 529 | "name": "conda-env-python36-py" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | "version": "3.6.2" 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 2 546 | } 547 | -------------------------------------------------------------------------------- /classification/inputs/2GFP_above_parent123457.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:12d19745f73a6d7cab15890a3cbcd3f70f2ad268f7ca201879532b0677f9cfa2 3 | size 1194349133 4 | -------------------------------------------------------------------------------- /classification/inputs/GFP_data.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e0a3f2014964954309a59cfe998723256b540cd9328338ce14b8fdd2135624df 3 | size 1191884280 4 | -------------------------------------------------------------------------------- /classification/inputs/X_and_terms.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b6cab1d93f08394f25d5a9479da3d1007133a21682a120f1823d39641b5891aa 3 | size 966127630 4 | -------------------------------------------------------------------------------- /classification/inputs/gfp_props.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d8285bfe4bf5c90da7b7ba2c405118ed04653f80aaeba278affc0767f29881b0 3 | size 241396 4 | -------------------------------------------------------------------------------- /classification/inputs/lit_alignment_and_contacts.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:890911d5dd9fa16f61d7c78db99cfe59a33f4b40cda23e4e4b22bc241ce96c17 3 | size 19286 4 | -------------------------------------------------------------------------------- /classification/inputs/props.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:177031fb6fd9f95b0a0cac78c0c5579a55f37d652c4bf193bd7601d0d4551e22 3 | size 123292 4 | -------------------------------------------------------------------------------- /classification/outputs/2GFP_above_parent.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4c32d6190b2233f5fde3198b74215b13f3022eab904c2cfb71b26f7138ac7ed3 3 | size 1193369038 4 | -------------------------------------------------------------------------------- /classification/outputs/matern52_bin0.1_max_peak_False.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:38911daf0d4028dac936d9cdef8e8ae42d290c1c8777cf76fed81e971e3f23f5 3 | size 596315951 4 | -------------------------------------------------------------------------------- /regression/.ipynb_checkpoints/.gitignore: -------------------------------------------------------------------------------- 1 | *checkpoint.ipynb 2 | -------------------------------------------------------------------------------- /regression/GP_matern_5_2_kernel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true, 10 | "scrolled": true 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "from __future__ import division\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "import seaborn as sns\n", 18 | "import os\n", 19 | "import pandas as pd\n", 20 | "import pickle\n", 21 | "\n", 22 | "# ML imports\n", 23 | "from sklearn import linear_model\n", 24 | "from sklearn.cross_validation import train_test_split\n", 25 | "from sklearn.model_selection import LeaveOneOut\n", 26 | "from sklearn.metrics.pairwise import euclidean_distances\n", 27 | "from scipy.spatial import distance\n", 28 | "from scipy import optimize, linalg\n", 29 | "import scipy\n", 30 | "from sklearn.model_selection import KFold # import KFold\n", 31 | "\n", 32 | "# custom imports\n", 33 | "import encoding_tools as encoding\n", 34 | "import chimera_tools as chimera\n", 35 | "import GP_tools as GP\n", 36 | "\n", 37 | "# import scipy\n", 38 | "import seaborn as sns\n", 39 | "\n", 40 | "# define plotting settings\n", 41 | "sns.set_context(\"paper\")\n", 42 | "sns.set_style(\"white\")\n", 43 | "\n", 44 | "# Plot adjustments:\n", 45 | "plt.rcParams.update({'ytick.labelsize': 12})\n", 46 | "plt.rcParams.update({'xtick.labelsize': 12})\n", 47 | "plt.rcParams.update({'axes.labelsize': 14})\n", 48 | "plt.rcParams.update({'legend.fontsize': 12})\n", 49 | "plt.rcParams.update({u'axes.titlesize': 16})\n", 50 | "sns.color_palette('colorblind')\n", 51 | "plt.close('all')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "deletable": true, 58 | "editable": true 59 | }, 60 | "source": [ 61 | "## Load model inputs" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 2, 67 | "metadata": { 68 | "collapsed": false, 69 | "deletable": true, 70 | "editable": true, 71 | "scrolled": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "# load ephys data\n", 76 | "path_inputs = 'inputs/'\n", 77 | "df_input = pd.read_csv(path_inputs+'Ephys_data_formatted.csv')\n", 78 | "\n", 79 | "# load library files\n", 80 | "file_c = path_inputs + 'shmetis_c_10_21_0/chimeras.output'\n", 81 | "file_n = path_inputs + 'shmetis_n_10_21_0/chimeras.output'\n", 82 | "\n", 83 | "# add sequence information to dataframe based on chimera code\n", 84 | "df_input = chimera.chimera_code2seq_convert(file_c,file_n,df_input)\n", 85 | "\n", 86 | "# load contact information\n", 87 | "fname_1 = path_inputs + 'alignment_and_contacts_C1C2.pkl'\n", 88 | "\n", 89 | "# load the contact map\n", 90 | "with open(fname_1, 'rb') as f:\n", 91 | " ss, contacts = pickle.load(f)\n", 92 | " \n", 93 | "# only use the first three parents\n", 94 | "ss = [i[0:3] for i in ss]" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "deletable": true, 101 | "editable": true 102 | }, 103 | "source": [ 104 | "## Data formating" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 3, 110 | "metadata": { 111 | "collapsed": true, 112 | "deletable": true, 113 | "editable": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "def data_format(property_, df):\n", 118 | " # test data only includes gen 10\n", 119 | " df_test_data = df[df.gen == 10]\n", 120 | "\n", 121 | " # remove ChR_29_10 & ChR_30_10 for kinetics and spectra because currents too low for accurate measurements\n", 122 | " if property_ == 'green_norm' or property_ == 'kinetics_off':\n", 123 | " df_test_data = df_test_data[df_test_data.chimera != 'ChR_29_10']\n", 124 | " df_test_data = df_test_data[df_test_data.chimera != 'ChR_30_10']\n", 125 | "\n", 126 | " # training data excludes test data (gen 10)\n", 127 | " df_data = df[df.gen != 10]\n", 128 | "\n", 129 | " # make a seperate dataframe for the selected property\n", 130 | " df_select = pd.DataFrame()\n", 131 | " df_select['prop'] = df_data[str(property_)]\n", 132 | " df_select['seq'] = df_data['seq']\n", 133 | " df_select['block_k'] = df_data['block_k']\n", 134 | " df_select['chimera'] = df_data['chimera']\n", 135 | " df_select.dropna(inplace=True)\n", 136 | "\n", 137 | " # normalize training data\n", 138 | " log_data = np.log(df_select.prop.values)\n", 139 | " y = (log_data - np.mean(log_data))/np.std(log_data)\n", 140 | " seq = df_select.seq.values\n", 141 | "\n", 142 | " # make a seperate dataframe for the selected property for test set\n", 143 | " df_select_test = pd.DataFrame()\n", 144 | " df_select_test['prop'] = df_test_data[str(property_)]\n", 145 | " df_select_test['seq'] = df_test_data['seq']\n", 146 | " df_select_test['block_k'] = df_test_data['block_k']\n", 147 | " df_select_test['chimera'] = df_test_data['chimera']\n", 148 | " df_select_test.dropna(inplace=True)\n", 149 | "\n", 150 | " # normalize test data\n", 151 | " log_data_test = np.log(df_select_test.prop.values)\n", 152 | " y_true_test = (log_data_test - np.mean(log_data))/np.std(log_data)\n", 153 | " seq_test = df_select_test.seq.values\n", 154 | " return log_data, y, seq, y_true_test, seq_test, df_select, df_select_test" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "deletable": true, 161 | "editable": true 162 | }, 163 | "source": [ 164 | "## Encodings" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 4, 170 | "metadata": { 171 | "collapsed": true, 172 | "deletable": true, 173 | "editable": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "def encoding_inputs(df_select, df_select_test, ss, contacts):\n", 178 | " # one_hot_encode based on sequence & structure\n", 179 | " X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n", 180 | " X = np.array(X)\n", 181 | "\n", 182 | " # also encode the test sequences\n", 183 | " X_true_test = encoding.one_hot_(df_select_test['seq'].values, ss, contacts)\n", 184 | " X_true_test = np.array(X_true_test)\n", 185 | " \n", 186 | " return X, X_true_test" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "deletable": true, 193 | "editable": true 194 | }, 195 | "source": [ 196 | "## Train on split training data" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 5, 202 | "metadata": { 203 | "collapsed": true, 204 | "deletable": true, 205 | "editable": true, 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "def cross_validation(X, log_data, property_):\n", 211 | " path_outputs = 'outputs/'\n", 212 | "\n", 213 | " kf = KFold(n_splits=20) # Define the split\n", 214 | " kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator\n", 215 | "\n", 216 | " mu_s = []\n", 217 | " var_s = []\n", 218 | " y_s = []\n", 219 | " \n", 220 | " for train_index, test_index in kf.split(X):\n", 221 | " X_train, X_test = X[train_index], X[test_index]\n", 222 | "\n", 223 | " log_data_train, log_data_test = log_data[train_index], log_data[test_index]\n", 224 | "\n", 225 | " y_train = (log_data_train - np.mean(log_data_train))/np.std(log_data_train)\n", 226 | " y_test = (log_data_test - np.mean(log_data_train))/np.std(log_data_train)\n", 227 | "\n", 228 | " initial_guess = [0.1,10]\n", 229 | "\n", 230 | " # take the log of the initial guess for optimiziation \n", 231 | " initial_guess_log = np.log(initial_guess)\n", 232 | "\n", 233 | " # optimize to fit model\n", 234 | " result = scipy.optimize.minimize(GP.neg_log_marg_likelihood, initial_guess_log, args=(X_train,y_train), method='L-BFGS-B')#,\n", 235 | "\n", 236 | " # next set of hyper prams \n", 237 | " prams_me = [np.exp(result.x[0])**2, np.exp(result.x[1])]\n", 238 | "\n", 239 | " # next used trained GP model to predict on test data\n", 240 | " mu, var = GP.predict_GP(X_train, y_train, X_test, prams_me)\n", 241 | " \n", 242 | " # un normalize\n", 243 | " y_test_real = np.exp(y_test*np.std(log_data_train) + np.mean(log_data_train))\n", 244 | " mu_real = np.exp(mu*np.std(log_data_train) + np.mean(log_data_train))\n", 245 | " \n", 246 | " mu_s.append(mu)\n", 247 | " var_s.append(var)\n", 248 | " y_s.append(y_test)\n", 249 | "\n", 250 | " # reformat all\n", 251 | " y_s_all = [j for i in y_s for j in i]\n", 252 | " mu_s_all = [j for i in mu_s for j in i]\n", 253 | "\n", 254 | " # plot results\n", 255 | " plt.figure('My GP test set evaluation', figsize=(1.5, 1.5))\n", 256 | " plt.plot(y_s_all, mu_s_all, 'o', ms=3, color='k')\n", 257 | "\n", 258 | "\n", 259 | " # calculate correlation \n", 260 | " measured = y_s_all\n", 261 | " predicted = mu_s_all\n", 262 | "\n", 263 | " par = np.polyfit(measured, predicted, 1, full=True)\n", 264 | " slope=par[0][0]\n", 265 | " intercept=par[0][1]\n", 266 | "\n", 267 | " # calc correlation \n", 268 | " variance = np.var(predicted)\n", 269 | " residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(measured, predicted)])\n", 270 | " Rsqr = np.round(1-residuals/variance, decimals=2)\n", 271 | " \n", 272 | " print('20-fold corss validation of GP regression model')\n", 273 | " print('R = %0.2f'% np.sqrt(Rsqr))\n", 274 | "\n", 275 | " max_x = np.max(y_s_all)\n", 276 | " min_x = np.min(y_s_all)\n", 277 | " \n", 278 | " plt.plot([min_x, max_x], [slope*min_x+intercept, slope*max_x+intercept], '-', color='k')\n", 279 | " plt.savefig(path_outputs + str(property_)+'_matern_kernel_CV_fig1.pdf', bbox_inches='tight', transparent=True)\n", 280 | " plt.show()\n", 281 | " return measured, predicted\n" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "deletable": true, 288 | "editable": true 289 | }, 290 | "source": [ 291 | "## Evaluate on whole training set" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 6, 297 | "metadata": { 298 | "collapsed": false, 299 | "deletable": true, 300 | "editable": true, 301 | "scrolled": false 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "def ML_train(X, y):\n", 306 | " # test the optimization of the hyp-prams\n", 307 | " initial_guess = [0.9,0.9]\n", 308 | "\n", 309 | " # take the log of the initial guess for optimiziation \n", 310 | " initial_guess_log = np.log(initial_guess)\n", 311 | "\n", 312 | " # optimize to fit model\n", 313 | " result = scipy.optimize.minimize(GP.neg_log_marg_likelihood, initial_guess_log, args=(X,y), method='L-BFGS-B')\n", 314 | " \n", 315 | " print('Full GP regression model')\n", 316 | " print('Hyperparameters: ' + str(np.exp(result.x[0])) + ' ' + str(np.exp(result.x[1])))\n", 317 | "\n", 318 | " # next set of hyper prams \n", 319 | " final_prams = [np.exp(result.x[0]), np.exp(result.x[1])]\n", 320 | " \n", 321 | " return final_prams\n", 322 | " \n", 323 | "def ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_):\n", 324 | " path_outputs = 'outputs/'\n", 325 | " \n", 326 | " # next use trained GP model to predict full test set\n", 327 | " mu_true_test, var_true_test = GP.predict_GP(X, y, X_true_test, final_prams)\n", 328 | "\n", 329 | " # convert the true test predications and y back to unnormalized data\n", 330 | " y_test_real = np.exp(y_true_test*np.std(log_data) + np.mean(log_data))\n", 331 | " mu_test_real = np.exp(mu_true_test*np.std(log_data) + np.mean(log_data))\n", 332 | "\n", 333 | " if property_ != 'kinetics_off':\n", 334 | " \n", 335 | " par = np.polyfit(y_test_real, mu_test_real, 1, full=True)\n", 336 | " slope=par[0][0]\n", 337 | " intercept=par[0][1]\n", 338 | " \n", 339 | " # coefficient of determination, plot text\n", 340 | " variance = np.var(mu_test_real)\n", 341 | " residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(y_test_real, mu_test_real)])\n", 342 | " Rsqr = np.round(1-residuals/variance, decimals=2)\n", 343 | " print('GP regression model test set')\n", 344 | " print('R = %0.3f'% np.sqrt(Rsqr))\n", 345 | " \n", 346 | " # plot and measure correlation\n", 347 | " plt.figure('True test', figsize=(1.5, 1.5))\n", 348 | " plt.plot(y_test_real, mu_test_real, 'o', ms=3, color='k')\n", 349 | " \n", 350 | " max_x = np.max(y_test_real)\n", 351 | " plt.plot([0, max_x], [intercept, slope*max_x+intercept], '-', color='k')\n", 352 | " plt.savefig(path_outputs + str(property_)+'_matern_kernel.pdf', bbox_inches='tight', transparent=True)\n", 353 | " plt.show()\n", 354 | "\n", 355 | " elif property_ == 'kinetics_off':\n", 356 | " \n", 357 | " par = np.polyfit(np.log10(y_test_real), np.log10(mu_test_real), 1, full=True)\n", 358 | " slope=par[0][0]\n", 359 | " intercept=par[0][1]\n", 360 | " \n", 361 | " # coefficient of determination, plot text\n", 362 | " variance = np.var(np.log10(mu_test_real))\n", 363 | " residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(np.log10(y_test_real), np.log10(mu_test_real))])\n", 364 | " Rsqr = np.round(1-residuals/variance, decimals=2)\n", 365 | " print('GP regression model test set')\n", 366 | " print('R = %0.3f'% np.sqrt(Rsqr))\n", 367 | " \n", 368 | " # plot and measure correlation\n", 369 | " plt.figure('True test', figsize=(1.5, 1.5))\n", 370 | " plt.plot(np.log10(y_test_real), np.log10(mu_test_real), 'o', ms=3, color='k')\n", 371 | " \n", 372 | " max_x = np.max(y_test_real)\n", 373 | " min_x = np.min(y_test_real)\n", 374 | " \n", 375 | " plt.plot([np.log10(min_x), np.log10(max_x)], [np.log10(slope*min_x+intercept), np.log10(slope*max_x+intercept)], '-', color='k')\n", 376 | " \n", 377 | " plt.savefig(path_outputs + str(property_)+'_matern_kernel.pdf', bbox_inches='tight', transparent=True)\n", 378 | " plt.show()\n", 379 | "\n", 380 | " # export csv with predicted values\n", 381 | " df_select_test['y'] = y_true_test\n", 382 | " df_select_test['mu'] = mu_true_test\n", 383 | " df_select_test['y_real'] = y_test_real\n", 384 | " df_select_test['mu_real'] = mu_test_real\n", 385 | "\n", 386 | " df_select_test.to_csv(path_outputs+ 'matern_kernel_gen10_'+str(property_)+'.csv')\n", 387 | " return" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": { 393 | "deletable": true, 394 | "editable": true 395 | }, 396 | "source": [ 397 | "# Train models are different properties " 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": { 403 | "deletable": true, 404 | "editable": true 405 | }, 406 | "source": [ 407 | "### Max_peak" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 7, 413 | "metadata": { 414 | "collapsed": false, 415 | "deletable": true, 416 | "editable": true 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "20-fold corss validation of GP regression model\n", 424 | "R = 0.77\n" 425 | ] 426 | }, 427 | { 428 | "data": { 429 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHcAAABvCAYAAADWvF98AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADFdJREFUeJztnV1oHNUbxp/dyX40VaOU4JbGVAomwTR+paXVaoOipaIt\nVREiQawXRpMl3bqhGBCNRUhRsoKRGMSKpRgvVExLL/xCmoCCF1UUY4XSIm1EN2xjQ01N083u87/Q\nmf/sZGZnZnd2ZnYzPyh0Z2f2vDnPed/zno+Z8ZEkPCoSv9MGeJQOT9wKxhO3gvHErWA8cSuYKjsK\nuXz5MiYnJ1FbWwtBEOwosqLJZDJIpVJYv349wuGw5nm2iDs5OYmOjg47ilpWjI6OYsOGDZrf2yJu\nbW2tZEwkErGjyIommUyio6NDqlctbBFXDMWRSAR1dXV2FFk29Pb2Ynh4GNFoFIlEwtS1el2cl1A5\nQG9vL8LhsCTswsIC3n777ZzjlkAbmJqaYkNDA6empuwozvWEQiECYDgcZmtrKwGwtbU153g+jNan\n57kOEI1GEQ6H0d3djcnJSQDAL7/8knPcCjxxHYL/rdfIBU0kEpifnzfd92phS0LlkYu8n7VSTCWe\n5zqA1eFXC09cE1iVzZoNvwWXW2zml81m+cILL/DgwYNFZ3dux2g2KxKPxxkKhRiPx4sqNxAI5JRr\nS7Z85swZPPXUU/jss8+K+ZmywWw4lfetauh55OzsLLq6upBOp+H3+9HV1WXO4GJa1P79+zk2NrZs\nPNcs8Xic4XCY8Xhc1YsFQSAACoKQc102m+WHH37I66+/nitXruQbb7zBdDotfW+0Pi2ZxPDE1Q/B\nYkgHIJ2jJu7p06e5bds2AuCuXbt47ty5Jb/lTWLYjF4Ijkaj0v/Fc2KxmDQ/vHfvXgwMDKCxsRFf\nfvkldu7cibGxMdxwww3OJVSk+z3XqsRGr4x8IVh5jojo0T6fj36/X/JmedKmTOS8sCzDbJZrV3nn\nz5/nqlWrCIDV1dX84YcfVBuA8pgXlmXYNWmgVp5aSCWJw4cPo6mpCTMzMwD+3V1x++23q46BC56W\nLLqZGsBpz9XDSNiWn2MmzCu9+Omnn6bP5yMAPv744+zs7FziqXrYGpatMsYpjIRR+TlmwrwYUvfs\n2cP+/n4pYw4EAgXnAl5YNoEYRpubmzWz0mg0CkEQkE6npY1p+cK8GI4BoL6+HkNDQ9i/fz82btyI\nUCiEnp4e3Qy7aEw1mQJxu+eK6HmkGY+VZ8H4z1uV1a2WPBnB89w8aI0b9RIhufcqr5Wfn81msXXr\nVgD/X7cFgNbW1pxrtBIly7bbmGoyBeI2zxW9ShAEzT5Py0u1rhWPB4NB3n333TleKwiCKe/UixCe\n5+ZB9FAAmpvTtIZPovdmMpmc/rKzsxOCIGBxcRHJZBKBQCDHa80MYywbuhluTkXgNs8Vkfd5et4i\nz2yV88RffPEF161bx0AgwE2bNtHv9+f0tcqFgWIp26GQHVOFWuXmS27k4ovnPvvss2xvbycAbt26\nlSdPnswRXhAE6Z+Vf0/ZilvqqcJCG49c/Ewmw5GREdbU1DAcDrOqqorPP/+8dJ5c0FL8PWUrbqHD\nA73fFAXVWkM1yk8//cTNmzcTAHfv3s1gMKgbzq3+e8pW3FIg9x4j4iq9jyTn5ua4b98+CoLAxsZG\nHj9+nCRzNpXbhSeuDOVynJ4nyfvNcDjMY8eOce3atQyFQrzrrrukpTm15MoOKl7cUiZeouf6/X7e\ndNNNBMD6+noGAgHJ8+XJlfyzHfZVvLilTLwWFxc5NDTEq6++mrW1tfzggw/o9/uliQllyM63CF8K\n+2wT9/jx43z44Ye5bds29vT08O+//y7YGDOIFdra2mrpcOP777/nhg0bCIAtLS1SCJbPNpmxr2w9\nd2Zmhps3b+Zvv/1Gknz99dfZ399fsDGFoOwfC+XixYvcu3cv/X4/m5ub+c033ywZs8rFEsOueHee\nloilCM+2iHv06FE+88wzOYXecccdzGazBRmjRb4KUstszTI2Nsa6ujpWVVVREATGYjHd35YLn69h\nle0495133uFLL70kfU6n02xoaFgSmosV14qd/mrHzp49y507dxIAt2/frjtmVZYhdgv5wm/ZjnNH\nRkZUxb106VJeY8yGKiMVpDb3q7WDMJ1Oc3BwkCtXrmQkEuFDDz3EYDCoK5RWeXZji7hHjhzhc889\nJ33+/fffuXHjRl1jjCy5mUVt7ldtB+ETTzzB2267jT6fj93d3ZydnS0odNq9o1KOLeKeP3+ed955\np5RQDQ4Osq+vT9cYsaLV9ugWilJQpWfNzs4yGo3S5/Px1ltv5XfffbfkWr3kKF95dmLbUGh8fJw7\nduzg9u3b2dnZyQsXLhg2ppQVJHpWKBTiRx99xNWrV7O6upqDg4M5992oXVNKb7QinFf8JIYe8Xic\nwWCQN954IwFwx44dPHv2rOa54rCm2MZm9J6hYhrQshb3ypUrPHDgAFesWME1a9bw008/XTI8k5Ov\nws0uDBhZ9C+2AS1bcb/99luuX7+efr+fsViMFy9e1PUmtfGsclHA6KYVO/riZSfuX3/9xc7OTsnL\nTpw4IX1ndtM5yZwFAruX9PSoiA1yRrZ4ksTo6Ciamppw6NAhVFVV4Z577snZRmpkw5nWOYIggCRO\nnDhh2jbHcVNLU96PA52pvVOnTvH+++8nAD722GOmZpj0MLOnym7K0nPlt1cMDw9Lx5XetLCwgFdf\nfRUtLS04deoUjh07hk8++QQtLS0AgObmZkvsYZ4Xt9h952BBuKmliZmpz+fTHJaMj4+zqamJgiBw\n3759nJubk76z0puc9Ew9yjKhyrd8l0qluHv3bgLgpk2b+OOPPy65Xi+UmplA0JvxcpKyFFdtSJLN\nZvn+++9z1apVrKmp4cjICDOZTEF2FDOn7SZPLgtx9bzh119/ZVtbGwGwvb2df/75Z1F2FDOn7eRc\nspKyEFfLG/bs2SNtUFu3bh0///xzS+1xc8g1QlmIq+YNX331Vc59Nj09PZq/W4woVm5Ut5uyEFfO\n9PQ0Ozo6CIBr1qwxtC+qmH7Q7EZ1N1E249xsNot3330X9fX1GB0dxQMPPIBz584hHo8XPKtkBPm1\nsVgM4XAYsVismD/FfTjZ0n7++Wdu2bKFAKR9wWaTnFJuTHdrP2xbWC7kkbzz8/Ps6+tjVVUVGxoa\n+PXXXy+55cNIxZYynLpp6KPEFnFPnz7NJ598krfccospcT/++GMGg0G+8sornJ+fX3K+0Yotpbhu\nGvoosaXPHR0dxaOPPooHH3zQ1HWPPPIIZmZm0N/fr/ouOqN9aSn7SqtfJuEEuuJOTEzg5ptvXvLv\nyJEjePnll7Fr1y7ThQqCgKuuukr6rLZ8lk6n8eabb+ZdUqsEAUqKFWGi2Ad7KsOwco7ZzcmNE5TN\nUAhYGobFJ8YIgoDu7m7TT1ori4V0O7CiJZX6kbxmkxs3Z7pWUFaeq4bc+8z2rWWxkG4Hbmppcird\n+4qhLD1X7q2e91mAUy1NLQP2vNUYrvdcZQbc29uLxcVFKUP2KB7HxFWG3eHhYWQyGQQCAW9SwiIc\n7XMp2zrq9bElwKk+wutfC8f1fa7nqaXHsTdfJxIJr28tMa4a53pYiyduBeMqcb3VHGtxlbglf4nS\nMsNV4noZtLU4li2r4WXQ1mKLuJlMBgCQTCbtKK7iEetRrFctbBE3lUoBADo6OuwobtmQSqWwdu1a\nze99ZJ5nA1jE5cuXMTk5idraWukd7R6Fk8lkkEqlpLeBamGLuB7O4Kps2cNaPHErGE/cCsb2ce7R\no0fx3nvvwefzYcWKFXjxxRel50c5zfj4OBKJBK5cuYLGxkYMDAzk3PbiNKbrzoa1ZYkzZ85wy5Yt\nnJ6eJvnvM6Xa2trsNEETo29acYpC6s5WcaempqR34JH/Pmm9ubmZCwsLdpqhitE3rThFIXVXkrA8\nMTGBrq6uJccHBgakuwJJ4sCBA7jvvvsQDAZLYYYpkskkIpGI9DkSiWBubg6XLl1yRWiuq6tDXV0d\nAON1VxJx29racPLkSc3v//nnH/T19SGZTOLgwYOlMME02WxW9bjf766c00zd2W75H3/8gfb2dgiC\ngMOHD+Oaa66x2wRVVq9eLU2TAsD09DRqampQXV3toFW5mK67kncWMi5cuMB7772Xb731lp3FGsLo\nm1acopC6s3X6cWRkBENDQ2hoaMg5fujQIVx33XV2maHJxMQEEokE0uk06uvr8dprr+Haa6912iwA\nhdWdN7dcwbgrW/CwFE/cCsYTt4LxxK1gPHErGE/cCsYTt4LxxK1g/geT6zWgANqEbAAAAABJRU5E\nrkJggg==\n", 430 | "text/plain": [ 431 | "" 432 | ] 433 | }, 434 | "metadata": {}, 435 | "output_type": "display_data" 436 | }, 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "Full GP regression model\n", 442 | "Hyperparameters: 0.0486636299687 19.6621464374\n", 443 | "GP regression model test set\n", 444 | "R = 0.927\n" 445 | ] 446 | }, 447 | { 448 | "data": { 449 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAG4AAABvCAYAAAANB/VeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC2pJREFUeJztnW1MU9cfx7/10lKfUIk45jrdDMMXK26JZjMR11EQHG6y\nETU1sk19gcmKLMCysTF1m8SHmfqA6dSI82HiSCDLEEUdDltdzIyBTCUmmtQ164JFIEAUUOD293/h\n4M9TH723tw/nk5DQ9txzvjnf8zs999xzTmVERGAEHeOkFsDwDWZckMKMC1KYcUEKMy5IiRAqo8eP\nH6OxsRExMTHgOE6obMMWnufR0tICtVoNpVI56nPBjGtsbMSaNWuEyo7xH2VlZViwYMGo9z0yrqqq\nCkeOHIFMJsP48eNRVFSEhISEYWliYmIGC4qNjRVAcnhjt9uxZs2awXodBbnBYrHQokWLqLm5mYiI\nTCYTaTSaUelsNhvFx8eTzWZzlyVjDLq6uujWrVuDr93Vp9uIUygUKC4uxowZMwAAarUara2t6O3t\nhUKhELSVhSudnZ1IT0+HzWbDP//849E1bo1TqVRQqVQAACLC9u3bodVqmWkC0d7ejrS0NNy5cwfn\nz5/3+DqPByfd3d0oLCyE3W5HaWmpTyIZw2ltbUVqaiqsVit+//33MQchzvDoPq6pqQk6nQ4cx+HE\niROIioryWSzjKQ8ePIBWq4XNZkNdXZ1XpgEeRFxHRweysrKQmZmJnJwcn4Uy/s/9+/eRnJyMtrY2\nXLp0CWq12us83Ebczz//jPv376O2thYZGRmDf+3t7T6JDnQKCgqgVCpRUFAgSv7//vsvNBoNOjo6\nYDabfTINgPvbAU8JlduByMhIAkBKpVLwvK1WK82ZM4dUKhXdvXvXZVp39cnmKkeg1+uhVCrxySef\nCJqvxWLBW2+9BYfDgcuXL+OVV155pvyYcSMwGAzo6emBwWAQLM+7d+9Co9FALpfDbDbj5ZdfHpXG\n2y6aGScyt2/fhkajwaRJk2A2mzFr1qwx0xmNRjx58gQ//PCDR/ky40Tk5s2bePvttxEdHQ2TyYQX\nXnjBaVqvu2ihvnhDZXDiivz8fIqMjKT8/Pxh/49FfX09RUdH02uvvUYPHjzwuix39cmM84KhI05X\no89r167R1KlTaf78+dTW1uZTWWxUKSBDuzNnXdvVq1eRkpKCuXPn4uLFi4iOjhZHjE/NwYcWEg6Y\nTCaaOHEiJSYmUmdnp9v0rrpb1lWKxMhKr62tpfHjx1NSUhI9fPjQozxcdbfMOA9xN9gYydBKr6mp\nocjISFqyZAl1dXV5VaZSqWQR9yx4O9U1UOkZGRmkUCho2bJl1NPTI5geNjjxEG/vowwGA06ePImz\nZ88iPT0dv/zyy5irsUTDXy0k1Dh16hRxHEfx8fGkUCg87mI9hUWcCBw/fhxZWVnQ6XSwWq3o7e31\neKpKKJhxXlJaWop169bho48+wvHjx5GTkyPK0wS3+Cu0QwGj0UgAKDs7m3ieF7Us1lUKxN69e6HX\n65GTk4ODBw9i3Dhpq44Z5wE7d+5EXl4eCgoKUFJSAplMJrUkZpw7tm7disLCQnz11VfYtWtXQJgG\nMOOcQkTYtGkTNm/ejG+++QbFxcUBYxog4G6dUIKI8MUXX2DXrl3Ytm0bvvzyS6kljYIZNwIiQl5e\nHvbt2weDwYD8/HypJY0JM24IDocDer0eBw8exP79+wN6ATAz7j94nkd2djaOHj2KQ4cOITs7W2pJ\nLmHGAejv78f69etx8uRJ/Pjjj1i7dq3UktwS9sb19fXhww8/REVFBX766aeg2Q4d1sb19vZCp9Oh\nuroa5eXlWLlypdSSPCZsjXvy5AlWrFiBCxcuoLKyEhkZGVJL8oqwNK6npwcffPABTCYTfv31V6Sn\np0styWvCzriuri4sX74cV69exenTp5Gamiq1JJ8IK+MePnyIZcuWob6+HjU1NUhKSpJaks94NFdJ\nRCgsLMSRI0fE1iManZ2dSEtLw19//YULFy64NE3szY1C4NY4i8WCjz/+GOfOnfOHHlHQ6/WYNm0a\nGhoaUFtbi8TERJfpvd05IwVujSsrK0NmZibeeecdf+gRnNbWVhw4cAD0dCki3nzzTbfXiLW5UUjc\nfsdt3rwZAPDnn3+KLkZompubkZKSAqVSCZ7nPZ57NBgMgm5sFIOQHZw0NTUhOTkZ7e3tuH79Ol59\n9VWpJQlKSBpns9mg1WrR3d0Ns9mMuXPnSi1JcELOOKvVCq1Wi/7+fpjNZsTFxUktSRRCaumCxWKB\nRqMBEeHy5cshaxrgRcTt2LFDTB3PzJ07d6DVajFhwgTU1dXhxRdflFqSqIRExA2cbBAVFQWz2Rzy\npgEBbFxBQQEiIiIQERHhcgZj4GSD6dOnw2QyYebMmX5UKR0Ba5zRaATP8+B53ukMRkNDA5KSkjBz\n5kxcunQJzz33nJ9VSkfAGqfX68FxHDiOG3MG49q1a9BqtZgzZw7q6uqcn10cqvhrk4KQXLlyhSZP\nnkwLFy6kjo4O0cuTgpDb9GEymbB06VK8/vrr+O233zBlyhSpJUlCUBl38eJFpKen44033sC5c+cw\nefJkqSVJRtAYV1NTg3fffReLFy/GmTNnMHHiRKklSUpQGFdVVYX3338fKSkpqKqqwoQJE6SWJDkB\nb1xlZSUyMzPB8zzi4uL8e7JBABPQxp06dQo6nQ7A03X9hw4dklhR4BCwxg2cbLB69Wrk5uYG/BNp\nfxOQj3UOHz6MDRs2YO3atTh8+DA4jsOePXuklhVQSBJxQ1dRjVxRZTQakZ2djYSEBJSVleHzzz9/\npvxDFn/d6Q/F2YGdu3fvJgCUm5tLCoXC52PkxTyC3l8E5MzJWAd2LliwAPn5+fjss8+wd+9erw5+\nGRlhwbBK65nxVwtxhsPhoG+//ZYAUFFRETkcDq/LDoUIG0lARtyQRoOvv/4aW7ZswXfffTd4soG3\n31FhEWEj8VcLGUleXh5xHEcAaMeOHcM+C8UI8paAjDgiQklJCXieh0wmQ1FR0bAn3WEZQd7irxYy\nAM/zNG/ePAIw6i+cI2wkARVxeXl5kMvluHnz5qjPnD3pZoyNqMYNHWT09/ejpKQEDocDMpkMSqVy\n8IgljuPw6aefwmg0hvZNs5CIGdoDg4zIyEhatWoVyWQyksvlg8fgDj0FnA1IhiPpKegDhsTFxVFE\nRARVVFQ4vd7VUe7hiKTfcf39/ejr68O9e/dQWVmJFStWjEoz0J0CEPx320IasVpId3c3yWQyAkBy\nudzpdayLHBvJIq66uhocx0Eul2Pjxo1O07F7Nh8Rq4XwPE/t7e1CZR92SBZx48aNw9SpU8XKPuwR\n7Ak4z/MAALvdLlSWYc1APQ7U60gEM66lpQUAgub0uWChpaUFs2fPHvW+jIhIiAIeP36MxsZGxMTE\ngOM4IbIMa3ieR0tLC9Rq9ZhLEgUzjuFfAnZ5HsM1zLgghRkXpIhinMlkwnvvvYe0tDTk5ubi0aNH\nYhQjGFVVVVi+fDkyMjKg0+lw69YtqSW5R+g7/ra2Nlq4cCH9/fffRET0/fff05YtW4QuRjAsFgst\nWrSImpubiejpT0JrNBppRXmA4BH3xx9/ICEhAS+99BIAYPXq1aiurgYF6OBVoVCguLgYM2bMAACo\n1Wq0trait7dXYmWuEXzvgN1uR2xs7ODr2NhYPHr0CF1dXZg0aZLQxT0zKpUKKpUKwNNFTNu3b4dW\nq4VCoZBYmWsEN87hcIz5vtQ/lOeO7u5uFBYWwm63o7S0VGo5bhG8Np9//vnB6S/g6ZmRU6ZMCehd\npE1NTdDpdOA4DidOnEBUVJTUktwiuHGJiYm4ceMGrFYrAKC8vBzJyclCFyMYHR0dyMrKQmpqKvbs\n2RM0O15FmfIym80wGAzo6+vDrFmzsHPnzoB9xHPgwAGUlJQgPj5+2PvHjh3DtGnTJFLlHjZXGaQE\n9oiB4RRmXJDCjAtSmHFBCjMuSGHGBSnMuCCFGRek/A+4SJtQR/DLjgAAAABJRU5ErkJggg==\n", 450 | "text/plain": [ 451 | "" 452 | ] 453 | }, 454 | "metadata": {}, 455 | "output_type": "display_data" 456 | } 457 | ], 458 | "source": [ 459 | "# select the property of interest\n", 460 | "property_ = 'max_peak'\n", 461 | "\n", 462 | "# format data for property \n", 463 | "log_data, y, seq, y_true_test, seq_test, df_select, df_select_test = data_format(property_, df_input)\n", 464 | "\n", 465 | "# encode sequences\n", 466 | "X, X_true_test = encoding_inputs(df_select, df_select_test, ss, contacts)\n", 467 | "\n", 468 | "# train and CV model\n", 469 | "measured_CV, predicted_CV = cross_validation(X, log_data, property_)\n", 470 | "\n", 471 | "# train model on whole test set\n", 472 | "final_prams = ML_train(X, y)\n", 473 | "\n", 474 | "# use model to predict on test set and evaluate accuracy\n", 475 | "ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_)" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "deletable": true, 482 | "editable": true 483 | }, 484 | "source": [ 485 | "### Norm_green" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 8, 491 | "metadata": { 492 | "collapsed": false, 493 | "deletable": true, 494 | "editable": true 495 | }, 496 | "outputs": [ 497 | { 498 | "name": "stdout", 499 | "output_type": "stream", 500 | "text": [ 501 | "20-fold corss validation of GP regression model\n", 502 | "R = 0.90\n" 503 | ] 504 | }, 505 | { 506 | "data": { 507 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHUAAABvCAYAAADSSY9BAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADQ1JREFUeJztnW1MHEUYx//L3t0ulBSsBU4LbTUI8eBiU0hFMMW2avqh\nvZrWUhRCIkZDIFjv+FK1xDSRWo1XbQ1gbFWiUdTEWDBNVGoEpdYm9S3QalRUAqFQrmKr0N4dd48f\nyG7vFe51b9nuL7nkbnd2Z7L/Z56ZeWZuhyEigoqiSEp0AVRijyqqAlFFVSCqqApEFVWBaKTI5OrV\nqxgcHERGRgZYlpUiS0XjcrkwOTmJwsJC8Dzvd14SUQcHB1FVVSVFVtcV7777LoqLi/2OSyJqRkaG\nWAi9Xi9FlopmfHwcVVVV4nP1RRJRBZer1+uRnZ0tRZaKoKmpCa2trWhoaIDVavU7H6wpUztKMqOp\nqQk8z4uC2u12HDx4EAzDICsrK6R7qKLKiKamJhw8eBB2ux1tbW1oaGgAwzDi+QsXLoR0H1VUGdHa\n2ip+r6+vh9VqRSSheVVUGdHQ0ACe52GxWMQ21LOmhooqagLxbD8BwGq1or6+Hq2treIxs9kc/o1J\nAkZGRigvL49GRkakyE62WCwW4jiOLBYLERExDEMAiGEY8TwAAkAsyxLHcfTEE0+IxwS5FnqeqqgS\nwnEcASCe54mI/MQSzguiCoKHK6rqfiVEaDMLCgrA8zwyMzMBAMnJyWAYBna7HQBQVFSEqqoqMAyD\n5ORk8fpQ21dVVAmxWq24cuUKBgcHYbfbcfnyZRARrly54pXuhx9+wLFjx2AwGHD27FlYLBbwPB9y\n+6qKmgCEGltfXw9grmZ64na7UVpaim+++QarV68WjSFQVCkgUrQlapu6ME6n06vj5HQ6xXO+HSy1\nTV0EXL58GSaTCQCg0WhgNpuh0WjEIc+hQ4fEKFMoqKKGiO+YMlYMDw+jrKwM/f39OH78OJxOp+hm\nhdgvAC93vSBSuBYluF/f4Ugs+PbbbykrK4tWrVpFNTU1Xi6WaM7t8jzvdYxIHafGjGAPOFI++OAD\n4nme9Ho96XQ6cVwaitGoosoIi8VCOp2OSktLCQDt2rWLdDqdV5AhMzPTr8b6oooqIwQBAVBzczO5\nXC7RAwg1VfgIXiGQwKqoMmFycpJWrFhBAGjz5s1+5wVxi4qKvAQN5JIXep5RL2fp7e2F1WqFw+FA\nfn4+9u/fj9TU1Ghvqyh+/vlnbNmyBXa7Hf39/SgrK/NLY7VaAwYX2traQu/1CkRjfRcvXqSSkhL6\n888/iYjoxRdfpGeffdYv3fVUU31dZk9PD6WlpdHtt99OQ0NDMckjru63q6uLHnvsMa/M1q5dS263\nO6xCKAlPl/naa68Ry7J033330dTUVMj3sFgsxLIssSwbsMMU14jS+Pi415JPvV6P//77D9PT09Hc\ndtHS1NSE2dlZJCUlwWAwoK6uDkQEg8GA9PT0eQMYvgvOXC4XXC5XyFEkLyK1SCKi9vZ2am5uFn87\nnU7Ky8uj6enpsCxLKQi1NCkpiRiG8Rt7CueFCXDPWuhZwxNaU2+66SZMTk6KvycmJpCWloaUlJRo\nbrtoaGpqgkajQVJSEjQaDfLy8sAwDFiWRXd3N9asWQMAKCgoAHBtdgaAXyzXc+bGarVidnYWs7Oz\noc/MeBKNZdpsNrrrrrvEjtJLL71Ee/bsCduyFiueKxWET05ODv34449e532HJNFGp+I+Tu3t7aWt\nW7fS5s2b6fHHHw/YIVCqqIKbFATNysqisbExr/PC2FNwt8ECCuGgBh/iiNvtpv379xMAevDBB/36\nEgKeNTYWEwPqfGqccDgcqK2txdNPP41169ahu7sbzc3NAdN6tpe+qx7iQsTmEkPLWgx4uk2bzUbr\n168nrVZLHR0d89a+WLhbX1T3GwGBhBCE0+l0lJubS8uWLaO+vj4xfbCOj6/gapuaIALVPIvFQlqt\nljiOo/z8fPrtt9/8rgskmK/gUrSpqqgBCDT4P3r0KGk0Gtq4cSP9/fffAa+bbwjj2fuNdrJdFTVC\nBIE4jqPi4mICQEajkRwOR9BrggkW66Uwau/Xg3AWjzU0NIDjOGRnZ+PMmTMAgF9//RVarTboNcHW\n50rS4/UkJqYTpWVJRTg1ZnR0lNauXUspKSlkMpnmjcVKjVpTPQi1xlRXVyM7OxsDAwNwOp3Izc0F\nMPeqm0OHDvmlj9fy0YiRg2XJiY8//tgvnuu5hohlWb9r4rF8dD7UmroAxcXFYBgGDMNg/fr12L59\nO3Jzc8FxHIqKisSavXv3bvA8j927d/vVTMnbzIWQg2UlEvjUSpZlyWw2+6XzHJZIXTN9UWvqAixf\nvlz8zjAMXC4X2tvb/dIJf4EQ3poiq5rpw3Ut6u+//46LFy8CALRaLcxmc1CxfCexw/prodTIwV1E\nQ6Sx1N7eXlq2bBmlp6eTTqcL6fp4BOcjQfERpUjat7feeou0Wi2Vl5eTzWaLa17xQPFtqqdbXGi8\n6Ha78dRTT+GRRx5BdXU1Pv/8c9x4440R5SVr5GBZsSLY7ArHcdTY2Eg7duwgAHTgwAEym81erlQu\nrjUUFO9+PQkUUBeEZhiGkpOT6aOPPvI6LhjAfMEFuaF49+tJoF5pRUUFACAlJQVfffUVtm/fDmAR\nudJIkINlxRJPN9rd3U1LliyhNWvWLJh3LOY5pXLhinW/wR6g4FY1Gg0xDEMmk4n+/fffmOU7H1L1\njhXrfj0jPJ7U1dWBZVnMzs7CYrHg1ltvxfLlyyWZQZGNS4+rSYVoWZEQyF1OTU3RvffeSxqNhl5/\n/XUiks/YMpYo1v36UltbSwzDEMdxdOLECfF4rF/AIQeuC1G//vprcZZFp9PFJQ85odg2VeCdd97B\npk2bsGLFCnAcB6PRKK9VCAkgKlG7urpgMpmwbds2VFZWYmBgIFblWhC32429e/eipqYGlZWVGBoa\nEneqCueVb4okUhcwNDREZWVlNDExQURzsx7l5eUhuYtox3MzMzNUUVFBAKilpcXrdQRKbEN9iVub\nOjIyQl9++aX422azUUFBAdnt9gULEU2P9Pz587Ru3TrieZ4+/PDDSIu/qIm6Te3r64PBYPD7nDlz\nBvfcc49Q2/H8889j48aN0Ol0C3qHSMdzAwMDuPPOOzE8PIy+vj7s3LkzrOuvG6K1munpaWpsbKSd\nO3fSpUuXIrKsUDh+/DilpqaS0Wik4eHhiO+jBOLa+x0bG0NlZSVYlsXbb7+NpUuXxsrWRIgIhw8f\nxtatW1FeXo6TJ09i5cqVMc8nXGS31teTSK1lamqKNmzYQK+++mrUlhUMp9NJ9fX1BICefPJJmp2d\njbS4MSeRkaq4vcaus7MT58+fR09PD3p6esTjHR0duOGGG6I2tkuXLmHXrl04ceIE2tvbUVdXF/U9\nY0lDQ0Nkr5iTAjlYli9//PEHGQwGWrp0KX322WdxLt3iI+4vnIw1p06dwrZt25CamopTp07BYDAk\nukiLDlmFCTs7O7FhwwbcdtttOH36tCpohMhCVCLCvn378PDDD2PHjh344osvgm7NrLIwCXe/V69e\nRW1tLTo7O7Fv3z40NzdHtL2kyjUSKuqFCxfwwAMP4Pvvv0dnZycqKysTWRzFkDBRz549iy1btmBm\nZga9vb0oKSlJVFEUR0La1IGBAZSWlmLJkiU4ffq0KmiMSYioDocDNTU1OHnyJFavXp2IIiiahLjf\noqIiv50IVWKHLIY0KrFl0Ykq69kRmbDoRA22iFvlGotOVNmsgpcxCY8ohUuwnZZUriGJqC6XC8Dc\nPjYq0SM8R+G5+iKJqMI2J1VVVVJkd90wOTmJVatW+R1niIjinbmwyDojIwMsy8Y7O8XjcrkwOTmJ\nwsJCcZ8bTyQRVUVaFl3vV2VhVFEViCqqApHtOLWrqwtvvPEGGIZBcnIynnnmGRiNRr90Bw4cwKef\nfoq0tDQAwC233IJXXnlF0rKGstuzpDtCS7ewMXTC+UddRUUFfffddxKWzptQdnsOdUfoWCFL96vT\n6fDcc88hMzMTAFBYWAibzQaHw+GVzuFw4Ny5c3jzzTdhMpnQ2NiIsbExScva398Po9Eozgs/9NBD\n+OSTT0Aeg4pQ0sSShIoa7T/qJiYmUFJSAovFgq6uLtxxxx2or6+P28MKRCi7PUu9I3RC29Ty8nKc\nO3cu6PmZmRns2bMH4+PjOHr0qN/5nJwcHDlyRPz96KOPoq2tDaOjo8jJyYlLmX1xu90BjyclJYWV\nJpbI0v0Cof2j7pdffsGxY8e8jhHRvHvHxJpQdnuWekdoWYr6zz//oLq6Gvfffz9efvnlgKEwYM7S\nW1paMDIyAgB47733kJ+f7+Xq4s3dd9+Nn376CX/99RcA4P3338emTZvCThNLZBkmbG9vx+HDh5GX\nl+d1vKOjA6Ojo9i7dy+6uroAzA19jhw5ApfLBb1ej5aWFtx8882Slrevrw9WqxVOpxMrV67ECy+8\ngJGREa9yBkqTnp4el/LIUlSV6JCl+1WJDlVUBaKKqkBUURWIKqoCUUVVIKqoCkQVVYH8D73TxLYM\nMBhoAAAAAElFTkSuQmCC\n", 508 | "text/plain": [ 509 | "" 510 | ] 511 | }, 512 | "metadata": {}, 513 | "output_type": "display_data" 514 | }, 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "Full GP regression model\n", 520 | "Hyperparameters: 0.104064905203 38.5362857546\n", 521 | "GP regression model test set\n", 522 | "R = 0.964\n" 523 | ] 524 | }, 525 | { 526 | "data": { 527 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHgAAABvCAYAAAAntwTxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC3NJREFUeJztnWlME2kYx/8tCo26SqJ4xDNuEA24Hig7um5cr0CFUgME\no2iUoMRjF6V+AKPBjRtFjfWMRINnYt31LkGjxKhFlitqPJb1ILorIi5abzFAS3n2wy4NSLud0mNm\nuu/vS9POM/M+nf888848877PyIiIwPBZ5EI7wPAsTGAfhwns4zCBfRwmsI/TydsNNjQ0oKKiAkFB\nQfDz8/N28z6HxWKB0WhEWFgYFApFu+VeF7iiogJJSUnebtbn0el0GDduXLvfeQlMRFi9ejWCg4OR\nkpLSbrnBYIBWq4XJZEJISAg2btyIbt262dxWUFCQ1aG+ffs68x8YNqitrUVSUpJ1v7aDHPDo0SOa\nP38+ffXVV7R///52y1+/fk0cx9Gff/5JRERbtmyhdevW2d1edXU1DRs2jKqrqx01zbCBRqOhgIAA\n0mg0ROR4fzq8yNLpdIiLi4NSqbS5/Ndff8XIkSMxZMgQAMCcOXOQn58PYgkyj7Bnzx40NjYiJyeH\nl71DgbOysjBr1iy7y2tra9ucavv27Yu6ujp8+vSJlwMM51i+fDkUCgWWLVvGy97li6zm5mabv8vl\n7A7ME2i1Wmi1Wt72LqvQr18/GI1G6/cXL16gR48e6NKli6ubZthg1apVUCgUWLVqFS97lwWeNGkS\n7ty5gydPngAAfvnlF0ybNs3VzTLssHPnTjQ2NmLnzp287Dsk8G+//Qa1Wg0A6NmzJ7Kzs5GWlgal\nUonKykpkZGR0ZLMMT+DFK3wiYrdJrtK7d28CQL179yYiN9wmMcTFy5cv23w6ggns4zCBJURDQwNk\nMhkA/rehTGCJYDQaMX36dMjlcnTu3BkrV67ktR4TWAI8ePAAHMfh4cOHSEhIcCqJxAQWOVeuXMGE\nCRPg7++PsrIy6PV69+aiGcJx6NAhREZGYuzYsSgpKcGXX37pdC6a3QeLEIvFQpmZmQSAUlJSyGQy\n2bVl98ESo76+HrNnz8amTZuwefNm5ObmonPnztblzuaiWQSLiNraWoqIiCCFQkGnTp2yaRMQEEAA\nSKFQEBGLYMlQUVGBr7/+GlVVVSgsLER8fLxNO2f7YCawCCgoKMA333yDL774AuXl5YiIiLBrq9Vq\nUV9fz/uZMBNYYPbu3Yvo6GhMnDgRxcXFGDx4sFu3zwQWCIvFAo1Gg6VLlyI1NRX5+fno3r2729vx\n+rhoBlBXV4ekpCTk5+dj+/btWLFihTXH7G6YwF6mpqYGKpUKlZWVyMvLg0ql8mh7TGAvcuvWLcTE\nxEAmk6GoqAhjxozxeJusD/YS+fn5+Pbbb9GnTx+Ul5d7RVyACexxiAg7duyAWq3GtGnTcO3aNfTv\n399r7TOBPUhTUxO+//57pKenIz09HWfOnLE7Z8tTsD7YQ3z48AGzZ8/GpUuXkJOTg6VLlwriBxPY\nA1RVVSEmJgZVVVU4f/48IiMjBfOFCexmrl+/DpVKhYCAAJSUlCAsLExQf1gf7EZOnz6NyZMnY/Dg\nwSgvLxdcXIAJ7BaICFu2bEFCQgKio6Nx9epV0UxuZwK7iMlkwuLFi5GRkYHMzEwcP35cVBPvmMBO\n8Ploirdv30KpVOLIkSM4cOAAsrOzxTdt1kODE+wi5REdrUdTPHr0iEJCQigwMJCuXLkimE9sRIcb\naRlNoVarwXEczGYzSktLMWXKFKFdswu7TXICrVaL8PBwJCcnY/z48dDr9ejVq5fQbv0nvCLYYDBA\npVIhMjISaWlpqKura2ezadMmfPfdd1Cr1VCr1bynVkgFIsL69euRlJSEhIQEhIeHY8CAAfxHNwqF\no3M83zJJiYmJdPPmTZf7DDHS0NBA8+bNIwD0448/UnNzc7vRjULhch/Mp0ySyWTCvXv3cPDgQcTG\nxuKHH37A8+fPPXRIepdXr15h+vTpOHHiBI4ePYp169ZBJpM5P8NAIBwKzKdM0osXL8BxHDQaDfLy\n8jBq1CgsW7ZM8rWyHj58CI7jcP/+fVy+fLlNCUZnRzcKhUOB+ZRJGjhwIHJzczF06FDIZDKkpKTg\n6dOnePbsmfs89TKJiYkYPnw4nj17ho8fP+Ls2bNCu9QhHArMp0zSgwcPoNfr26xHRG2mXEiJw4cP\n4+TJkwCAxsZGmEwm3rP5xIZDgfmUSZLL5diwYQOqq6sBAMeOHUNISIho8rF8aW5uxpo1a5CcnIzQ\n0FAEBAQgPDxcEn2tXfhcqRkMBlKpVBQVFUWpqan09u1bunv3LsXGxlpt9Ho9RUdHU1RUFC1cuJBq\namo6dNUnBBqNhvz9/WnYsGEEgPz8/Cg9PV1ot3jhaH+yVCUR+fv7EwACQJ06dRLF7Q9fWKrSAb//\n/ru1UvqcOXOQlpZm85Ts9LRNseDlA05UEVxQUEDdu3en0NBQayLHHmJJbHwOi2A77Nu3DzNnzgTH\ncSguLrYmcuwhlcRGO7x8wAkewU1NTaTRaAgALVmyhMxmsyB+uAsWwa349OkT4uPjsX37dmzbtg05\nOTno1Ml9D9RE2U97+YATLIJrampo7Nix1KVLF8rLy/NIG0L00yyCAdy+fRsRERGora1FUVERYmNj\nAbg/4kTZT3vtUPsXb0fwuXPnqGvXrjR69Oh2bYr1ytgZJBnB7ogsIsKuXbsQGxuLqVOnoqioCAMG\nDGhjI8qIczfePd74RXDryPr8PUF8MJvNtHz5cgJAK1eupKamJne4LkokmarUaDRtxIUTp9H379+T\nUqkkuVxOe/bscZfbosXR/hTloLvPXx2Tk5PD6zT69OlTxMTE4MmTJzh//jyioqI86aYkEKXAreH7\nnqDWk76Ki4sxcuRIL3gnfkR5keUsZ86cweTJkzFo0CCUl5czcVshaYHp30lf8fHxmDlzJgwGg+QG\nGXgayQpsNpuRmpqKjIwMZGRk4MSJE6Ka9CUWRN8H2+Ldu3dISEhAYWEhcnNzsWjRIqFdEi2SE/iP\nP/5AdHQ0/vrrL1y8eJG9Rs8BkjpFl5SUgOM4mEwmlJaWMnF5IBmBf/75Z0ydOhXBwcEoKyvDiBEj\nhHZJEoheYCLCTz/9hLlz5yIuLg6XL1+2/756RjtELXBjYyMWLFiArKwsZGVlQafTYc2aNeJ7qC5m\nvJo4Jf6PC1+9ekX9+/cnABQVFWX93Rce8bkTST4urKysBMdxqKmpAfDP/OQW/heP+NyI6AQuLCwE\nx3GQy+VITk5uJ6ZUZvWJBVEJfOTIEcyYMQOjRo1CaWkpDh48iNDQUGzbtg3jxo0T2j1JIgqBm5ub\nsXbtWixcuBBmsxkGgwEbNmwAANy8ebPNJ8M5BBe4vr4ec+fOtQraQst0zfDw8DafDOcQXOBz585B\nr9fj+PHjVhFlMpm1371x4waICDdu3BDSTcnCKxdtMBig1WphMpkQEhKCjRs3titszcfGFnFxcZgx\nYwYCAwORmJjYsX/BsIvDCH7z5g1Wr16N3bt3o6CgAAMHDsTWrVudtrGHn58fAgMDO+Y9wyEOI9hW\nlR21Wm2tNsPXpgWLxQLgn+IuDNdp2Y8t+/VzHAr8X1V2Wk7BfGxaaKn30bpiDcN1jEajzdfiORSY\nT5UdPjYthIWFQafTISgoCH5+fo6aZzjAYrHAaDTaLT7uUOB+/frhzp071u+2quzwsWlBoVCwpIWb\n+a8XWrqlyg4fG4YwyIgcl6MrLCyEVquF2WzGoEGDsHnzZlRXV2Pt2rXIy8uza8OujoWHl8AM6SJo\nJotPmWJGW4gImZmZOHDgAC97wQR2JTnyf+Xx48dYsGABLly4wHsdwQTmU6aY0RadToe4uDgolUre\n6wg2LtqZ5AjjH7KysgAAZWVlvNcRLIKdSY4wOo5ge5NPmWKG6wgmMEuOeAfB+uCePXsiOzsbaWlp\nbZIjDPfCEh0+Drui8XGYwD4OE9jHYQL7OExgH4cJ7OMwgX0cJrCP8zesO4V8ZRISBQAAAABJRU5E\nrkJggg==\n", 528 | "text/plain": [ 529 | "" 530 | ] 531 | }, 532 | "metadata": {}, 533 | "output_type": "display_data" 534 | } 535 | ], 536 | "source": [ 537 | "# select the property of interest\n", 538 | "property_ = 'green_norm'\n", 539 | "\n", 540 | "# format data for property \n", 541 | "log_data, y, seq, y_true_test, seq_test, df_select, df_select_test = data_format(property_, df_input)\n", 542 | "\n", 543 | "# encode sequences\n", 544 | "X, X_true_test = encoding_inputs(df_select, df_select_test, ss, contacts)\n", 545 | "\n", 546 | "# train and CV model\n", 547 | "measured_CV, predicted_CV = cross_validation(X, log_data, property_)\n", 548 | "\n", 549 | "# train model on whole test set\n", 550 | "final_prams = ML_train(X, y)\n", 551 | "# use model to predict on test set and evaluate accuracy\n", 552 | "ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_)" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": { 558 | "deletable": true, 559 | "editable": true 560 | }, 561 | "source": [ 562 | "### Kinetics_off" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 9, 568 | "metadata": { 569 | "collapsed": false, 570 | "deletable": true, 571 | "editable": true 572 | }, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | "text": [ 578 | "20-fold corss validation of GP regression model\n", 579 | "R = 0.79\n" 580 | ] 581 | }, 582 | { 583 | "data": { 584 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHwAAABvCAYAAAAuXKSLAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADLZJREFUeJztnX9oG+Ufx9+Xa5N0lLai3aKr2m7asa11SipWVuxUnA5m\nGU62drWM/YBqslp7cW6gTtD9chh1wzmlc+hkU6bMBhlUELrJQBDEqbWKKJNlbO3SgWJrlzTJ5/vH\nuHwvySW9y13u8uN5QaC5e+55nt77+Xye5z7Pc084IiIwigaL2RVgGAsTvMhgghcZTPAigwleZJQY\nUci1a9cwPDyM6upq8DxvRJEFTSQSQSAQQENDA+x2u6prDRF8eHgYnZ2dRhRVVBw7dgxNTU2qrjFE\n8OrqagDXK+hwOIwosqAZHR1FZ2dn7L6qwRDBRTfucDhQU1NjRJFZx+Px4ODBg3C73fB6vabUIZPu\nkQ3aMuTgwYMIBoN49913za6KKpjgGeJ2u2G32+FyucyuiioMcemFiNfrNc2Va4FZuEY8Hg/sdjs8\nHo/ZVVEEE1wj+daXM8E1km99OevDNZJvfTmz8CKDCW4yRg/6mOAmIx30GSF+UQlutDUpKU866DNk\nxE8G4Pf7qb6+nvx+vxHFpcRmsxEAstvtOVmeIAhkt9tJEIS449FolKampmLftdzPorJwPR+hROtt\nampKacVqy/N6vZiamoob9fv9ftTV1aGsrAyCIGiud1FZuJ6I1it+MvEagiCQzWZLsmii61Z95MgR\nqqioSCqDWbgJiNbrdDoz9hqp+uzLly+jra0NGzduRFtbG5555hn9gjuqm4hMS9y2bRsdPnw4ZZpC\ntHC1yFlzYp8djUbp+PHjdMMNN9Ds2bPp5MmTsnlpuZ+aBP/jjz+oq6uL7rrrLib4DMw0gLty5Qqt\nXr2aANCTTz5JV65cSZmXaS792LFjeOKJJ7BixQrtrqbASTeAO3nyJBYvXoyhoSF88sknOHHiREbL\nlxShuonIwFx6Zly9epXWrVtHAOjxxx+ny5cvK7qODdpMQksg59SpU2hoaMCpU6fw4YcfwufzGbLA\nkwmugUwiY//88w82btyIlStXorGxEcPDw1i/fj04jstiTf8PE1wDagMrX3/9NRobG3HixAm89957\nGBwcNHwVb8EKrjVuruR6uciYHBMTE3C5XHjkkUcwf/58/Pzzz+ju7jbMquNQ3etngBmDNq1xc73i\n7mfOnKF58+ZRWVkZ7d+/nyKRiKb8iNigTRatcfNU1yv1HFNTU+jr68OyZcswZ84ctLe344UXXsDW\nrVszqo9uaG5uCiikxzIllv/tt99SfX09Wa1W2rdvH4XDYUXXpYutS2EWbhAejwfhcBg8z8t6jmAw\niO3bt2Pp0qWoqKjADz/8gK1bt4LneUUeh82H5wBSq0tnpZ2dncRxHFksFnrttdcoFAplVJbcfHgi\npsXSlZLPgktFlhMkGAzSjh07YlOYVqs163ViguuM1KqdTicBIKfTmZTup59+onvuuYd4nqf77rtP\ntv9V2i+ruYYJrjNSq060cJvNRs899xzt3r2bSktLaeHChfTdd98l5SGm5Xle9ePdTAM8JrhClFqb\n1HVL/xaF4DiOOI6j559/Pm6tmRQxLc/zivrlVOXLwQRXSCaLCsUGEg6HqbW1lQBQVVUVnT17dsZr\n1QqtFCa4QuRESGf1YgOxWq3U0tJCAGjLli00MTGRtgy1fbZamOAaSGf1fX19VFJSQhaLJbYSRUt+\ncmTSQJjgKpHeZNHqnU5n3I3fvHlz7FGL4zjFIqp15ZnE7JngM5BoRXI3WTxms9no8OHDcUuQMxl4\nqamb2ryZ4DOQKHC6vryuro4A0KJFi8hisRDP81ntj5WQ2GBNFXxoaIhWrlxJy5cvp56eHvr333+T\n0pgt+ExWFI1G6eOPP6aqqiqaM2cODQwM6FauHgO4xAZrmuBXr16l5uZmOn/+PBER7du3j1555ZWk\ndGYLno7R0VFatWoVAaC1a9dSIBDQLW+95tQTG6xps2Vnz55FY2MjamtrAQAdHR348ssvQXnyIwuf\nffYZ6urqMDAwAI7jMHfuXNx000265a/Xu2xKV9YoQZPgo6OjcSstHQ4HJiYmMDk5qbliiShZeODx\neFBSUoKSkpKkdNJzLpcLHR0dWLNmDYLBIACAiHSfltRTKN3Q4moOHTpEL7/8cuz79PQ01dfX0+Tk\nZFw6PVy6EvcofcEvMV3iy3+VlZV09OhR6uvrI57nc2JwphTTXPrNN9+MQCAQ+z42NobKykrMmjVL\nS7ayJLpHOYt3u93geR4cx2F6ejru3KZNm2KLBmtra/HLL7+gq6sLb775JsLhMMLhcG5ZYrbQ0tLG\nx8fp/vvvjw3a3njjDdq+fbuuLTIV6Sw+8dzg4CDNnTuXysvLqb+/n6LRqG71kGJEWJXIRAu/8cYb\nsWfPHjz77LNYsWIFfv/9d2zbtk2fljgD6QZE4rnNmzeju7sbjz32GCwWC0KhEH799desLQ/Oi036\nstAAk8j2Y5mcZQ0NDVFtbS3NmjWL3nnnHbJarUnz2npbYjZnyKQUfaRN6sInJyepp6cnFgPfsGED\nEcWLYfReL3pTcIKrtUBRzPb2drrjjjtmXGlilCVmi4ITXC72naoBCIJAVquVmpqaYla9fv36lLNg\nRpKtrqPgBE+0wHQuuLS0NG5WKzGdme47W2UX5IsIJAnPyo3IQ6EQmpubMT09DQDo6upCb29vLJ34\nnF5ZWQkAWLx4sbH/QIp6m46uTS8FWl16IufOnaMlS5bELNtms8XOJa4WRYrIWz5TcBYuRsyCwWBc\nXDwcDqOlpQV33303Ll68iHXr1sFut8Ptdscsev/+/bH4uNZttQqSLDTAJORapCAIKWPYgiAkWefI\nyAjde++9ccfl0mdzdUqukJeDNqUTHRaLhR544AGy2Wx05513ygouTV/IQovkpUsX3bbcm5jiYGfD\nhg1wOBz45ptvsGjRIpw7dw5OpzOWTvwZRjG9IAhxEyDppksTybcfq8mYLDTAJNS2yEgkQgcOHKCy\nsjJZLwCJlYsrT+Wed9N5kUTyKfqWly49FefPn6cHH3yQAFB3dzdt2bIlqU8Wlw2LAqUK1DidTsVz\n3fkUfSsIwaPRKL3//vtUXl5ONTU19NVXX8WdT1xLzvM8cRxHPM+T0+lUHKgpBPJecL/fT48++mhs\nkOZyuZLSSF/Om+kFfaXWatT8td7kreDRaJQ++ugjqqysJIfDQSUlJUmiiojvaUvfAtHqhvPVE5gq\neCbbZ4sTHvPnzycA1NHRQePj40miQvKYpeX121TkU78txTTBM90+Wzrh8fnnn8fSSUVNHF3nqzjZ\nwLTn8Ey3z167di14nsfTTz+N1atXx46Lz9O9vb0QBCEuJJqTS37zkZlaxOnTp2nhwoVJny+++CKW\nhm2fbSxZtfDW1laMjIwkfVatWpWVBpgY8ZJ+TxUNK5oomR7o0eL0tPDEkbM0WpZq2VK+jrYzJS9j\n6alIXDTgdrvjzstNdebkQoNcRY8Wl+0+nI3Q49FyP3X5/fC9e/fqkU1K8u03unOZnHPpjOzCBC8y\nTBecPVIZi+mC58ULeAWE6YKzRypj0WWUrgU2AjcWQwSPRCIAru8Jw9COeB/F+6oGQwQXtwXp7Ow0\noriiIRAI4Pbbb1d1DUeU/T22rl27huHhYVRXV4Pn+WwXV/BEIhEEAgE0NDTAbrerutYQwRm5g+mj\ndIaxMMGLDCZ4kWH6c3i2OH36NLxeL0KhEBYsWIDdu3ejvLxcdRqz8Pl8+OCDD8BxHMrKyvDiiy+i\nsbExLs3evXsxODgY2/Sgrq4Ob7/9dvqMdZ6qzQmU7PKsdCdoM/jzzz9p6dKlNDY2RkTX1xW2trYm\npVuzZg19//33qvIuSJeuZJfnXN4J2mq1YufOnZg9ezYAoKGhAePj4wiFQrE0oVAIIyMjOHLkCNra\n2tDT04NLly7NmHdBCq5kl2cjd4JWS01NDZYtWwbg+l43e/bswUMPPQSr1RpLMzY2hubmZgiCAJ/P\nhyVLlsDlcs3YYAtS8Gg0KnvcYrGoSmM2//33H3p7e3HhwgXs3Lkz7tytt96K/v5+zJs3DxzHYdOm\nTbhw4QIuXryYNs/c+e90RMkuz0buBJ0Jly5dQnt7O3iex9GjR1FRURF3/rfffsPAwEDcMSJCaWlp\n2nwLUvCWlhb8+OOP+OuvvwAAn376KR5++GHVaczi77//xlNPPYXly5fjrbfekg2fWiwW7Nq1C36/\nHwBw/PhxLFiwIK6bkqNgQ6tnzpyB1+vF9PQ0brvtNrz++uvw+/146aWX4PP5UqapqqoyuebAoUOH\ncODAAdTX18cdf/XVV7Fjx45Y/X0+H/r7+xGJROBwOLBr1y7ccsstafMuWMEZ8hSkS2ekhgleZDDB\niwwmeJHBBC8ymOBFBhO8yGCCFxn/A332OQXZ67SxAAAAAElFTkSuQmCC\n", 585 | "text/plain": [ 586 | "" 587 | ] 588 | }, 589 | "metadata": {}, 590 | "output_type": "display_data" 591 | }, 592 | { 593 | "name": "stdout", 594 | "output_type": "stream", 595 | "text": [ 596 | "Full GP regression model\n", 597 | "Hyperparameters: 0.0127667028933 19.179798011\n", 598 | "GP regression model test set\n", 599 | "R = 0.959\n" 600 | ] 601 | }, 602 | { 603 | "data": { 604 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAG8AAABvCAYAAADixZ5gAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAC11JREFUeJztnX1ME3cYx799sW1UdHNDiYJCNpEtJBAlaiIRrYhijG9j\nk4y5yRKVUHxJi8H4AmoaxWz4EoX4lsHmCGyDZQgJAxRbiYnZYmKUGDFBEIygFdFkUyhtn/0hZaiF\nXtu7a6+9T8IfwO+e35P73vO733O/3z0nISKCiCCRetsBEfcRxRMwongCRhRPwIjiCRg5W4b6+vrQ\n3NyM4OBgyGQytswGLFarFSaTCdHR0VCpVA7bsCZec3Mz0tLS2DInMkhpaSni4uIc/o818YKDg4c6\nCwkJYctswNLd3Y20tLSh8+oI1sSzD5UhISEIDQ1ly2zA8PLlS9y/fx/R0dFv/H20WxBr4om4z4sX\nL7BixQp0dnaio6OD8XGieF6mt7cXy5YtQ0tLC/7880+XjhXF8yJPnz5FUlIS2tvbcfny5REnJiMh\niuclnjx5gsTERHR1daGxsRGxsbEu2xDF8wJdXV1YsmQJenp6cOXKlXcmKUwRxeOZhw8fQq1W459/\n/oHRaERUVJTbtkTxeOTBgwdQq9Uwm80wGo2YOXOmR/bEZ5s80draioULF8Jms+Hq1aseCweI4vHC\nvXv3kJCQgDFjxsBoNCIiIoIVu6J4HHPnzh0kJCRg/PjxMBqNmD59Omu2RfE45NatW1i0aBEmTZoE\ng8GAadOmvdNGp9NBpVIhLi4OKpUKOp2OeQfEEp2dnRQZGUmdnZ1smRQ0N27coEmTJlFMTAw9efJk\nxHZKpZIADP2oVCoiYnY+xcjjgL/++gtLlixBREQEGhsbR10Z0Gg0UKlUmDNnDlQqFTIzM5l3xNaV\nJkbea65du0ZBQUE0b9486u3tdduOGHk8YzQakZSUhJiYGNTX1+O9997jtD9RPJa4dOkSkpOTMXfu\nXNTW1mLChAmc9ymKxwK1tbVYuXIl4uPjUVNTg/Hjx/PSL6PHYz///DPKysogkUgQFhYGvV6PDz74\ngGvfBMHFixfx+eefY+nSpaioqBhxsxAXOI285uZm/PDDDygvL0dNTQ3Cw8Nx4sQJPnzzeSorK/HZ\nZ59hxYoV+P3333kVDmAgXnR0NOrq6hAUFIT+/n48fvyY8xuxECgrK8P69euxdu1a/Prrr1AoFCO2\ntSfiLiXgTGA6dW1oaKC5c+dSfHw8tbW1uTW19RdKSkpIKpVSWloaDQwMOG1vT8TtCTgTmJxPl/O8\nX375hdRqNVmtVpc78wfOnTtHEomENm7cSBaLhdExWq2WVCoVabVaxv2wIl57ezv9/fffQ79bLBaK\nioqiZ8+eudyZ0CksLCQAtHnz5ncuXrZhJUk3mUzQarV49uwZAKC6uhozZ87E+++/z+747eMcP34c\nGo0GWVlZOH36NKRS72dZTlOFuLg4ZGRk4Ouvv4ZMJsPkyZNRWFjIh28+w5EjR7Br1y7odDp89913\nkEgk3nbpNXyGuRA5ePAgAaDdu3eTzWbjrV/x2aYHEBH27duH3Nxc7N+/H3q93ncibpCAF89RDkZE\nyMnJgV6vx6FDh5CXl+dzwgHw72FTq9WSUqkcdYr+dg5ms9lo+/btBIAKCgpcssUmnOR5nnTGN0yS\n4+E5mNVqpYyMDAJAJ0+edNkWmwS8eK4kxxaLhb799luSSCR05swZj2yxQcCLx5SBgQHasGEDSSQS\nKi4u9rY7RMTsfAb8jumBgQFs2LABv/32Gy5cuCCoV7MDdrap0+mgVCrx6aeforKyEuXl5UPCcbYK\nwDZ8hrk3GT5b1Gq1b2y3++OPP95oy/fkxBHiPW+Q4WKpVCpSKBRDv69Zs8Zhez4nJ44QxRtk+MbW\nrVu3UlhYGAGgdevWedu1EREfjw1i39ialZWFmzdvoqenB42NjaisrPS2ax7ht+INn3QUFBSgu7sb\nN27cwM2bN1FXV4fFixd720WP8VvxCgsL0d/fj6KiIvT29iIxMRF37txBQ0MD4uPjve0eK/hlnqfT\n6WCxWCCTyZCeng61Wo0HDx7g8uXLmDNnjrfdYw2/FK+wsBBWqxVKpRJNTU3o7u7GlStXEBMT423X\nWIXRsFlVVYVVq1Zh9erVSE1Nxe3bt7n2yyM0Gg2USiXGjRsHk8kEg8Hgd8IBcJ6kt7a20oIFC+jx\n48dERGQwGCghIcGtqS1fdHR00Mcff0xTp06lu3fvetsdt2AlVVAoFNDr9Zg8eTKA15twnz59CrPZ\nzPmF5Q7t7e1ISEhAf38/jEYjZs2a5W2XOMPpPS80NHSoih8R4fDhw1Cr1aPuEPYWra2tUKvVkEql\nuHr1KsLDw73tEqcwThVevnyJ7du3o6OjA3q9nnVHPH0Y3NLSgoULF0KhUASEcABD8R49eoTU1FTI\nZDL89NNPnLx7Zs/LTpw44bKI9ooLEyZMgNFoRFhYGOv++SJOxXv+/Dm++uorJCUl4dixY6y9CfN2\npNkfYQEYSq6dHQP8X3Hhww8/hMFgwNSpU1nxTxA4m/UUFRVRVFQUrVq16o0fT7e7j7TsMtoT/beP\nYVpxQYj49KqCO8suw4+5fv06TZw4keLi4qinp8ddt30WnxbPE5qamigoKIjmz59Pz58/57w/b+CX\nS0IGgwHLly9HbGws6uvrMXHiRG+75DW8Jp6rqYFOp4NCocDSpUuHKi4EBQVx7KWPw2eYD8fVfSJy\nuZwAkFQqpX///dcTVwWBTw+b9tTAXq5ptEisqqqCzWaDVCpFVlYWxo4dy7e7vgmfV8po+/3fjkR7\n25UrV5JcLqe1a9dSf38/W+76PD432xxtqHw7dRi+aeiLL74gs9nMlquCwOd2TGs0GhQVFTmsbFdQ\nUAAAQ2/dLlq0CHV1dfjkk09QWloKudwv1409g88rxRn2aJPL5SSRSCg9PZ1xxQV/wycmLK6kBBqN\nBnK5HBaLBVu2bMH58+fFb/GNAufiDd/F5YzQ0FBYLBZs27YNRUVFPlFxwZfh7OzYI87+5UVnFVzz\n8/Oh1WqRnZ2N48eP++ZrxL4GV2M00yTcZrPRgQMHCADt2bOH14oLvoxX73nDk/CR7ntEhL179yIv\nLw8HDx70yYoLPg0fV4qjKLTZbJSdnU0AKD8/ny03/AafmG0C7z4KIyLs2LED33//PY4ePYqcnBw+\n3PA7eMt8iQgAYLPZkJmZiTNnzuDUqVPQaDR8ueB/MAlhm81GOTk5dP78ebfC3D5sKpVKSk9PJ4lE\nQmfPnmXSdcDCyrDZ2tqKb775BrW1tW5fIPbt5xEREfjxxx9RXFyMTZs2uW1P5DVOh83S0lKsW7fO\no11Z+fn5ePjwISorK3HhwgV8+eWXbtsS+R+n4uXm5gIArl+/7lYHZrMZqampqK6uRnl5OVJSUtyy\nI/IunE5Y+vr6kJKSgvr6elRUVGD16tVcdhdwcCbeq1evsGbNGhiNRlRVVSE5OZmrrgIWzsSrrq5G\nU1MTampqkJiYyFU3AQ1n4qWkpCApKUn8BgOHMBYvPz/fJcNSqVQUjmNYizyr1QoA6O7uZstkQGM/\nj/bz6gjWxDOZTAAgqKp5QsBkMmHGjBkO/ychGnzo6CF9fX1obm5GcHCwuHWBBaxWK0wm09BitiNY\nE0+Ef8RNIgJGFE/AiOIJGMGLJ7TqTEy4dOkSZs+e7bwhT2uLnMC0OpOQaGtro8TERIqNjXXaVtCR\nJ7TqTM549eoVdu7ciV27djFqL+i3N4RUnYkJubm5WL9+PeOSW4KOPDtcV2fiA/ubUK4sVgs+SX/0\n6BEyMjLw0Ucf4fDhw7x/7potUlJS0NfXB5lMhoGBAbS1tSEyMhJnz57FlClTHB/E9Q2YS3p7e2nx\n4sXvfLRJ6HR2djKasAj6nldWVoauri40NDSgoaFh6O8lJSUB8U1bwQ+bgYxfTFgCFVE8ASOKJ2BE\n8QSMKJ6AEcUTMKJ4AkYUT8D8B/eBZzDOS3kWAAAAAElFTkSuQmCC\n", 605 | "text/plain": [ 606 | "" 607 | ] 608 | }, 609 | "metadata": {}, 610 | "output_type": "display_data" 611 | } 612 | ], 613 | "source": [ 614 | "# select the property of interest\n", 615 | "property_ = 'kinetics_off'\n", 616 | "colors = sns.color_palette('colorblind')[2]\n", 617 | "\n", 618 | "# format data for property \n", 619 | "log_data, y, seq, y_true_test, seq_test, df_select, df_select_test = data_format(property_, df_input)\n", 620 | "# encode sequences\n", 621 | "X, X_true_test = encoding_inputs(df_select, df_select_test, ss, contacts)\n", 622 | "# train and CV model\n", 623 | "cross_validation(X, log_data, property_)\n", 624 | "\n", 625 | "# train model on whole test set\n", 626 | "final_prams = ML_train(X, y)\n", 627 | "\n", 628 | "# use model to predict on test set and evaluate accuracy\n", 629 | "ML_predict(X, y, X_true_test, y_true_test, log_data, final_prams, property_)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": { 636 | "collapsed": true 637 | }, 638 | "outputs": [], 639 | "source": [] 640 | } 641 | ], 642 | "metadata": { 643 | "kernelspec": { 644 | "display_name": "Python 2", 645 | "language": "python", 646 | "name": "python2" 647 | }, 648 | "language_info": { 649 | "codemirror_mode": { 650 | "name": "ipython", 651 | "version": 2 652 | }, 653 | "file_extension": ".py", 654 | "mimetype": "text/x-python", 655 | "name": "python", 656 | "nbconvert_exporter": "python", 657 | "pygments_lexer": "ipython2", 658 | "version": "2.7.6" 659 | } 660 | }, 661 | "nbformat": 4, 662 | "nbformat_minor": 2 663 | } 664 | -------------------------------------------------------------------------------- /regression/GP_matern_5_2_kernel_LASSO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "from __future__ import division\n", 14 | "import numpy as np\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "import seaborn as sns\n", 17 | "import pandas as pd\n", 18 | "import pickle\n", 19 | "\n", 20 | "# ML imports\n", 21 | "from sklearn import linear_model\n", 22 | "from scipy import optimize\n", 23 | "import scipy\n", 24 | "\n", 25 | "# custom imports\n", 26 | "import encoding_tools as encoding\n", 27 | "import chimera_tools as chimera\n", 28 | "import GP_tools as GP\n", 29 | "import lasso_tools as lasso_tools\n", 30 | "\n", 31 | "# Plot adjustments:\n", 32 | "sns.set_context(\"paper\")\n", 33 | "sns.set_style(\"white\")\n", 34 | "plt.rcParams.update({'ytick.labelsize': 12})\n", 35 | "plt.rcParams.update({'xtick.labelsize': 12})\n", 36 | "plt.rcParams.update({'axes.labelsize': 14})\n", 37 | "plt.rcParams.update({'legend.fontsize': 12})\n", 38 | "sns.color_palette('colorblind')\n", 39 | "\n", 40 | "plt.close('all')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "deletable": true, 47 | "editable": true 48 | }, 49 | "source": [ 50 | "## Convert data to usable form" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 8, 56 | "metadata": { 57 | "collapsed": false, 58 | "deletable": true, 59 | "editable": true, 60 | "scrolled": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "# load ephys data\n", 65 | "path_inputs = 'inputs/'\n", 66 | "df_input = pd.read_csv(path_inputs+'Ephys_data_formatted.csv')\n", 67 | "df_input = df_input[df_input.gen != 3]\n", 68 | "\n", 69 | "# load library files\n", 70 | "file_c = path_inputs + 'shmetis_c_10_21_0/chimeras.output'\n", 71 | "file_n = path_inputs + 'shmetis_n_10_21_0/chimeras.output'\n", 72 | "\n", 73 | "# add sequence information to dataframe based on chimera code\n", 74 | "df_input = chimera.chimera_code2seq_convert(file_c,file_n,df_input)\n", 75 | "\n", 76 | "# load contact information\n", 77 | "fname_1 = path_inputs + 'alignment_and_contacts_C1C2.pkl'\n", 78 | "\n", 79 | "# load the contact map\n", 80 | "with open(fname_1, 'rb') as f:\n", 81 | " ss, contacts = pickle.load(f)\n", 82 | " \n", 83 | "# only use the first three parents\n", 84 | "ss = [i[0:3] for i in ss]" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "deletable": true, 91 | "editable": true 92 | }, 93 | "source": [ 94 | "## Spectra" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 9, 100 | "metadata": { 101 | "collapsed": false, 102 | "deletable": true, 103 | "editable": true, 104 | "scrolled": false 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "20-fold corss validation of GP regression model\n", 112 | "R = 0.92\n" 113 | ] 114 | }, 115 | { 116 | "data": { 117 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHUAAABvCAYAAADSSY9BAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADTpJREFUeJztnX9sE2UYx7/tzfVYxBGTQQFRQB2LlEzTmYAoZVNgmg1I\nzMC5YdSJ2daMaRvjDBJMlDl/dBKXMRR/ZUHdP0YWFHEmuJqRGNMlmJThH5phNmdnpwPcj3bb9fGP\n5c7+bq+9XrvjPkn/6N3b93173/d93ud93vfuNEREUFEU2nRXQEV6VFEViCqqAlFFVSCqqAokS45C\nPB4PnE4n8vLywDCMHEUqGo7j4Ha7YTAYwLJsyHlZRHU6naiqqpKjqOuKTz/9FEVFRSHHZRE1Ly9P\nqIRer5ejSEXjcrlQVVUlXNdgZBGVN7l6vR633HKLHEUqCqvVivb2dpjNZthsNuF4pKFMdZQyiKKi\nImg0Gixbtgwsy8JqtQIAWltb4fV60draGlc+qqhpxGq1BojX398PAPjrr78EETUajeh8VVHTSHt7\nO7xeL44dOwYAMBqNAIClS5cmla8qahoxm81gWRb19fUAAIfDASLC6OgoLBZLwvmqosoIb26LioqE\n+eX09DRsNluIKbbZbFi/fn1iBZEMDA0NUX5+Pg0NDclRXEZisVgIQMCHZdmQcwzDkE6no+rq6pD0\nS5cuJaLY11OWKY3K/PjJYzQacfHiRaxfvx4sy2Jubk44x3EcOI7DyZMnkZeXh2vXrsFgMMDpdKK6\nujquslTzKxP8+GmxWOBwOFBfX4/+/n54vd6w6VesWIGKigoAwIULFwIcqpjIYXpU8xuKTqcTzKrR\naCSGYQJM7eTkpJCGYRhiWZYsFgsRxb6eak+VGd4h4oPxFosFTqcTHMcBADQaDRobG5GTkyP07sbG\nRsGhigs5WqXaU/+H7328k0REZDKZhB6q0+li5qH21AzDf25KRNi0aRPsdjuWLVsGnU4Hs9mcdBmq\n9yszNpsNNpsNPp8PjY2N+PHHHwEAV65cgcfjkaQMtacmQXDAIF7m5ubw1FNPoa2tDSUlJWBZVpIe\nKiDVWJHMGLBQCTc+xuKee+4Rxs/Ozk4img8+6HQ6MhqNpNPpBC83ErGupypqElgsloCpRqy02dnZ\nAdMWXkD/6Q3/iZanKmqGoNVqA0TTaDRCL+cbh9FoDAkhhkP1fjMAl8sFn88nfCciPP/884IXbLPZ\nMD09DYfDAYvFApZlhRCi2PGaLyDlLPSeypvIeMxsMIODg3T77bcHBOzjIdp4nfKe2tvbi/LycuzY\nsQMHDhzAxMREsllmHMGL2TyxvN8nn3wSa9aswdjYGJ5++mkhOhQPwWutohDd9Pz4+++/aePGjTQ4\nOEhERG+++SYdPnxYdMvKdCI5RNF6k8PhEBUlEkNKe2pfXx82bNiA1atXAwAqKytx+vRp0AK+OzJS\n7wv3n4J7E//bvXv3ori4WNIokSiSaTHvvfceHTp0SPg+OztL+fn59O+//4pqWZlEuN4X7ljwOBu8\nCL5161a6evVqxLSJjtFEKZ7SdHR0hBV1cnJSVCUyiXCmNtyxYKH955q5ubmk1WqJYZiAeWhwWjFB\nC39SKuqpU6eotrZW+D48PEz33nuv6EosRIKFfuihhwgAFRQUBAQZ/Oeh/j013qBFOFIq6tjYGG3a\ntElwlN5++21qamoSXYmFTktLCwGguro64jiOLBYLMQwj9NRIJGqGUx5R6u3tpfLyciotLaVnn32W\nxsfHRVdioeLz+aipqYkA0EsvvUQ+n0/U7/ndDvHOXXlSvvHMZDLBZDIlm82Cw+fzwWw24/jx42AY\nBl6vN+Ju+kj3wqQMUU0kQRZyTw1nImdmZujxxx8njUZDWVlZMZ2eSI5RomOrGtBPkmBBpqamqKys\njLKysqirqysuYZJ1jIJRRY2DaA6LvyBXr14lk8lELMvS119/HVe+wQ5TsnNUIlXUuIhn3uh2u6mo\nqIgWL15Mdrs9Zp7BwQip5qhE6tJbTKxWK+bm5sAwTMTg+R9//IEtW7bg8uXL+P7777Fly5aY+frv\nyPfPO6lAfbwk3FwkbFnpJFbP+fXXX2n16tW0cuVKunTpUtz5Sj2O+qP21ChE6qVWqxVZWVlgGAYF\nBQX4/fffUVpaioKCgrg3m/EL37JMYYKRvBkl0LLShX8v9Xdgwu0Z4u9G4wMG4aYnyTpA8aI6SlHw\nN5H+At95552CmIWFhcSybED0R+zaqtSoosYJL/DOnTvDLm7zm8KMRmPU36s9NcPo7OwkhmHojjvu\nCDGlcvbEWKiOUpy0tbXhiSeewL59+3Dp0iV4PJ4AJ0eWqYhUZELLEoPUDonP56NXX32VAFBjYyNx\nHCdJvqlEceZXSjPo8/nIarUSAHrllVdEL52lC8WZX6nMIMdx2L9/P2w2G44ePYrDhw9HfRBVojdD\npYVMaFly4/V6qaKigrRaLX388cdx/SbYQsg5Lw1GceZXLMEXf2Jignbs2EHZ2dn0xRdfiMrHf8qS\nTm/4uhbVf6WEZVkaHx+nzZs3U05ODvX09CSdt1zz0mAULWosE+gf7qutraW7776blixZQufPn5e0\nHnKjOEfJn0j3uPDwTtUzzzyDc+fOYWRkBL29vbjvvvtkrqm8LDhR/b3QSLc9WK1WYbNXZWUlvv32\nW3g8HvT19aGwsDDN/0AGMsFciCHSykrwOX/Tq9FoaP/+/Wn1WKVEcWNqpJWV4P1Ae/fuDdlOkknT\nkmRQnKj+RBL47NmztGjRInrggQfIbDYLafxvw4+2NprpKFpUf3jBysrKSKvVklarpYaGhrBp+QbA\nP3ch0nJapqJo79cfm82G9vZ2nDlzBsD8DvoTJ06ETcs7WFrt/N+/cOHCwgkBxoFiRG1tbUVNTQ1q\nampw4MCBqPFhfv9QY2MjGIYBx3HiHr0agYyJD2eCuUgGn89Hhw4dIgD0wgsviF5p8feSE3WY+DFd\nrjFa0WMqx3HU0NBAAKi5uTmhpTMpwn2RnsubKhQr6uzsLO3bt48AUHt7u2T5JoLccWBFijo9PU27\nd+8mhmHo5MmTkuS5kEj5/alyMzExgV27duH8+fP48ssvUV5enu4qZRxJidrd3Y0PP/wQGo0GixYt\nwsGDB7Fhwwap6hbCP//8g0ceeQQXL17EN998g+Li4pSVtaBJ1AT89ttvtHnzZhodHSWi+ccEmEym\nhMxFPIyMjJDBYKCbb76Zfvrpp4TzUQIpM7/Z2dl47bXXhPeSGQwGjI2NYWZmBtnZ2ZI1OgAYHBzE\ntm3bMDU1hR9++CHxNytdJ8QU1W63o66uLuR4c3Mzdu/eDWD+aWCvv/46SkpKJBd0YGAA27ZtA8uy\n6Ovrw9q1ayXNX4nEFNVkMmFgYCDi+ampKTQ1NcHlcuGDDz6QtHIOhwOlpaXQ6/Xo6enBihUrJM1f\nqSQVJhwZGcFjjz0GhmHQ2dmJm266KeG8gkNsdrsdJSUlWLt2Lex2uyqoGBIdrMfHx6m4uJja2tqS\nHtiJAhe4v/rqK2JZloqLi+natWuJVpGIFu6aaTRStkrz+eef488//8R3332HXbt2CZ/x8fGE8uNX\nTpYvX46ysjKsXLkSZ86cweLFi4U0iQTMY+1jUiSZ0LJ4Ojo6oj4jN5EnhKVzK2eqyMj11HA9rqWl\nBXV1dSgsLJTsGbn85jP+OfXXDeloWf7jp8/noxdffJEA0MGDB6OutIjtdZl0T6mUZGRAnxfnueee\no9raWgJAb731luTlKtH0EmWoqETzz/errKwkjUZD77//vhzVUAwZuUozPT2NiooK9PT0oKurC3v2\n7ElHNRRLWkQ9ffo0zp07h+7ubjz88MPpqIKiSYuojz76KLZv344lS5ako3jFk5YpDcMwqqApRJae\nyr9v2+VyyVGc4uGvI39dg5FFVLfbDQCoqqqSo7jrBrfbjdtuuy3kuIYo9a978ng8cDqdyMvLA8Mw\nqS5O8XAcB7fbDYPBAJZlQ87LIqqKvCjmtguV/1FFVSCqqAok7Zu549073NLSgrNnzyI3NxcAsGbN\nGhw9elTu6gbQ29sLm82GmZkZrFu3Ds3NzbjxxhtFp5EcGePQIYjZO7xnzx7q7++XsXbRieeFwPG+\nNFhq0mp+o+0d9mdmZgYDAwP46KOPsHPnTjQ0NGBkZCQdVRaI54XA6XppsCyi2u123HXXXSEfh8OB\nrVu3Aoi+d3h0dBQbN26ExWJBd3c3CgsLUV9fn9Y3KrtcLuj1euG7Xq/HxMQEJicnRaVJBbKMqcnu\nHV61alXArf41NTU4duwYhoeHsWrVqpTUORY+ny/scf6RA/GmSQVp937j2Tv8yy+/4NSpUwHHiAg3\n3HCDXNUMYfny5UL4E5i3Jrm5ucjJyRGVJhWkVdQrV66guroa27dvxzvvvBM25AXMt+wjR45gaGgI\nAPDZZ59h3bp1AaZNbu6//378/PPPuHz5MgCgq6sLDz74oOg0qSCtYcKOjg68++67yM/PDzj+ySef\nYHh4GC+//DK6u7sBzE99Tpw4AY7joNfrceTIkbTv2rfb7bDZbJidncWtt96KN954A0NDQwH1Dpcm\n1cuOauxXgaR9TFWRHlVUBaKKqkBUURWIKqoCUUVVIKqoCkQVVYH8B/xakPtx+JySAAAAAElFTkSu\nQmCC\n", 118 | "text/plain": [ 119 | "" 120 | ] 121 | }, 122 | "metadata": {}, 123 | "output_type": "display_data" 124 | } 125 | ], 126 | "source": [ 127 | "# property of interest\n", 128 | "property_ = 'green_norm'\n", 129 | "lasso_alpha = 2.5e-2\n", 130 | "\n", 131 | "# format data for property \n", 132 | "log_data, y, seq, df_select = lasso_tools.data_format_all(property_, df_input)\n", 133 | "\n", 134 | "# encode sequences\n", 135 | "X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n", 136 | "X = np.array(X)\n", 137 | "\n", 138 | "# use lasso to limit the input \n", 139 | "coeffs = lasso_tools.lasso_(lasso_alpha, X, y)\n", 140 | "\n", 141 | "# reformat X with only lasso-limited set for GP model\n", 142 | "X_lasso = lasso_tools.lasso_reformat_X(coeffs, X)\n", 143 | "\n", 144 | "# evaluate cross-validation performance of GP model with lasso-limited set\n", 145 | "measured_CV, predicted_CV = lasso_tools.cross_validation(X_lasso, log_data, property_)\n", 146 | "\n", 147 | "# Bayesian ridge regression to find weights\n", 148 | "clf_ff = linear_model.BayesianRidge()\n", 149 | "clf_ff.fit(X_lasso, y)\n", 150 | "weights = clf_ff.coef_\n", 151 | "\n", 152 | "# find features \n", 153 | "df_features = lasso_tools.find_features(df_select, ss, contacts, coeffs, X, weights)\n", 154 | "\n", 155 | "# find the correct numbering for each feature\n", 156 | "df_features_reformat = lasso_tools.refromat_feature_numbering(df_features, df_select, property_, lasso_alpha)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": { 162 | "deletable": true, 163 | "editable": true 164 | }, 165 | "source": [ 166 | "## Peak photcurrent" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 10, 172 | "metadata": { 173 | "collapsed": false, 174 | "deletable": true, 175 | "editable": true 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "20-fold corss validation of GP regression model\n", 183 | "R = 0.80\n" 184 | ] 185 | }, 186 | { 187 | "data": { 188 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHgAAABvCAYAAAAntwTxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADIxJREFUeJztnW1sFEUYx/93e3ddtKFgaHOaWl+IbSIF0x5iEaTRoNWY\nGtLEWKxRSJBQTmi6NQREYjCkVetpIhRigoaQEP1govUbKYkH+kEkGiDV+AWIKUKxVTC8XXvtPX7A\n2Wz39vZ29/btjvklF3p7szPD/Od5ZubZ2d0AERE4JUvQ6wpwnIULXOJwgUscLnCJwwUucUJuFJJK\npTA8PIzKykoIguBGkSXN9PQ0xsbGUF9fD1EUddO6IvDw8DA6OjrcKOq24tChQ1i8eLFuGlcErqys\nlCsUjUbdKLKkGR0dRUdHh9yuergiMHPL0WgU1dXVbhRZMvT09GBgYADxeByJRGLGb0aGOz7J8jkD\nAwOYmJjA3r17LZ3PBfY58Xgcoihi48aNls53xUVzrJNIJLJcsxm4BZc4XGAX6enpQSgUQigUwuLF\niyGKInp6evL+VhDkAiMjI1RbW0sjIyNuFOdbysrKCMCMjyiKeX9TY6Y9uQW7SDwehyAIEAQBsVhs\nxuSJ/aaE/dbT02Pdom3tojngFmwMSZJk6xUEQT7OrJtZNLdgmynIgkwwMDCQVS5Q4FLJse6ooNgt\nWG1BTiFJEomiSIIg8DHYTaxYkBWrTyQSuHnzJrq6uuTyJiYmsH37djz99NMgK9vnbO6EmhS7BVuB\nWb0gCFRWVkaSJJnO45dffqGFCxdSKBSi/v5++Ti3YB/ArB6A6VhyOp3Gzp07sWTJEgDATz/9hDff\nfNNSPbjADqHlbo0wPDyMpqYmvPvuu9iyZQtOnDiBhoYG6xUx7TcscDu6aLNMTU3Re++9R5FIhOrq\n6ujHH3/MmZa7aB+jnHyxv9euXYvly5dj27Zt2LRpE5555hk0NzfzUGUxwpZAgiBQJBKRAxvz58+n\n77//nojyL8u4BXuMkSUSEclbbh555BGcOnUKy5cvB1D4NWB1QY7jdwuWJMnyUkYLPQvs7u6mUChE\n4XCYampq6MiRI6bzN9OeXGCyP1LFIlLqDrNu3ToKBAIEgNatW0f//vuvpfy5wCbJJYhdMKvF/+Nt\nOBwuKD8usANIkkSCIJAgCCRJkmG3/ueff1IwGCQAFAgEbBkKuMAGUApkRCzlBXlRFHOGIlle3d3d\n9NxzzxEAecy1y0NwgQ2gHHeVSxc1TLBYLDbDgquqqmbsvhAEQR5f1R/WAVg+hVoxF9gAynFXKbDa\nmnNNwNQisjy0Psoy1MetwAU2iVJspStWW6pSkFgsJh+vqqqizs5OzT1V7Bwtga3O2rnABaDcNiOK\noiFBvv32W4pGo1RRUUH3338/AaBYLJaVL3PxsViMRFHMcvvKtHpunAusgZlghlKMWbNm5XSpV65c\noTVr1hAAamlpMf3/U0/c1MftCFXeNgKrXa+Z9FqNffjwYSovLycAtHLlSspkMqbrpF56KY/rrcuL\nXmC7Q4csT7VYeuWwRmbulC2nIpEILVq0SF7XaonvRP2VFL3ATm1yU1tGvnLUQoXDYbmT7N69m7q7\nu7MsTd2RnBC76AV2OnTIYDNh9YSIwTpAWVkZbd68WbbatWvXyvVUB0uUa2HlrNzOzlr0AruFEQuO\nRCI0Z84cKisroxUrVlAkEsnyAFpLIBY0ydeJrODq9eBkMonW1la0tLRg8+bNuHbtmuW8tK6jOrnp\nXO+6ayqVgiAImJqawkMPPYT29nYcO3YMk5OT8gY65cY6BrslpaurC8CtPVYA8Ouvv9pef0MU0pP+\n/vtvampqonPnzhER0QcffEDvvPOO5R6nZVFubTpX8vLLL8uWuGzZMkqn07qzcL0hxYnhxjUXPTg4\nSK+//vqMghsbG7OWDEYrpNUYbo3HREQTExP09ttvay6PnHC1VnHNRY+Ojs54ak40GsW1a9dw/fp1\ny3mSavc+235ayF3uRjh9+jSWLFmCvr4+PPbYYwgGgxAEQXbfJ0+enPGvW/crFUwhPWnfvn20Y8cO\n+Xs6naba2lq6fv26pR7nxM6KfEuUdDpNu3btonA4TA8//DCdOHFCM536ggQKjCcXgmsu+ptvvqEN\nGzbI38+fP0+PPvqo5QrZ7Y71rhIREb322msUCAQoEAjQli1b6ObNm5p1Ul8uVM6a2RJJKyLlFK4J\nPD4+TkuXLpUnWR9++CFt3bq1oArZiVJgpXeYmpqi/v5+WaRIJJIzD63rxuzDxmPlBEzZCUoikpVM\nJqm1tZWeffZZWr9+PV2+fLmgCpnFSLhRkiR5krRgwQJ6/PHHCQA1NDRknavOT+u6sVJMlkbLup1y\n30UT6LAjjGd03FZuMn/ggQcomUxmpVFe49XKj4nNolW5doCUlAUbIVeFjIpj1EpzpTt37hzde++9\nBIAWLVpEV69e1SxHy/3mqo9bSzctikZgow2VqyPk216TyWTo008/pfLycqqurqbDhw9r5s/yUe7e\n8GJ2bJSiEVgPdSBfqyOoBVWmGxkZoZaWFgJAa9as0ZwbsHOUouYqy+lLgGbwvcBGdhgacd9aYmQy\nGTpw4ABVVFRQNBqlVatW6QqTKwRpdPOdF/he4Hy7JYisjXMXL16k1tZWAkCrV6+m8fFxXWH0JkR6\n3sFrfC+w1m6JQvnyyy/prrvuonnz5tFXX30lH9cTJp/4fhFUje8FtpO//vqLXnzxRQJAbW1tdOnS\nJcPn+llEPW4bgb/++muqqqqiuXPn0qFDh+SrWH6aEDlBSQqsFO2ff/6hV155hQDQ888/T+vXr/ft\nhMgJil5gLQtkooXDYbrnnnto9uzZ9Pnnn1Mmk8k5IbLjPiA/UvQCa1lgPB6X47wrV66kP/74Q/7N\n6DpZTbG68qIUWC+wceTIEaqpqaE777yT9u7da3iTuTofP69tzVCUAms19tWrV2njxo0EgFasWEFn\nzpwhIuuW5+e1rRmKUmB1Yx87dowefPBBEkWRPv74Y5qenpbT6t3Pq0exCqqmKAVm3LhxgyRJokAg\nQE1NTfT7779npbEqcKng6r5oOzl+/DgaGhqwZ88e9PX14YcffkBdXV1WOvb8R7b3mKODCx0ub49L\npVK0bds2CgaD1NjYSK+++qrpMdbuGbGfZ9hF5aKVz0TeuXMndXV1Wboma/eM2M8z7KJw0epnIre3\nt6O3txd79uyR05h5lJ+tj/9zID/PcKHDZfW48+fPU2NjIwWDQXrrrbcolUrNuJHLyZmun12vUXzv\nooeGhuiJJ56Y8UxkM0sYrds2jQrmZ9drFN8LrMbsTkSlSFrBCz3B83UkP92DlIuiEli5J4oJZUYk\ntWCFWqiyLn6lqARW3xXAxLUqUqHRKm7BFshnwWpBiqGRvcT3yyTlrZdat4eyu+J//vln/9+e6XM8\nETjfe+nj8bj890cffcRFLgBPBM4XREgkEpAkSf5u5qVSHBUuDBmm7g/OdWdfvrS3E74fg3Oh5bop\nxwsZ87l5zi18JbDadeuJWDKxYqdx3qFouxQjLrZUdmDYjavP6GhtbaUXXniBXnrpJTp9+rThCpVC\nTNgrXBmDz549i/7+fuzfvx+Dg4Po7OzEpk2bDJ/PXaw7hKyeGIlEsGvXLlRVVQEA6uvrMT4+jsnJ\nSUQikbznJxIJx599xTEg8NGjR9HZ2Zl1vLe3F6tWrQJwa6bb19eHp556SlPc6elpALcenMYpHNaO\nrF31yCtwc3Mzfvvtt5y/37hxA1u3bsXo6Cj279+vmWZsbAwA0NHRkbdCHOOMjY3hvvvu000TIMqx\n0DTAhQsXsGHDBsyfPx99fX1ZT15lpFIpDA8Po7KyEoIgWC2O8z/T09MYGxtDfX19zjZnWBb4ypUr\naGtrQ1tbG9544w1LFeU4j+VJ1hdffIGLFy9iaGgIQ0ND8vEDBw5g7ty5tlSOUzgFuWiO//FVqJJj\nP5ZdtB0MDg7is88+QyAQwKxZs7B9+3YsXLjQyyohmUwikUhgcnISdXV16O3tRXl5uad1UmK6zZwM\nqelx5swZWrZsmfzQlGQySc3NzV5Vh4iMv6LAK6y0mWcCj4yM0HfffSd/Hx8fpwULFtDExIRXVTL8\nigKvsNJmjrtoOyJhbqH3igI/uOnq6mpUV1cDMN5mjgtsRyTMLTKZjObxYNBfc1EzbeZpzS9cuID2\n9nYIgoCDBw9i9uzZXlYHd999txxWBYBLly6hoqICd9xxh4e1monpNnN84MjB5cuX6cknn6Tdu3d7\nVYUsjL6iwCustJlngY59+/bhk08+QW1t7YzjXkfCjh49ikQigXQ6jZqaGrz//vuYM2eOZ/VRYqXN\neCSrxPHX7IFjO1zgEocLXOJwgUscLnCJwwUucbjAJQ4XuMT5D6iJKNYPmN+3AAAAAElFTkSuQmCC\n", 189 | "text/plain": [ 190 | "" 191 | ] 192 | }, 193 | "metadata": {}, 194 | "output_type": "display_data" 195 | } 196 | ], 197 | "source": [ 198 | "# property of interest\n", 199 | "property_ = 'max_peak'\n", 200 | "lasso_alpha = 5e-2\n", 201 | "\n", 202 | "# format data for property \n", 203 | "log_data, y, seq, df_select = lasso_tools.data_format_all(property_, df_input)\n", 204 | "\n", 205 | "# encode sequences\n", 206 | "X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n", 207 | "X = np.array(X)\n", 208 | "\n", 209 | "# use lasso to limit the input \n", 210 | "coeffs = lasso_tools.lasso_(lasso_alpha, X, y)\n", 211 | "\n", 212 | "# reformat X with only lasso-limited set for GP model\n", 213 | "X_lasso = lasso_tools.lasso_reformat_X(coeffs, X)\n", 214 | "\n", 215 | "# evaluate cross-validation performance of GP model with lasso-limited set\n", 216 | "measured_CV, predicted_CV = lasso_tools.cross_validation(X_lasso, log_data, property_)\n", 217 | "\n", 218 | "# Bayesian ridge regression to find weights\n", 219 | "clf_ff = linear_model.BayesianRidge()\n", 220 | "clf_ff.fit(X_lasso, y)\n", 221 | "weights = clf_ff.coef_\n", 222 | "\n", 223 | "# find features \n", 224 | "df_features = lasso_tools.find_features(df_select, ss, contacts, coeffs, X, weights)\n", 225 | "\n", 226 | "# find the correct numbering for each feature\n", 227 | "df_features_reformat = lasso_tools.refromat_feature_numbering(df_features, df_select, property_, lasso_alpha)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": { 233 | "deletable": true, 234 | "editable": true 235 | }, 236 | "source": [ 237 | "## Off-kinetics" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 11, 243 | "metadata": { 244 | "collapsed": false, 245 | "deletable": true, 246 | "editable": true 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "20-fold corss validation of GP regression model\n", 254 | "R = 0.92\n" 255 | ] 256 | }, 257 | { 258 | "data": { 259 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAHEAAABvCAYAAADboi87AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADSZJREFUeJztnX9oW1UUx78vL2syW9f6o6PTDaeok6VlY6nQH2PVKiVW\n24rOUtb5g41NbVZjU9HhEIfoKqOZFteCKyKUzTmG0CCT0iGNukWE1k1bYRaqo9H+sJ112C1rzcvx\nj+09XtL8eEleXn69DwTSvJt7b9+559xzzr3vhiEigkpKo0l0B1RiRxViGqAKMQ1QhZgGqEJMA7Ry\nVXTt2jWMjIwgPz8fLMvKVW3GwnEcZmZmUFhYCL1eH7KsbEIcGRlBY2OjXNWp3ODYsWMoLi4OWUY2\nIebn5wuNFhQUyFVtxjI1NYXGxkbhvoZCNiHyJrSgoACrV6+Wq9q0pbW1FZ2dnTCbzbDZbEHLSZma\nVMcmQXR2dmJhYQFdXV0x16UKMUGYzWbo9Xo0NTUBuK6Zer0era2tkVdGMuFyuej+++8nl8slV5UZ\nhU6nIwCk1+uJKLL7qWpikuCvmZGgCjFB+JtPm80Gt9sd0skJhirEOBNsrvN3bH777TccPXo0qjYk\nCdFut6O2thZ1dXVoaGjA8PBwVI2lEjE5GiKCeaFi8zk4OIiSkhLYbDZQNMu74SbNsbExKi8vp+np\naSIicjgcVFFRsaRcujk2/o5GtFitVtLr9WS1WgNe7+vro+zsbHrwwQeFe0wU2f0MK0SXy0UDAwPC\n37Ozs2QwGGhhYWFJuXQSYribLwc9PT2k1WrJZDLRv//+63NNViGK8Xq91NraSs3NzUuupZsQ5cRq\ntZJOpxMGhNfrpYMHDxIAeu6552hxcXHJd+IixCtXrlBzczM988wzdPny5ZgazTTEppnjOHr11VcJ\nAO3du5e8Xm/A78geJ05MTKChoQEsy6KnpwcrVqyIfPLNYHgnZvfu3di2bRs6OjrQ0dGBtrY2MAwT\newPhpDw3N0cPP/wwffTRRyHLpaIm+pu5eHL58mWqrKykrKwsOnHiRNjysmri8ePHMTk5idOnT6Ou\nrk54zc3NxT6CEoycSehQTE5OYsuWLRgcHERfXx/q6+vlbUCOURbpyEkWlPBAL1y4QGvXrqVVq1bR\n+fPnJX9PzZ1KJJZUVyj4RMG2bdtQXl4OnU4Hp9OJDRs2yNoOT0YLMV7wZvr48eO49957cebMGaxd\nuzZu7alCjAMVFRUAgLvvvhtff/01br/99ri2pwoxBvzzq1arFVqtFv39/dixYwdGR0eRnZ0d/47E\nMmlHOxGnC+Ig3uPxEMMwBIAYhgkaxEtFdWwUQhzE19fXCysQGo0Gr732mqClcq2IBCWm4RLlyElm\npCQAxGXm5uZoy5YtpNPpqKamRghZxFoqfi81wRC3BLhcjSYzUpag+DI6nY4KCwspLy+Pvv32W58y\n4hhU/F7qEpcqxBgwGo0EgIxGY9AyVquVsrKy6Oabb6Y777yThoeHJdcvNcGgCjEGeE0BEPRGnzlz\nhm655RZav349jY+Px6UfqmMTA2azWXgfKKdqt9vx6KOPwmAw4LvvvsOaNWuU7F5AVCH6YbPZYLVa\nA24fPHLkCJ566imYTCb09/fj1ltvTVAv/UiE+icDkSxDeb1e2r9/PwGgF198kTweT9z7p86JEpDi\nJfIOTFFREQGgd955J+YgXirqnCgBKTuuDx8+jMXFRQwPD6O7uxtvvfWWPCvxcpOIkZMIIjGfvAZm\nZ2cTAKqrq4uqnlhQzWkAwplPXjh8nMi/zp49G7AehAhB5EA1pwEIZz75NcChoSHhs+effx5ffPGF\nT94zXAiSEBIxcpINq9VKLMuSRqMhlmWJYRjatWsXEQXWYCW2dWSEJkpZGZC6etDZ2QmO4+D1erF5\n82b8/fffOHLkCABfDebrAxCXbR1Rk4iRIwf+GhLI4Qg2D/qXraysJAB03333kdvtltxmPMkIx8bf\npEVi9sSrEG+++SYBILPZHDaIV8KM8mSEEP3hb7DRaBS0LFg4wH9uMBgIAB04cECxIF4qGSlEHl7L\nWJb1CRXEgpyfn6fq6mpiWZY+/fTTxHU2BBnh2ASDd0T8OXToELRaLTZu3IicnBx89dVXICIcPnxY\nNgcpYSRi5CiB1Wr10UQpr2BzXTTbK2Ilo82pGPE8qdFoCABlZWWRRqMhhmGEmJAXYqhsTqTbK2Il\no82pP//99x/OnTsnmNKLFy8KMaHH40FLSwtYlgXLsiGT4XRjJ1ssR5XEjUSMHKUQ5zk1Gk3Ah2Mj\nqUeJ+JBH1cQblJaWAgAYhkFzc3PUD8cmpfaJkTIqBgYG6IknnqCqqipqbm5eckhApCNHbvydDY7j\n6PXXXycA1NLSQhzHKd6nWJHVsbl06RKVlJTQ77//TkREBw8epLfffjumRuVGbDYtFgs9++yzBIDa\n29sV74tcyCpEu90uZPT5yjdt2rQkw5FoTYRo7tNqtXT06NGQ5ZV6zDtaZJ0Tp6amfE4SLigowPz8\nPK5cuRInAx85NpsNL730EhiGAcuyqK2txc6dO5cE53zQ3tHRochj3koRVoherzfwFzXJ4xONjY3h\n9OnTyM/Px/fff49Tp04FFBK/8AsguR2VCAkriVWrVmFmZkb4e3p6Grm5ubjpppvi2jExvAYVFxcv\nSX8NDQ2hrKwMAOB0OmE0GoN6k/znFosl6vXApEzBhbO3s7OzVFpaKjg27e3ttHfv3phseKSIHReI\n4rX+/n7Kyckho9Hocy5aPEnJjM1tt92GtrY2vPLKK3jssccwOjqKN954I95jywdeg4xGo6Bhx44d\nQ3V1NcrKyuBwOLBy5UpF+5JUpjgRIydW2tvbCQBt376dLBZL0nua0ZA2CXD/x8w4jqNNmzYRACou\nLiaO4xKSElOCtEm78dsHh4aGoNPpsH79evz4448Arv8ijkajSU7zpjBJLUSj0Si8X1xcxK+//orq\n6mro9XoYDAZotVp0dHRgxYoVOHToUNif40lXklqIg4ODmJycFH5qZ+vWrTh16hTcbjdGRkbAcRw4\njsNff/0FAD4bfzOJpBbi6OgoSktLwbIszp07h5MnTwrXzGazsA7Ie6Zizc0kEiJEKQHzDz/8gPLy\ncixbtgxOpxMbN270uW6z2eDxeODxeDA9PQ0iQkVFRfIF4koQb28qULKZ34nGsuySB1i0Wq3P3xqN\nRtiGuHz5cp9rK1eu9Klf6mbhVCCpQoxAN1YsRLFQYnnxe2BCbRZOpTAkqUKMQCGAxWIRcpjh5jGW\nZSU92NnU1BT06Mu0D0PiOXLC7cDmP/d4PLRnzx4CQPv27aOWlhaf68E0TMlt9UqTNOY02PMREJlA\nt9tNW7duJYZhqLKy0sfEsiwrV/dSjqQxp4HMWGdnp/B+x44dMJlM+PLLL3Hy5EmcPXsWHMcJ1zmO\nU+aAu1RH6ZHDm8Bdu3ZRUVER5ebm0jfffCNcQwCHJRUdk1iJRBNl+01hqdhsNly6dAnd3d3IycmB\n0+lEUVGRcA24/hi1wWDAL7/8ImhxV1dX+jomsaL0yHE6nYKW6XQ6uZpPO5JiThTPY8XFxWAYBnl5\neSgrK0N2djZ0Op3PIQYqMRCvkSOexxAmUOczMeLjKVMxyyInSaGJvGf68ssv44477ghZ1u12A/Bd\nhVDq12PSgbgJ0WazYX5+Hm63GxMTE9i/fz9aWlpCZl/E19I+yyIn8VL/q1ev0pNPPkkajYY+/vjj\nkN9N58xLtCRFxubEiROk1+upt7dXriYyiqSIE59++mlUVVUhLy8vXk2o3CBucyLLsqoAFUI2TeRz\nnlNTU3JVmdHw91GcSw6GbELkn9dobGyUq0oVXL+vd911V8gyDNGNEwVi5Nq1axgZGUF+fj5YlpWj\nyoyG4zjMzMygsLAw4Lk8YmQTokriSOotiyrSUIWYBqhCTAMUXxSWA4fDAZvNhsXFRaxbtw4HDhxA\nTk5OxGWUxm6345NPPgHDMFi+fDn27dsnLIjzvP/+++jr60Nubi6A6z9p++GHH4auOM7ZI9mRciSL\n1GNblGRsbIzKy8uFJ5odDgdVVFQsKVdfX09DQ0MR1Z1yQpRyJIvUY1uUxOVy0cDAgPD37OwsGQwG\nWlhYED5bWFigwsJCMpvNVFNTQ3v27KE///wzbN0pNydKOZIlGY9tWb16NR566CEA1w/7a2trQ2Vl\nJbKysoQy09PTKCkpgdVqhd1ux4YNG9DU1CQcDhiMlBOilCNZkvnYlqtXr8JisWB8fBzvvvuuz7U1\na9agu7sb99xzDxiGwc6dOzE+Po4//vgjZJ2J/68iRMqRLMlwbEsgJiYm0NDQAJZl0dPTs+TAwAsX\nLqC3t9fnMyLCsmXLQtabckLcvHkzfvrpJ1y8eBEA8Pnnn+ORRx6JuIzS/PPPP9i+fTuqqqrwwQcf\nBEylaTQavPfee3C5XACAzz77DOvWrfOZGgIi+wyuAA6Hg2pqashkMtHu3btpbm6Ofv75Z6qtrQ1Z\nJpF0dXXRAw88QLW1tT6v8+fP+/S7t7eXHn/8cTKZTPTCCy9IcmzU3GkakHLmVGUpqhDTAFWIaYAq\nxDRAFWIaoAoxDVCFmAaoQkwD/geodBJbOZ0P5wAAAABJRU5ErkJggg==\n", 260 | "text/plain": [ 261 | "" 262 | ] 263 | }, 264 | "metadata": {}, 265 | "output_type": "display_data" 266 | } 267 | ], 268 | "source": [ 269 | "# property of interest\n", 270 | "property_ = 'kinetics_off'\n", 271 | "lasso_alpha = 3e-2\n", 272 | "\n", 273 | "# format data for property \n", 274 | "log_data, y, seq, df_select = lasso_tools.data_format_all(property_, df_input)\n", 275 | "\n", 276 | "# encode sequences\n", 277 | "X = encoding.one_hot_(df_select['seq'].values, ss, contacts)\n", 278 | "X = np.array(X)\n", 279 | "\n", 280 | "# use lasso to limit the input \n", 281 | "coeffs = lasso_tools.lasso_(lasso_alpha, X, y)\n", 282 | "\n", 283 | "# reformat X with only lasso-limited set for GP model\n", 284 | "X_lasso = lasso_tools.lasso_reformat_X(coeffs, X)\n", 285 | "\n", 286 | "# evaluate cross-validation performance of GP model with lasso-limited set\n", 287 | "measured_CV, predicted_CV = lasso_tools.cross_validation(X_lasso, log_data, property_)\n", 288 | "\n", 289 | "# Bayesian ridge regression to find weights\n", 290 | "clf_ff = linear_model.BayesianRidge()\n", 291 | "clf_ff.fit(X_lasso, y)\n", 292 | "weights = clf_ff.coef_\n", 293 | "\n", 294 | "# find features \n", 295 | "df_features = lasso_tools.find_features(df_select, ss, contacts, coeffs, X, weights)\n", 296 | "\n", 297 | "# find the correct numbering for each feature\n", 298 | "df_features_reformat = lasso_tools.refromat_feature_numbering(df_features, df_select, property_, lasso_alpha)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": true, 306 | "deletable": true, 307 | "editable": true 308 | }, 309 | "outputs": [], 310 | "source": [] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python 2", 316 | "language": "python", 317 | "name": "python2" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.6" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 2 334 | } 335 | -------------------------------------------------------------------------------- /regression/GP_tools.py: -------------------------------------------------------------------------------- 1 | ## Tools for GP 2 | 3 | from __future__ import division 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | import os 8 | import pandas as pd 9 | import pickle 10 | from scipy.linalg import cho_solve 11 | 12 | 13 | # ML imports 14 | from sklearn import linear_model 15 | from sklearn.cross_validation import train_test_split 16 | from sklearn.model_selection import LeaveOneOut 17 | from sklearn.metrics.pairwise import euclidean_distances 18 | from scipy.spatial import distance 19 | from scipy import optimize, linalg 20 | import scipy 21 | 22 | def matern_5_2_kernel(X, X_, hypers): 23 | """ Calculate the Matern kernel between X and X_. 24 | Parameters: 25 | X (np.ndarray): 26 | X_ (np.ndarray) 27 | hypers (iterable): default is ell=1.0. 28 | Returns: 29 | K (np.ndarray) 30 | """ 31 | D = euclidean_distances(X, X_) 32 | D_L = D / hypers[0] 33 | 34 | first = (1.0 + np.sqrt(5.0) * D_L) + 5.0 * D_L ** 2 / 3.0 35 | second = np.exp(-np.sqrt(5.0) * D_L) 36 | 37 | K = first * second 38 | return K 39 | 40 | def predict_GP(X_train, y_train, X_test, prams): 41 | """ Gaussian process regression predictions. 42 | Parameters: 43 | X_train (np.ndarray): n x d training inputs 44 | y_train (np.ndarray): n training observations 45 | X_test (np.ndarray): m x d points to predict 46 | Returns: 47 | mu (np.ndarray): m predicted means 48 | var (np.ndarray): m predictive variances 49 | """ 50 | 51 | K = matern_5_2_kernel(X_train, X_train, prams[1:]) 52 | Ky = K + np.identity(len(K))*prams[0] 53 | 54 | # To invert K_y we use the Cholesky decomposition (L) 55 | L = np.linalg.cholesky(Ky) 56 | 57 | # solve for z=L^-1y 58 | z = linalg.solve_triangular(L, y_train, lower=True) 59 | alpha = linalg.solve_triangular(L.T, z, lower=False) 60 | 61 | K_star = matern_5_2_kernel(X_train, X_test, prams[1:]) 62 | mu = np.matmul(K_star.T, alpha) 63 | 64 | 65 | # Compute the variance at the test points 66 | z = linalg.solve_triangular(L, K_star, lower=True) 67 | alpha = linalg.solve_triangular(L.T, z, lower=False) 68 | K_star_star = matern_5_2_kernel(X_test, X_test, prams[1:]) 69 | v = np.diag(K_star_star) - np.dot(K_star.T, alpha) 70 | v = np.diag(v) 71 | return mu, v 72 | 73 | def neg_log_marg_likelihood(log_prams, X, y): 74 | """ Calculate the negative log marginal likelihood loss. 75 | We pass the log hypers here because it makes the optimization 76 | more stable. 77 | Parameters: 78 | log_hypers (np.ndarray): natural log of the hyper-parameters 79 | X (np.ndarray) 80 | y (np.ndarray) 81 | Returns: 82 | (float) The negative log marginal likelihood. 83 | """ 84 | 85 | non_log_prams = np.exp(log_prams) 86 | #print(non_log_prams) 87 | 88 | # Evaluate kernel on training data 89 | K = matern_5_2_kernel(X, X, non_log_prams[1:]) 90 | 91 | # To invert K we use the Cholesky decomposition (L), because symmetric and positive definite 92 | n = len(y) 93 | Ky = K + np.identity(len(K))*non_log_prams[0] 94 | L = np.linalg.cholesky(Ky) 95 | z = linalg.solve_triangular(L, y, lower=True) 96 | alpha = linalg.solve_triangular(L.T, z, lower=False) #dont know about this 97 | 98 | first = 0.5 * np.dot(y, alpha) 99 | second = np.sum(np.log(np.diag(L))) 100 | third = 0.5 * len(K) * np.log(2 * np.pi) 101 | 102 | #log_p_y_X = 0.5*np.matmul(y, alpha) + np.sum(np.log(np.diag(L))) + 0.5*n*np.log(2*np.pi) 103 | log_p_y_X = (first + second + third) 104 | return log_p_y_X -------------------------------------------------------------------------------- /regression/__pycache__/.gitignore: -------------------------------------------------------------------------------- 1 | *.cpython-36.pyc 2 | -------------------------------------------------------------------------------- /regression/chimera_tools.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import matplotlib.pyplot as pl 4 | import seaborn as sns 5 | import os 6 | import pandas as pd 7 | import pickle 8 | 9 | 10 | 11 | def chimera_code2seq_convert(file_c,file_n,df_data): 12 | # Load each position file as a df 13 | df_c = pd.read_csv(file_c, sep=' ', 14 | names = ['chimera', 'E', 'm', 'seq']) 15 | df_n = pd.read_csv(file_n, sep=' ', 16 | names = ['chimera', 'E', 'm', 'seq']) 17 | 18 | seq_input = [] 19 | for i in df_data.block_k: 20 | if i[0] == 'c': 21 | seq_input.append(df_c[df_c.chimera == i].seq.values[0]) 22 | elif i[0] == 'n': 23 | seq_input.append(df_n[df_n.chimera == i].seq.values[0]) 24 | 25 | df_data['seq'] = seq_input 26 | 27 | return df_data 28 | 29 | def normalize_(data): 30 | """ 31 | Normalize data by subtracting the mean and dividing by the std 32 | Also, make positive 33 | """ 34 | return (data - np.mean(data)) / np.std(data) 35 | 36 | def un_normalize_(norm_data, data): 37 | """ 38 | Normalize data by subtracting the mean and dividing by the std 39 | Also, make positive 40 | """ 41 | return norm_data*np.std(data) + np.mean(data) -------------------------------------------------------------------------------- /regression/encoding_tools.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import matplotlib.pyplot as plt 3 | import os 4 | import numpy as np 5 | import pandas as pd 6 | import pickle 7 | 8 | 9 | def one_hot_seq(seq_input): 10 | # make amino acid directory 11 | my_dict = {'-':0, 'A':1,'C':2,'D':3,'E':4,'F':5,'G':6,'H':7,'I':8,'K':9,\ 12 | 'L':10,'M':11,'N':12,'P':13,'Q':14,'R':15,'S':16,'T':17,\ 13 | 'V':18,'W':19,'Y':20} 14 | 15 | #print(seq_input[0]) 16 | L = len(seq_input[0]) 17 | n = len(seq_input) 18 | 19 | X = np.zeros((n, len(my_dict)*L)) 20 | 21 | # loop through each sequence and one_hot encode 22 | for i, seq in enumerate(seq_input): 23 | for j, aa in enumerate(seq): 24 | # fine one index that should be '1' 25 | aa_indx = my_dict[aa] 26 | X[i][21*j+aa_indx] = 1 27 | 28 | return X 29 | 30 | def one_hot_contacts(seq_input, ss, contacts): 31 | # make contact directory 32 | my_contact_dict = {} 33 | AAs = ['-','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S',\ 34 | 'T','V','W','Y'] 35 | count = 0 36 | for k in AAs: 37 | for j in AAs: 38 | my_contact_dict[(k, j)] = count 39 | count += 1 40 | 41 | n = len(seq_input) 42 | 43 | X = np.zeros((n, len(my_contact_dict)*len(contacts))) 44 | 45 | # loop through each sequence and one_hot encode contacts 46 | for i, seq in enumerate(seq_input): 47 | for p, (j, k) in enumerate(contacts): 48 | # find the contact 49 | contact_index = my_contact_dict[(seq[j],seq[k])] 50 | X[i][len(my_contact_dict)*p+contact_index] = 1 51 | 52 | return X 53 | 54 | def one_hot_(seq_input, ss, contacts): 55 | # reshape to contain both contact and sequence info 56 | X_seq = one_hot_seq(seq_input) 57 | X_contact = one_hot_contacts(seq_input, ss, contacts) 58 | 59 | X = [] 60 | for i, x in enumerate(X_seq): 61 | x_c = X_contact[i] 62 | X.append(np.concatenate((x, x_c))) 63 | return X 64 | 65 | -------------------------------------------------------------------------------- /regression/inputs/lit_alignment_and_contacts_pro2.pkl: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:39f67bf7044c78b7884aafe40df15ce515adc9302772f3cd927767f9efe3b5b9 3 | size 19286 4 | -------------------------------------------------------------------------------- /regression/lasso_tools.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import seaborn as sns 3 | import os 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import pickle 8 | 9 | 10 | # ML imports 11 | from sklearn import linear_model 12 | from scipy import optimize 13 | import scipy 14 | from sklearn.model_selection import KFold 15 | 16 | # custom imports 17 | import encoding_tools as encoding 18 | import GP_tools as GP 19 | 20 | 21 | def data_format_all(property_, df): 22 | # remove ChR_29_10 & ChR_30_10 for kinetics and spectra because currents too low for accurate measurements 23 | if property_ == 'green_norm' or property_ == 'kinetics_off': 24 | df = df[df.chimera != 'ChR_29_10'] 25 | df = df[df.chimera != 'ChR_30_10'] 26 | 27 | # make a seperate dataframe for the selected property 28 | df_select = pd.DataFrame() 29 | df_select['prop'] = df[str(property_)] 30 | df_select['seq'] = df['seq'] 31 | df_select['block_k'] = df['block_k'] 32 | df_select['chimera'] = df['chimera'] 33 | df_select.dropna(inplace=True) 34 | 35 | # normalize training data 36 | log_data = np.log(df_select.prop.values) 37 | y = (log_data - np.mean(log_data))/np.std(log_data) 38 | seq = df_select.seq.values 39 | 40 | return log_data, y, seq, df_select 41 | 42 | def cross_validation(X, log_data, property_): 43 | path_outputs = 'outputs/' 44 | 45 | kf = KFold(n_splits=20) # Define the split 46 | kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator 47 | 48 | mu_s = [] 49 | var_s = [] 50 | y_s = [] 51 | 52 | for train_index, test_index in kf.split(X): 53 | X_train, X_test = X[train_index], X[test_index] 54 | 55 | log_data_train, log_data_test = log_data[train_index], log_data[test_index] 56 | 57 | y_train = (log_data_train - np.mean(log_data_train))/np.std(log_data_train) 58 | y_test = (log_data_test - np.mean(log_data_train))/np.std(log_data_train) 59 | 60 | initial_guess = [0.1,10] 61 | 62 | # take the log of the initial guess for optimiziation 63 | initial_guess_log = np.log(initial_guess) 64 | 65 | # optimize to fit model 66 | result = scipy.optimize.minimize(GP.neg_log_marg_likelihood, initial_guess_log, args=(X_train,y_train), method='L-BFGS-B')#, 67 | 68 | # next set of hyper prams 69 | prams_me = [np.exp(result.x[0])**2, np.exp(result.x[1])] 70 | 71 | # next used trained GP model to predict on test data 72 | mu, var = GP.predict_GP(X_train, y_train, X_test, prams_me) 73 | 74 | # append 75 | mu_s.append(mu) 76 | var_s.append(var) 77 | y_s.append(y_test) 78 | 79 | # reformat all 80 | y_s_all = [j for i in y_s for j in i] 81 | mu_s_all = [j for i in mu_s for j in i] 82 | 83 | # plot results 84 | plt.figure('GP test set', figsize=(1.5, 1.5)) 85 | plt.plot(y_s_all, mu_s_all, 'o', color='k', ms=3) 86 | 87 | # calc correlation 88 | measured = y_s_all 89 | predicted = mu_s_all 90 | 91 | par = np.polyfit(measured, predicted, 1, full=True) 92 | slope=par[0][0] 93 | intercept=par[0][1] 94 | 95 | # calc correlation 96 | variance = np.var(predicted) 97 | residuals = np.var([(slope*xx + intercept - yy) for xx,yy in zip(measured, predicted)]) 98 | Rsqr = np.round(1-residuals/variance, decimals=2) 99 | 100 | print('20-fold corss validation of GP regression model') 101 | print('R = %0.2f'% np.sqrt(Rsqr)) 102 | 103 | max_x = np.max(y_s_all) 104 | min_x = np.min(y_s_all) 105 | 106 | plt.plot([min_x, max_x], [slope*min_x+intercept, slope*max_x+intercept], '-', color='k') 107 | plt.savefig(path_outputs + str(property_)+'_matern_kernel_LASSO_CV.pdf', bbox_inches='tight', transparent=True) 108 | plt.show() 109 | return measured, predicted 110 | 111 | def lasso_(alpha_, X, y): 112 | """ import alpha and full X matrix and y to give limited feature set""" 113 | clf = linear_model.Lasso(alpha=alpha_) 114 | 115 | # fit model with training data 116 | clf.fit(X,y) 117 | 118 | # get the coeff for the input to the next model 119 | lasso_coeff = clf.coef_ 120 | return lasso_coeff 121 | 122 | def lasso_reformat_X(lasso_coeff, X): 123 | X_lasso = [] 124 | for x in X: 125 | X_lasso.append(x[lasso_coeff != 0]) 126 | return np.array(X_lasso) 127 | 128 | 129 | def id_sequence_features(index_seq, seqs): 130 | # make sequence directory 131 | my_dict = {'-':0, 'A':1,'C':2,'D':3,'E':4,'F':5,'G':6,'H':7,'I':8,'K':9,\ 132 | 'L':10,'M':11,'N':12,'P':13,'Q':14,'R':15,'S':16,'T':17,\ 133 | 'V':18,'W':19,'Y':20} 134 | 135 | # make a vector with amino acids filling 136 | L = len(seqs[0]) 137 | 138 | seq_key = np.chararray(len(my_dict)*L) 139 | 140 | 141 | for j in range(L): 142 | for k, v in my_dict.items(): 143 | seq_key[j*len(my_dict)+v] = k 144 | 145 | amino_acid_numb = np.floor(index_seq / len(my_dict)) 146 | amino_acid = seq_key[index_seq] 147 | # print(str(amino_acid)+str(int(amino_acid_numb))) 148 | aa_numb=amino_acid_numb 149 | aa=amino_acid 150 | return aa_numb, aa 151 | 152 | 153 | def id_contact_features(index_contacts, contacts): 154 | # make contact directory 155 | my_contact_dict = {} 156 | AAs = ['-','A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y'] 157 | count = 0 158 | for k in AAs: 159 | for j in AAs: 160 | my_contact_dict[(k, j)] = count 161 | count += 1 162 | 163 | contact_key = np.chararray((len(my_contact_dict), 2)) 164 | 165 | for k, v in my_contact_dict.items(): 166 | contact_key[v][0] = k[0] 167 | contact_key[v][1] = k[1] 168 | 169 | contact_numbers = [] 170 | contact_amino_acids = [] 171 | 172 | contact_numb = int(np.floor(index_contacts / len(my_contact_dict))) 173 | contact_numbers = contacts[contact_numb] 174 | amino_acid_contact = contact_key[index_contacts % len(my_contact_dict)] 175 | contact_amino_acids = [amino_acid_contact[0], amino_acid_contact[1]] 176 | return [amino_acid_contact[0], amino_acid_contact[1]], contacts[contact_numb] 177 | 178 | def unique_columns2(data): 179 | """ 180 | Identify co-varying columns 181 | """ 182 | dt = np.dtype((np.void, data.dtype.itemsize * data.shape[0])) 183 | dataf = np.asfortranarray(data).view(dt) 184 | u,uind = np.unique(dataf, return_inverse=True) 185 | u = u.view(data.dtype).reshape(-1,data.shape[0]).T 186 | return (u,uind) 187 | 188 | def find_features(df, ss, contacts, coeffs, X, weights): 189 | # re make the sequence and contact X 190 | X_seq = encoding.one_hot_seq(df['seq'].values) 191 | X_contact = encoding.one_hot_contacts(df['seq'].values, ss, contacts) 192 | X_seq = np.array(X_seq) 193 | X_contact = np.array(X_contact) 194 | 195 | # find the non-zeros features 196 | index_not_zero = np.where(coeffs != 0)[0] 197 | 198 | # find the co-varying 199 | u,uind = unique_columns2(X) 200 | vects_covary = uind[index_not_zero] 201 | lim_set = list(set(vects_covary)) 202 | 203 | # make dataframe of lasso features with weights 204 | df_lasso_features = pd.DataFrame() 205 | df_lasso_features['index_not_zero'] = index_not_zero 206 | df_lasso_features['vects_covary'] = vects_covary 207 | df_lasso_features['weights'] = weights 208 | 209 | # of the lasso limited set, remove replicated values and ID the covarying set of features for each lasso-limited set 210 | co_vary_lim_set = [] 211 | co_vary_lim_set_weights = [] 212 | for i in lim_set: 213 | j, = np.where(uind == i) 214 | co_vary_lim_set.append(j) 215 | w = df_lasso_features[df_lasso_features.vects_covary == i].weights.values[0] 216 | co_vary_lim_set_weights.append(w) 217 | 218 | # build list of all features, seperate sequence vs contact features 219 | all_features = [] 220 | feature_type = [] 221 | weights_ = [] 222 | aa_ = [] 223 | for ind, i in enumerate(co_vary_lim_set): 224 | for j in i: 225 | if j < np.shape(X_seq)[1]: 226 | aa_number, aa = id_sequence_features(j, df['seq'].values) 227 | all_features.append(aa_number) 228 | aa_.append(aa) 229 | weights_.append(co_vary_lim_set_weights[ind]) 230 | feature_type.append('seq') 231 | 232 | elif j > np.shape(X_seq)[1]: 233 | j_contact = j - np.shape(X_seq)[1] 234 | contact_aas, contact_pos = id_contact_features(j_contact, contacts) 235 | all_features.append(contact_pos) 236 | aa_.append(contact_aas) 237 | weights_.append(co_vary_lim_set_weights[ind]) 238 | feature_type.append('contact') 239 | 240 | df_features = pd.DataFrame(dtype=object) 241 | df_features['weights'] = weights_ 242 | df_features['feature'] = all_features 243 | df_features['type'] = feature_type 244 | df_features['aa'] = aa_ 245 | 246 | # define different co-varying groups 247 | groups_ = list(set(df_features.weights)) 248 | group_number = range(len(groups_)) 249 | 250 | feature_by_group = [] 251 | for i in df_features.weights: 252 | feature_by_group.append(groups_.index(i)) 253 | df_features['feature_group'] = feature_by_group 254 | return df_features 255 | 256 | def refromat_feature_numbering(df_features, df_select, property_, lasso_alpha): 257 | C1C2_seq = df_select[df_select.chimera == 'C1C2'].seq.values[0] 258 | CheRiff_seq = df_select[df_select.chimera == 'CheRiff'].seq.values[0] 259 | CsChrim_seq = df_select[df_select.chimera == 'CsChrim'].seq.values[0] 260 | 261 | C1C2_seq_numb = range(len(C1C2_seq)) 262 | CheRiff_seq_numb = range(len(CheRiff_seq)) 263 | CsChrim_seq_numb = range(len(CsChrim_seq)) 264 | 265 | # C1C2 numbering: first drop the gaps in the alignment sequence, but keep proper index 266 | seq_numb_mod = [] 267 | C1C2_seq_mod = [] 268 | for ind,i in enumerate(C1C2_seq): 269 | if i != '-': 270 | seq_numb_mod.append(ind) 271 | C1C2_seq_mod.append(i) 272 | 273 | gaps = ['-']*49 274 | C1C2_seq_mod = ''.join(gaps+C1C2_seq_mod) 275 | C1C2_seq_numb_mod = [-1]*49 + seq_numb_mod 276 | 277 | # CheRiff numbering: first drop the gaps in the alignment sequence, but keep proper index 278 | seq_numb_mod = [] 279 | CheRiff_seq_mod = [] 280 | for ind,i in enumerate(CheRiff_seq): 281 | if ind == 22: 282 | seq_numb_mod.append(ind) 283 | CheRiff_seq_mod.append(i) 284 | elif ind == 23: 285 | seq_numb_mod.append(ind) 286 | CheRiff_seq_mod.append(i) 287 | elif ind == 24: 288 | seq_numb_mod.append(ind) 289 | CheRiff_seq_mod.append(i) 290 | elif i != '-': 291 | seq_numb_mod.append(ind) 292 | CheRiff_seq_mod.append(i) 293 | 294 | gaps = ['-']*74 295 | CheRiff_seq_mod = ''.join(gaps+CheRiff_seq_mod) 296 | CheRiff_seq_numb_mod = [-1]*74 + seq_numb_mod 297 | 298 | # CsChrim numbering: first drop the gaps in the alignment sequence, but keep proper index 299 | seq_numb_mod = [] 300 | CsChrim_seq_mod = [] 301 | for ind,i in enumerate(CsChrim_seq): 302 | if ind == 22: 303 | seq_numb_mod.append(ind) 304 | CsChrim_seq_mod.append(i) 305 | elif i != '-': 306 | seq_numb_mod.append(ind) 307 | CsChrim_seq_mod.append(i) 308 | 309 | gaps = ['-']*47 310 | CsChrim_seq_mod = ''.join(gaps+CsChrim_seq_mod) 311 | CsChrim_seq_numb_mod = [-1]*47 + seq_numb_mod 312 | 313 | path_outputs = 'outputs/' 314 | 315 | # go through sequence/contact features and adjust numbering for plotting on 3ug9.pdb 316 | feature_adjust = [] 317 | aa_feature_adjust = [] 318 | for ind, i in enumerate(df_features.feature): 319 | if df_features.type[ind] == 'seq': 320 | feature_adjust.append(C1C2_seq_numb_mod.index(i)) 321 | aa_feature_adjust.append(C1C2_seq_mod[C1C2_seq_numb_mod.index(i)]) 322 | else: 323 | feature_adjust.append([C1C2_seq_numb_mod.index(i[0]), C1C2_seq_numb_mod.index(i[1])]) 324 | aa_feature_adjust.append([C1C2_seq_mod[C1C2_seq_numb_mod.index(i[0])], 325 | C1C2_seq_mod[C1C2_seq_numb_mod.index(i[1])]]) 326 | df_features['C1C2_features_adjust'] = feature_adjust 327 | df_features['C1C2_aa_adjust'] = aa_feature_adjust 328 | 329 | # go through sequence/contact features and adjust numbering for CheRiff parent 330 | feature_adjust = [] 331 | aa_feature_adjust = [] 332 | for ind, i in enumerate(df_features.feature): 333 | if df_features.type[ind] == 'seq': 334 | feature_adjust.append(CheRiff_seq_numb_mod.index(i)) 335 | aa_feature_adjust.append(CheRiff_seq_mod[CheRiff_seq_numb_mod.index(i)]) 336 | else: 337 | feature_adjust.append([CheRiff_seq_numb_mod.index(i[0]), CheRiff_seq_numb_mod.index(i[1])]) 338 | aa_feature_adjust.append([CheRiff_seq_mod[CheRiff_seq_numb_mod.index(i[0])], 339 | CheRiff_seq_mod[CheRiff_seq_numb_mod.index(i[1])]]) 340 | 341 | df_features['CheRiff_features_adjust'] = feature_adjust 342 | df_features['CheRiff_aa_adjust'] = aa_feature_adjust 343 | 344 | # go through sequence/contact features and adjust numbering for CsChrim parent 345 | feature_adjust = [] 346 | aa_feature_adjust = [] 347 | for ind, i in enumerate(df_features.feature): 348 | if df_features.type[ind] == 'seq': 349 | feature_adjust.append(CsChrim_seq_numb_mod.index(i)) 350 | aa_feature_adjust.append(CsChrim_seq_mod[CsChrim_seq_numb_mod.index(i)]) 351 | else: 352 | feature_adjust.append([CsChrim_seq_numb_mod.index(i[0]), CsChrim_seq_numb_mod.index(i[1])]) 353 | aa_feature_adjust.append([CsChrim_seq_mod[CsChrim_seq_numb_mod.index(i[0])], 354 | CsChrim_seq_mod[CsChrim_seq_numb_mod.index(i[1])]]) 355 | df_features['CsChrim_features_adjust'] = feature_adjust 356 | df_features['CsChrim_aa_adjust'] = aa_feature_adjust 357 | 358 | df_features.to_csv(path_outputs+'matern_'+ str(property_) +'_' + str(lasso_alpha) + '_LASSO.csv') 359 | return df_features -------------------------------------------------------------------------------- /regression/outputs/green_norm_matern_kernel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/green_norm_matern_kernel.pdf -------------------------------------------------------------------------------- /regression/outputs/green_norm_matern_kernel_CV_fig1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/green_norm_matern_kernel_CV_fig1.pdf -------------------------------------------------------------------------------- /regression/outputs/green_norm_matern_kernel_LASSO_CV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/green_norm_matern_kernel_LASSO_CV.pdf -------------------------------------------------------------------------------- /regression/outputs/kinetics_off_matern_kernel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/kinetics_off_matern_kernel.pdf -------------------------------------------------------------------------------- /regression/outputs/kinetics_off_matern_kernel_CV_fig1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/kinetics_off_matern_kernel_CV_fig1.pdf -------------------------------------------------------------------------------- /regression/outputs/kinetics_off_matern_kernel_LASSO_CV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/kinetics_off_matern_kernel_LASSO_CV.pdf -------------------------------------------------------------------------------- /regression/outputs/matern_green_norm_0.025_LASSO.csv: -------------------------------------------------------------------------------- 1 | ,weights,feature,type,aa,feature_group,C1C2_features_adjust,C1C2_aa_adjust,CheRiff_features_adjust,CheRiff_aa_adjust,CsChrim_features_adjust,CsChrim_aa_adjust 2 | 0,-0.0477001478388,"(161, 197)",contact,"['D', 'T']",3,"[195, 227]","['D', 'T']","[220, 252]","['D', 'T']","[193, 225]","['C', 'M']" 3 | 1,-0.0477001478388,"(164, 197)",contact,"['T', 'T']",3,"[198, 227]","['T', 'T']","[223, 252]","['T', 'T']","[196, 225]","['M', 'M']" 4 | 2,-0.0999688919254,"(164, 190)",contact,"['T', 'G']",14,"[198, 220]","['T', 'G']","[223, 245]","['T', 'G']","[196, 218]","['M', 'S']" 5 | 3,-0.0999688919254,"(170, 186)",contact,"['T', 'F']",14,"[204, 216]","['T', 'F']","[229, 241]","['T', 'F']","[202, 214]","['A', 'L']" 6 | 4,-0.225551909895,"(172, 247)",contact,"['A', 'F']",16,"[206, 269]","['A', 'F']","[231, 294]","['A', 'F']","[204, 267]","['G', 'W']" 7 | 5,-0.106229081439,"(158, 161)",contact,"['L', 'D']",2,"[192, 195]","['L', 'D']","[217, 220]","['L', 'D']","[190, 193]","['I', 'C']" 8 | 6,-0.0140522194844,158.0,seq,L,11,192,L,217,L,190,I 9 | 7,-0.0140522194844,"(134, 158)",contact,"['P', 'L']",11,"[168, 192]","['P', 'L']","[193, 217]","['P', 'L']","[166, 190]","['P', 'I']" 10 | 8,-0.0140522194844,"(137, 158)",contact,"['L', 'L']",11,"[171, 192]","['L', 'L']","[196, 217]","['L', 'L']","[169, 190]","['L', 'I']" 11 | 9,-0.0140522194844,"(138, 158)",contact,"['I', 'L']",11,"[172, 192]","['I', 'L']","[197, 217]","['I', 'L']","[170, 190]","['I', 'I']" 12 | 10,-0.0140522194844,"(154, 158)",contact,"['T', 'L']",11,"[188, 192]","['T', 'L']","[213, 217]","['T', 'L']","[186, 190]","['T', 'I']" 13 | 11,-0.0140522194844,"(155, 158)",contact,"['M', 'L']",11,"[189, 192]","['M', 'L']","[214, 217]","['M', 'L']","[187, 190]","['M', 'I']" 14 | 12,-0.0140522194844,"(157, 158)",contact,"['L', 'L']",11,"[191, 192]","['L', 'L']","[216, 217]","['L', 'L']","[189, 190]","['L', 'I']" 15 | 13,-0.0140522194844,"(158, 159)",contact,"['L', 'V']",11,"[192, 193]","['L', 'V']","[217, 218]","['L', 'V']","[190, 191]","['I', 'V']" 16 | 14,-0.0140522194844,"(158, 160)",contact,"['L', 'S']",11,"[192, 194]","['L', 'S']","[217, 219]","['L', 'S']","[190, 192]","['I', 'S']" 17 | 15,-0.00115578746512,177.0,seq,G,9,210,G,235,G,208,D 18 | 16,-0.00115578746512,179.0,seq,V,9,212,V,237,V,210,L 19 | 17,-0.00115578746512,186.0,seq,F,9,216,F,241,F,214,L 20 | 18,-0.00115578746512,190.0,seq,G,9,220,G,245,G,218,S 21 | 19,-0.00115578746512,191.0,seq,L,9,221,L,246,L,219,C 22 | 20,-0.00115578746512,"(171, 186)",contact,"['A', 'F']",9,"[205, 216]","['A', 'F']","[230, 241]","['A', 'F']","[203, 214]","['A', 'L']" 23 | 21,-0.00115578746512,"(177, 179)",contact,"['G', 'V']",9,"[210, 212]","['G', 'V']","[235, 237]","['G', 'V']","[208, 210]","['D', 'L']" 24 | 22,-0.00115578746512,"(179, 186)",contact,"['V', 'F']",9,"[212, 216]","['V', 'F']","[237, 241]","['V', 'F']","[210, 214]","['L', 'L']" 25 | 23,-0.00115578746512,"(186, 190)",contact,"['F', 'G']",9,"[216, 220]","['F', 'G']","[241, 245]","['F', 'G']","[214, 218]","['L', 'S']" 26 | 24,-0.00115578746512,"(190, 191)",contact,"['G', 'L']",9,"[220, 221]","['G', 'L']","[245, 246]","['G', 'L']","[218, 219]","['S', 'C']" 27 | 25,-0.00115578746512,"(190, 193)",contact,"['G', 'Y']",9,"[220, 223]","['G', 'Y']","[245, 248]","['G', 'Y']","[218, 221]","['S', 'Y']" 28 | 26,-0.00115578746512,"(190, 194)",contact,"['G', 'G']",9,"[220, 224]","['G', 'G']","[245, 249]","['G', 'G']","[218, 222]","['S', 'G']" 29 | 27,-0.00115578746512,"(191, 193)",contact,"['L', 'Y']",9,"[221, 223]","['L', 'Y']","[246, 248]","['L', 'Y']","[219, 221]","['C', 'Y']" 30 | 28,-0.00115578746512,"(191, 194)",contact,"['L', 'G']",9,"[221, 224]","['L', 'G']","[246, 249]","['L', 'G']","[219, 222]","['C', 'G']" 31 | 29,0.00115578746512,177.0,seq,D,19,210,G,235,G,208,D 32 | 30,0.00115578746512,179.0,seq,L,19,212,V,237,V,210,L 33 | 31,0.00115578746512,186.0,seq,L,19,216,F,241,F,214,L 34 | 32,0.00115578746512,188.0,seq,I,19,218,L,243,C,216,I 35 | 33,0.00115578746512,189.0,seq,V,19,219,M,244,I,217,V 36 | 34,0.00115578746512,190.0,seq,S,19,220,G,245,G,218,S 37 | 35,0.00115578746512,191.0,seq,C,19,221,L,246,L,219,C 38 | 36,0.00115578746512,192.0,seq,I,19,222,C,247,V,220,I 39 | 37,0.00115578746512,"(171, 186)",contact,"['A', 'L']",19,"[205, 216]","['A', 'F']","[230, 241]","['A', 'F']","[203, 214]","['A', 'L']" 40 | 38,0.00115578746512,"(174, 179)",contact,"['A', 'L']",19,"[208, 212]","['S', 'V']","[233, 237]","['A', 'V']","[206, 210]","['A', 'L']" 41 | 39,0.00115578746512,"(174, 186)",contact,"['A', 'L']",19,"[208, 216]","['S', 'F']","[233, 241]","['A', 'F']","[206, 214]","['A', 'L']" 42 | 40,0.00115578746512,"(176, 177)",contact,"['T', 'D']",19,"[209, 210]","['K', 'G']","[234, 235]","['T', 'G']","[207, 208]","['T', 'D']" 43 | 41,0.00115578746512,"(176, 179)",contact,"['T', 'L']",19,"[209, 212]","['K', 'V']","[234, 237]","['T', 'V']","[207, 210]","['T', 'L']" 44 | 42,0.00115578746512,"(177, 178)",contact,"['D', 'W']",19,"[210, 211]","['G', 'Y']","[235, 236]","['G', 'W']","[208, 209]","['D', 'W']" 45 | 43,0.00115578746512,"(177, 179)",contact,"['D', 'L']",19,"[210, 212]","['G', 'V']","[235, 237]","['G', 'V']","[208, 210]","['D', 'L']" 46 | 44,0.00115578746512,"(177, 180)",contact,"['D', 'K']",19,"[210, 213]","['G', 'R']","[235, 238]","['G', 'K']","[208, 211]","['D', 'K']" 47 | 45,0.00115578746512,"(177, 184)",contact,"['D', 'W']",19,"[210, 214]","['G', 'V']","[235, 239]","['G', 'W']","[208, 212]","['D', 'W']" 48 | 46,0.00115578746512,"(178, 179)",contact,"['W', 'L']",19,"[211, 212]","['Y', 'V']","[236, 237]","['W', 'V']","[209, 210]","['W', 'L']" 49 | 47,0.00115578746512,"(179, 180)",contact,"['L', 'K']",19,"[212, 213]","['V', 'R']","[237, 238]","['V', 'K']","[210, 211]","['L', 'K']" 50 | 48,0.00115578746512,"(179, 184)",contact,"['L', 'W']",19,"[212, 214]","['V', 'V']","[237, 239]","['V', 'W']","[210, 212]","['L', 'W']" 51 | 49,0.00115578746512,"(179, 185)",contact,"['L', 'L']",19,"[212, 215]","['V', 'I']","[237, 240]","['V', 'L']","[210, 213]","['L', 'L']" 52 | 50,0.00115578746512,"(179, 186)",contact,"['L', 'L']",19,"[212, 216]","['V', 'F']","[237, 241]","['V', 'F']","[210, 214]","['L', 'L']" 53 | 51,0.00115578746512,"(180, 186)",contact,"['K', 'L']",19,"[213, 216]","['R', 'F']","[238, 241]","['K', 'F']","[211, 214]","['K', 'L']" 54 | 52,0.00115578746512,"(184, 186)",contact,"['W', 'L']",19,"[214, 216]","['V', 'F']","[239, 241]","['W', 'F']","[212, 214]","['W', 'L']" 55 | 53,0.00115578746512,"(184, 188)",contact,"['W', 'I']",19,"[214, 218]","['V', 'L']","[239, 243]","['W', 'C']","[212, 216]","['W', 'I']" 56 | 54,0.00115578746512,"(185, 186)",contact,"['L', 'L']",19,"[215, 216]","['I', 'F']","[240, 241]","['L', 'F']","[213, 214]","['L', 'L']" 57 | 55,0.00115578746512,"(185, 188)",contact,"['L', 'I']",19,"[215, 218]","['I', 'L']","[240, 243]","['L', 'C']","[213, 216]","['L', 'I']" 58 | 56,0.00115578746512,"(185, 189)",contact,"['L', 'V']",19,"[215, 219]","['I', 'M']","[240, 244]","['L', 'I']","[213, 217]","['L', 'V']" 59 | 57,0.00115578746512,"(186, 187)",contact,"['L', 'Y']",19,"[216, 217]","['F', 'F']","[241, 242]","['F', 'Y']","[214, 215]","['L', 'Y']" 60 | 58,0.00115578746512,"(186, 188)",contact,"['L', 'I']",19,"[216, 218]","['F', 'L']","[241, 243]","['F', 'C']","[214, 216]","['L', 'I']" 61 | 59,0.00115578746512,"(186, 189)",contact,"['L', 'V']",19,"[216, 219]","['F', 'M']","[241, 244]","['F', 'I']","[214, 217]","['L', 'V']" 62 | 60,0.00115578746512,"(186, 190)",contact,"['L', 'S']",19,"[216, 220]","['F', 'G']","[241, 245]","['F', 'G']","[214, 218]","['L', 'S']" 63 | 61,0.00115578746512,"(187, 188)",contact,"['Y', 'I']",19,"[217, 218]","['F', 'L']","[242, 243]","['Y', 'C']","[215, 216]","['Y', 'I']" 64 | 62,0.00115578746512,"(187, 189)",contact,"['Y', 'V']",19,"[217, 219]","['F', 'M']","[242, 244]","['Y', 'I']","[215, 217]","['Y', 'V']" 65 | 63,0.00115578746512,"(187, 190)",contact,"['Y', 'S']",19,"[217, 220]","['F', 'G']","[242, 245]","['Y', 'G']","[215, 218]","['Y', 'S']" 66 | 64,0.00115578746512,"(187, 191)",contact,"['Y', 'C']",19,"[217, 221]","['F', 'L']","[242, 246]","['Y', 'L']","[215, 219]","['Y', 'C']" 67 | 65,0.00115578746512,"(188, 189)",contact,"['I', 'V']",19,"[218, 219]","['L', 'M']","[243, 244]","['C', 'I']","[216, 217]","['I', 'V']" 68 | 66,0.00115578746512,"(188, 190)",contact,"['I', 'S']",19,"[218, 220]","['L', 'G']","[243, 245]","['C', 'G']","[216, 218]","['I', 'S']" 69 | 67,0.00115578746512,"(188, 191)",contact,"['I', 'C']",19,"[218, 221]","['L', 'L']","[243, 246]","['C', 'L']","[216, 219]","['I', 'C']" 70 | 68,0.00115578746512,"(188, 192)",contact,"['I', 'I']",19,"[218, 222]","['L', 'C']","[243, 247]","['C', 'V']","[216, 220]","['I', 'I']" 71 | 69,0.00115578746512,"(189, 190)",contact,"['V', 'S']",19,"[219, 220]","['M', 'G']","[244, 245]","['I', 'G']","[217, 218]","['V', 'S']" 72 | 70,0.00115578746512,"(189, 191)",contact,"['V', 'C']",19,"[219, 221]","['M', 'L']","[244, 246]","['I', 'L']","[217, 219]","['V', 'C']" 73 | 71,0.00115578746512,"(189, 192)",contact,"['V', 'I']",19,"[219, 222]","['M', 'C']","[244, 247]","['I', 'V']","[217, 220]","['V', 'I']" 74 | 72,0.00115578746512,"(189, 193)",contact,"['V', 'Y']",19,"[219, 223]","['M', 'Y']","[244, 248]","['I', 'Y']","[217, 221]","['V', 'Y']" 75 | 73,0.00115578746512,"(190, 191)",contact,"['S', 'C']",19,"[220, 221]","['G', 'L']","[245, 246]","['G', 'L']","[218, 219]","['S', 'C']" 76 | 74,0.00115578746512,"(190, 192)",contact,"['S', 'I']",19,"[220, 222]","['G', 'C']","[245, 247]","['G', 'V']","[218, 220]","['S', 'I']" 77 | 75,0.00115578746512,"(190, 193)",contact,"['S', 'Y']",19,"[220, 223]","['G', 'Y']","[245, 248]","['G', 'Y']","[218, 221]","['S', 'Y']" 78 | 76,0.00115578746512,"(190, 194)",contact,"['S', 'G']",19,"[220, 224]","['G', 'G']","[245, 249]","['G', 'G']","[218, 222]","['S', 'G']" 79 | 77,0.00115578746512,"(191, 192)",contact,"['C', 'I']",19,"[221, 222]","['L', 'C']","[246, 247]","['L', 'V']","[219, 220]","['C', 'I']" 80 | 78,0.00115578746512,"(191, 193)",contact,"['C', 'Y']",19,"[221, 223]","['L', 'Y']","[246, 248]","['L', 'Y']","[219, 221]","['C', 'Y']" 81 | 79,0.00115578746512,"(191, 194)",contact,"['C', 'G']",19,"[221, 224]","['L', 'G']","[246, 249]","['L', 'G']","[219, 222]","['C', 'G']" 82 | 80,0.00115578746512,"(192, 193)",contact,"['I', 'Y']",19,"[222, 223]","['C', 'Y']","[247, 248]","['V', 'Y']","[220, 221]","['I', 'Y']" 83 | 81,0.00115578746512,"(192, 194)",contact,"['I', 'G']",19,"[222, 224]","['C', 'G']","[247, 249]","['V', 'G']","[220, 222]","['I', 'G']" 84 | 82,0.0837512338557,"(167, 187)",contact,"['F', 'Y']",10,"[201, 217]","['W', 'F']","[226, 242]","['M', 'Y']","[199, 215]","['F', 'Y']" 85 | 83,0.0837512338557,"(170, 174)",contact,"['A', 'A']",10,"[204, 208]","['T', 'S']","[229, 233]","['T', 'A']","[202, 206]","['A', 'A']" 86 | 84,0.0837512338557,"(172, 174)",contact,"['G', 'A']",10,"[206, 208]","['A', 'S']","[231, 233]","['A', 'A']","[204, 206]","['G', 'A']" 87 | 85,0.0837512338557,"(172, 180)",contact,"['G', 'K']",10,"[206, 213]","['A', 'R']","[231, 238]","['A', 'K']","[204, 211]","['G', 'K']" 88 | 86,-0.456801277898,"(202, 237)",contact,"['G', 'Y']",17,"[232, 259]","['A', 'F']","[257, 284]","['G', 'Y']","[230, 257]","['A', 'F']" 89 | 87,-0.339700102915,"(192, 195)",contact,"['C', 'I']",0,"[222, 225]","['C', 'I']","[247, 250]","['V', 'T']","[220, 223]","['I', 'G']" 90 | 88,0.00260809419317,36.0,seq,P,8,85,N,110,L,83,P 91 | 89,0.00260809419317,37.0,seq,G,8,86,A,111,W,84,G 92 | 90,0.00260809419317,40.0,seq,I,8,89,L,114,E,87,I 93 | 91,0.00260809419317,41.0,seq,G,8,90,A,115,T,88,G 94 | 92,0.00260809419317,43.0,seq,Q,8,92,N,117,R,90,Q 95 | 93,0.00260809419317,44.0,seq,V,8,93,I,118,G,91,V 96 | 94,0.00260809419317,45.0,seq,C,8,94,L,119,F,92,C 97 | 95,0.00260809419317,"(34, 36)",contact,"['G', 'P']",8,"[83, 85]","['G', 'N']","[108, 110]","['G', 'L']","[81, 83]","['G', 'P']" 98 | 96,0.00260809419317,"(35, 36)",contact,"['T', 'P']",8,"[84, 85]","['T', 'N']","[109, 110]","['A', 'L']","[82, 83]","['T', 'P']" 99 | 97,0.00260809419317,"(35, 37)",contact,"['T', 'G']",8,"[84, 86]","['T', 'A']","[109, 111]","['A', 'W']","[82, 84]","['T', 'G']" 100 | 98,0.00260809419317,"(36, 37)",contact,"['P', 'G']",8,"[85, 86]","['N', 'A']","[110, 111]","['L', 'W']","[83, 84]","['P', 'G']" 101 | 99,0.00260809419317,"(36, 38)",contact,"['P', 'E']",8,"[85, 87]","['N', 'E']","[110, 112]","['L', 'E']","[83, 85]","['P', 'E']" 102 | 100,0.00260809419317,"(36, 39)",contact,"['P', 'K']",8,"[85, 88]","['N', 'K']","[110, 113]","['L', 'Q']","[83, 86]","['P', 'K']" 103 | 101,0.00260809419317,"(36, 40)",contact,"['P', 'I']",8,"[85, 89]","['N', 'L']","[110, 114]","['L', 'E']","[83, 87]","['P', 'I']" 104 | 102,0.00260809419317,"(37, 38)",contact,"['G', 'E']",8,"[86, 87]","['A', 'E']","[111, 112]","['W', 'E']","[84, 85]","['G', 'E']" 105 | 103,0.00260809419317,"(37, 39)",contact,"['G', 'K']",8,"[86, 88]","['A', 'K']","[111, 113]","['W', 'Q']","[84, 86]","['G', 'K']" 106 | 104,0.00260809419317,"(37, 40)",contact,"['G', 'I']",8,"[86, 89]","['A', 'L']","[111, 114]","['W', 'E']","[84, 87]","['G', 'I']" 107 | 105,0.00260809419317,"(37, 41)",contact,"['G', 'G']",8,"[86, 90]","['A', 'A']","[111, 115]","['W', 'T']","[84, 88]","['G', 'G']" 108 | 106,0.00260809419317,"(38, 40)",contact,"['E', 'I']",8,"[87, 89]","['E', 'L']","[112, 114]","['E', 'E']","[85, 87]","['E', 'I']" 109 | 107,0.00260809419317,"(38, 41)",contact,"['E', 'G']",8,"[87, 90]","['E', 'A']","[112, 115]","['E', 'T']","[85, 88]","['E', 'G']" 110 | 108,0.00260809419317,"(39, 40)",contact,"['K', 'I']",8,"[88, 89]","['K', 'L']","[113, 114]","['Q', 'E']","[86, 87]","['K', 'I']" 111 | 109,0.00260809419317,"(39, 41)",contact,"['K', 'G']",8,"[88, 90]","['K', 'A']","[113, 115]","['Q', 'T']","[86, 88]","['K', 'G']" 112 | 110,0.00260809419317,"(39, 43)",contact,"['K', 'Q']",8,"[88, 92]","['K', 'N']","[113, 117]","['Q', 'R']","[86, 90]","['K', 'Q']" 113 | 111,0.00260809419317,"(40, 41)",contact,"['I', 'G']",8,"[89, 90]","['L', 'A']","[114, 115]","['E', 'T']","[87, 88]","['I', 'G']" 114 | 112,0.00260809419317,"(40, 42)",contact,"['I', 'A']",8,"[89, 91]","['L', 'A']","[114, 116]","['E', 'A']","[87, 89]","['I', 'A']" 115 | 113,0.00260809419317,"(40, 43)",contact,"['I', 'Q']",8,"[89, 92]","['L', 'N']","[114, 117]","['E', 'R']","[87, 90]","['I', 'Q']" 116 | 114,0.00260809419317,"(40, 44)",contact,"['I', 'V']",8,"[89, 93]","['L', 'I']","[114, 118]","['E', 'G']","[87, 91]","['I', 'V']" 117 | 115,0.00260809419317,"(41, 42)",contact,"['G', 'A']",8,"[90, 91]","['A', 'A']","[115, 116]","['T', 'A']","[88, 89]","['G', 'A']" 118 | 116,0.00260809419317,"(41, 43)",contact,"['G', 'Q']",8,"[90, 92]","['A', 'N']","[115, 117]","['T', 'R']","[88, 90]","['G', 'Q']" 119 | 117,0.00260809419317,"(41, 44)",contact,"['G', 'V']",8,"[90, 93]","['A', 'I']","[115, 118]","['T', 'G']","[88, 91]","['G', 'V']" 120 | 118,0.00260809419317,"(41, 45)",contact,"['G', 'C']",8,"[90, 94]","['A', 'L']","[115, 119]","['T', 'F']","[88, 92]","['G', 'C']" 121 | 119,0.00260809419317,"(42, 43)",contact,"['A', 'Q']",8,"[91, 92]","['A', 'N']","[116, 117]","['A', 'R']","[89, 90]","['A', 'Q']" 122 | 120,0.00260809419317,"(42, 44)",contact,"['A', 'V']",8,"[91, 93]","['A', 'I']","[116, 118]","['A', 'G']","[89, 91]","['A', 'V']" 123 | 121,0.00260809419317,"(42, 45)",contact,"['A', 'C']",8,"[91, 94]","['A', 'L']","[116, 119]","['A', 'F']","[89, 92]","['A', 'C']" 124 | 122,0.00260809419317,"(43, 44)",contact,"['Q', 'V']",8,"[92, 93]","['N', 'I']","[117, 118]","['R', 'G']","[90, 91]","['Q', 'V']" 125 | 123,0.00260809419317,"(43, 45)",contact,"['Q', 'C']",8,"[92, 94]","['N', 'L']","[117, 119]","['R', 'F']","[90, 92]","['Q', 'C']" 126 | 124,0.00260809419317,"(43, 46)",contact,"['Q', 'Q']",8,"[92, 95]","['N', 'Q']","[117, 120]","['R', 'Q']","[90, 93]","['Q', 'Q']" 127 | 125,0.00260809419317,"(43, 47)",contact,"['Q', 'W']",8,"[92, 96]","['N', 'W']","[117, 121]","['R', 'W']","[90, 94]","['Q', 'W']" 128 | 126,0.00260809419317,"(44, 45)",contact,"['V', 'C']",8,"[93, 94]","['I', 'L']","[118, 119]","['G', 'F']","[91, 92]","['V', 'C']" 129 | 127,0.00260809419317,"(44, 46)",contact,"['V', 'Q']",8,"[93, 95]","['I', 'Q']","[118, 120]","['G', 'Q']","[91, 93]","['V', 'Q']" 130 | 128,0.00260809419317,"(44, 47)",contact,"['V', 'W']",8,"[93, 96]","['I', 'W']","[118, 121]","['G', 'W']","[91, 94]","['V', 'W']" 131 | 129,0.00260809419317,"(44, 48)",contact,"['V', 'I']",8,"[93, 97]","['I', 'I']","[118, 122]","['G', 'F']","[91, 95]","['V', 'I']" 132 | 130,0.00260809419317,"(45, 46)",contact,"['C', 'Q']",8,"[94, 95]","['L', 'Q']","[119, 120]","['F', 'Q']","[92, 93]","['C', 'Q']" 133 | 131,0.00260809419317,"(45, 47)",contact,"['C', 'W']",8,"[94, 96]","['L', 'W']","[119, 121]","['F', 'W']","[92, 94]","['C', 'W']" 134 | 132,0.00260809419317,"(45, 48)",contact,"['C', 'I']",8,"[94, 97]","['L', 'I']","[119, 122]","['F', 'F']","[92, 95]","['C', 'I']" 135 | 133,0.00260809419317,"(45, 268)",contact,"['C', 'I']",8,"[94, 290]","['L', 'I']","[119, 315]","['F', 'I']","[92, 288]","['C', 'I']" 136 | 134,-0.0852051821975,"(242, 245)",contact,"['M', 'I']",1,"[264, 267]","['M', 'I']","[289, 292]","['M', 'G']","[262, 265]","['S', 'I']" 137 | 135,-0.0852051821975,"(243, 245)",contact,"['F', 'I']",1,"[265, 267]","['F', 'I']","[290, 292]","['F', 'G']","[263, 265]","['Y', 'I']" 138 | 136,-0.146294383456,"(48, 50)",contact,"['F', 'V']",5,"[97, 99]","['I', 'F']","[122, 124]","['F', 'V']","[95, 97]","['I', 'F']" 139 | 137,0.0140522194844,158.0,seq,I,20,192,L,217,L,190,I 140 | 138,0.0140522194844,"(134, 158)",contact,"['P', 'I']",20,"[168, 192]","['P', 'L']","[193, 217]","['P', 'L']","[166, 190]","['P', 'I']" 141 | 139,0.0140522194844,"(137, 158)",contact,"['L', 'I']",20,"[171, 192]","['L', 'L']","[196, 217]","['L', 'L']","[169, 190]","['L', 'I']" 142 | 140,0.0140522194844,"(138, 158)",contact,"['I', 'I']",20,"[172, 192]","['I', 'L']","[197, 217]","['I', 'L']","[170, 190]","['I', 'I']" 143 | 141,0.0140522194844,"(154, 158)",contact,"['T', 'I']",20,"[188, 192]","['T', 'L']","[213, 217]","['T', 'L']","[186, 190]","['T', 'I']" 144 | 142,0.0140522194844,"(155, 158)",contact,"['M', 'I']",20,"[189, 192]","['M', 'L']","[214, 217]","['M', 'L']","[187, 190]","['M', 'I']" 145 | 143,0.0140522194844,"(157, 158)",contact,"['L', 'I']",20,"[191, 192]","['L', 'L']","[216, 217]","['L', 'L']","[189, 190]","['L', 'I']" 146 | 144,0.0140522194844,"(158, 159)",contact,"['I', 'V']",20,"[192, 193]","['L', 'V']","[217, 218]","['L', 'V']","[190, 191]","['I', 'V']" 147 | 145,0.0140522194844,"(158, 160)",contact,"['I', 'S']",20,"[192, 194]","['L', 'S']","[217, 219]","['L', 'S']","[190, 192]","['I', 'S']" 148 | 146,-0.0903677077125,"(50, 52)",contact,"['V', 'L']",18,"[99, 101]","['F', 'L']","[124, 126]","['V', 'L']","[97, 99]","['F', 'I']" 149 | 147,-0.0903677077125,"(50, 53)",contact,"['V', 'S']",18,"[99, 102]","['F', 'S']","[124, 127]","['V', 'S']","[97, 100]","['F', 'A']" 150 | 148,-0.0903677077125,"(50, 54)",contact,"['V', 'A']",18,"[99, 103]","['F', 'A']","[124, 128]","['V', 'A']","[97, 101]","['F', 'I']" 151 | 149,-0.0903677077125,"(53, 91)",contact,"['S', 'L']",18,"[102, 130]","['S', 'M']","[127, 155]","['S', 'L']","[100, 128]","['A', 'V']" 152 | 150,-0.0903677077125,"(54, 91)",contact,"['A', 'L']",18,"[103, 130]","['A', 'M']","[128, 155]","['A', 'L']","[101, 128]","['I', 'V']" 153 | 151,-0.0325495757327,50.0,seq,V,21,99,F,124,V,97,F 154 | 152,-0.0325495757327,62.0,seq,W,21,111,Y,136,W,109,F 155 | 153,-0.0325495757327,63.0,seq,H,21,112,Q,137,H,110,S 156 | 154,-0.0325495757327,65.0,seq,Y,21,114,W,139,Y,112,W 157 | 155,-0.0325495757327,68.0,seq,S,21,117,T,142,S,115,T 158 | 156,-0.0325495757327,69.0,seq,V,21,118,C,143,V,116,C 159 | 157,-0.0325495757327,88.0,seq,S,21,127,T,152,S,125,C 160 | 158,-0.0325495757327,91.0,seq,L,21,130,M,155,L,128,V 161 | 159,-0.0325495757327,99.0,seq,Y,21,138,F,163,Y,136,F 162 | 160,-0.0325495757327,100.0,seq,F,21,139,H,164,F,137,K 163 | 161,-0.0325495757327,105.0,seq,T,21,142,D,167,T,140,S 164 | 162,-0.0325495757327,"(46, 50)",contact,"['Q', 'V']",21,"[95, 99]","['Q', 'F']","[120, 124]","['Q', 'V']","[93, 97]","['Q', 'F']" 165 | 163,-0.0325495757327,"(47, 50)",contact,"['W', 'V']",21,"[96, 99]","['W', 'F']","[121, 124]","['W', 'V']","[94, 97]","['W', 'F']" 166 | 164,-0.0325495757327,"(50, 91)",contact,"['V', 'L']",21,"[99, 130]","['F', 'M']","[124, 155]","['V', 'L']","[97, 128]","['F', 'V']" 167 | 165,-0.0325495757327,"(50, 94)",contact,"['V', 'V']",21,"[99, 133]","['F', 'F']","[124, 158]","['V', 'V']","[97, 131]","['F', 'V']" 168 | 166,-0.0325495757327,"(50, 95)",contact,"['V', 'I']",21,"[99, 134]","['F', 'I']","[124, 159]","['V', 'I']","[97, 132]","['F', 'T']" 169 | 167,-0.0325495757327,"(57, 88)",contact,"['L', 'S']",21,"[106, 127]","['L', 'T']","[131, 152]","['L', 'S']","[104, 125]","['L', 'C']" 170 | 168,-0.0325495757327,"(57, 91)",contact,"['L', 'L']",21,"[106, 130]","['L', 'M']","[131, 155]","['L', 'L']","[104, 128]","['L', 'V']" 171 | 169,-0.0325495757327,"(69, 79)",contact,"['V', 'G']",21,"[118, 119]","['C', 'G']","[143, 144]","['V', 'G']","[116, 117]","['C', 'G']" 172 | 170,-0.0325495757327,"(69, 80)",contact,"['V', 'W']",21,"[118, 120]","['C', 'W']","[143, 145]","['V', 'W']","[116, 118]","['C', 'W']" 173 | 171,-0.0325495757327,"(69, 82)",contact,"['V', 'E']",21,"[118, 121]","['C', 'E']","[143, 146]","['V', 'E']","[116, 119]","['C', 'E']" 174 | 172,-0.0325495757327,"(69, 285)",contact,"['V', 'R']",21,"[118, 307]","['C', 'R']","[143, 332]","['V', 'R']","[116, 305]","['C', 'R']" 175 | 173,-0.0325495757327,"(84, 88)",contact,"['V', 'S']",21,"[123, 127]","['I', 'T']","[148, 152]","['V', 'S']","[121, 125]","['V', 'C']" 176 | 174,-0.0325495757327,"(85, 88)",contact,"['Y', 'S']",21,"[124, 127]","['Y', 'T']","[149, 152]","['Y', 'S']","[122, 125]","['Y', 'C']" 177 | 175,-0.0325495757327,"(86, 88)",contact,"['V', 'S']",21,"[125, 127]","['V', 'T']","[150, 152]","['V', 'S']","[123, 125]","['V', 'C']" 178 | 176,-0.0325495757327,"(87, 88)",contact,"['C', 'S']",21,"[126, 127]","['A', 'T']","[151, 152]","['C', 'S']","[124, 125]","['C', 'C']" 179 | 177,-0.0325495757327,"(87, 91)",contact,"['C', 'L']",21,"[126, 130]","['A', 'M']","[151, 155]","['C', 'L']","[124, 128]","['C', 'V']" 180 | 178,-0.0325495757327,"(88, 89)",contact,"['S', 'V']",21,"[127, 128]","['T', 'I']","[152, 153]","['S', 'V']","[125, 126]","['C', 'V']" 181 | 179,-0.0325495757327,"(88, 90)",contact,"['S', 'E']",21,"[127, 129]","['T', 'E']","[152, 154]","['S', 'E']","[125, 127]","['C', 'E']" 182 | 180,-0.0325495757327,"(88, 91)",contact,"['S', 'L']",21,"[127, 130]","['T', 'M']","[152, 155]","['S', 'L']","[125, 128]","['C', 'V']" 183 | 181,-0.0325495757327,"(88, 92)",contact,"['S', 'I']",21,"[127, 131]","['T', 'I']","[152, 156]","['S', 'I']","[125, 129]","['C', 'L']" 184 | 182,-0.0325495757327,"(89, 91)",contact,"['V', 'L']",21,"[128, 130]","['I', 'M']","[153, 155]","['V', 'L']","[126, 128]","['V', 'V']" 185 | 183,-0.0325495757327,"(89, 92)",contact,"['V', 'I']",21,"[128, 131]","['I', 'I']","[153, 156]","['V', 'I']","[126, 129]","['V', 'L']" 186 | 184,-0.0325495757327,"(89, 93)",contact,"['V', 'K']",21,"[128, 132]","['I', 'K']","[153, 157]","['V', 'K']","[126, 130]","['V', 'F']" 187 | 185,-0.0325495757327,"(90, 91)",contact,"['E', 'L']",21,"[129, 130]","['E', 'M']","[154, 155]","['E', 'L']","[127, 128]","['E', 'V']" 188 | 186,-0.0325495757327,"(91, 92)",contact,"['L', 'I']",21,"[130, 131]","['M', 'I']","[155, 156]","['L', 'I']","[128, 129]","['V', 'L']" 189 | 187,-0.0325495757327,"(91, 93)",contact,"['L', 'K']",21,"[130, 132]","['M', 'K']","[155, 157]","['L', 'K']","[128, 130]","['V', 'F']" 190 | 188,-0.0325495757327,"(91, 94)",contact,"['L', 'V']",21,"[130, 133]","['M', 'F']","[155, 158]","['L', 'V']","[128, 131]","['V', 'V']" 191 | 189,-0.0325495757327,"(91, 95)",contact,"['L', 'I']",21,"[130, 134]","['M', 'I']","[155, 159]","['L', 'I']","[128, 132]","['V', 'T']" 192 | 190,-0.0325495757327,"(92, 94)",contact,"['I', 'V']",21,"[131, 133]","['I', 'F']","[156, 158]","['I', 'V']","[129, 131]","['L', 'V']" 193 | 191,-0.0325495757327,"(92, 96)",contact,"['I', 'L']",21,"[131, 135]","['I', 'I']","[156, 160]","['I', 'L']","[129, 133]","['L', 'L']" 194 | 192,-0.0325495757327,"(93, 94)",contact,"['K', 'V']",21,"[132, 133]","['K', 'F']","[157, 158]","['K', 'V']","[130, 131]","['F', 'V']" 195 | 193,-0.0325495757327,"(93, 96)",contact,"['K', 'L']",21,"[132, 135]","['K', 'I']","[157, 160]","['K', 'L']","[130, 133]","['F', 'L']" 196 | 194,-0.0325495757327,"(94, 95)",contact,"['V', 'I']",21,"[133, 134]","['F', 'I']","[158, 159]","['V', 'I']","[131, 132]","['V', 'T']" 197 | 195,-0.0325495757327,"(95, 96)",contact,"['I', 'L']",21,"[134, 135]","['I', 'I']","[159, 160]","['I', 'L']","[132, 133]","['T', 'L']" 198 | 196,-0.0325495757327,"(95, 98)",contact,"['I', 'I']",21,"[134, 137]","['I', 'Y']","[159, 162]","['I', 'I']","[132, 135]","['T', 'I']" 199 | 197,-0.0325495757327,"(95, 99)",contact,"['I', 'Y']",21,"[134, 138]","['I', 'F']","[159, 163]","['I', 'Y']","[132, 136]","['T', 'F']" 200 | 198,-0.0325495757327,"(96, 99)",contact,"['L', 'Y']",21,"[135, 138]","['I', 'F']","[160, 163]","['L', 'Y']","[133, 136]","['L', 'F']" 201 | 199,-0.0325495757327,"(96, 100)",contact,"['L', 'F']",21,"[135, 139]","['I', 'H']","[160, 164]","['L', 'F']","[133, 137]","['L', 'K']" 202 | 200,-0.0325495757327,"(97, 99)",contact,"['E', 'Y']",21,"[136, 138]","['E', 'F']","[161, 163]","['E', 'Y']","[134, 136]","['E', 'F']" 203 | 201,-0.0325495757327,"(97, 100)",contact,"['E', 'F']",21,"[136, 139]","['E', 'H']","[161, 164]","['E', 'F']","[134, 137]","['E', 'K']" 204 | 202,-0.0325495757327,"(98, 99)",contact,"['I', 'Y']",21,"[137, 138]","['Y', 'F']","[162, 163]","['I', 'Y']","[135, 136]","['I', 'F']" 205 | 203,-0.0325495757327,"(98, 100)",contact,"['I', 'F']",21,"[137, 139]","['Y', 'H']","[162, 164]","['I', 'F']","[135, 137]","['I', 'K']" 206 | 204,-0.0325495757327,"(99, 100)",contact,"['Y', 'F']",21,"[138, 139]","['F', 'H']","[163, 164]","['Y', 'F']","[136, 137]","['F', 'K']" 207 | 205,-0.0325495757327,"(99, 101)",contact,"['Y', 'E']",21,"[138, 140]","['F', 'E']","[163, 165]","['Y', 'E']","[136, 138]","['F', 'E']" 208 | 206,-0.0325495757327,"(99, 105)",contact,"['Y', 'T']",21,"[138, 142]","['F', 'D']","[163, 167]","['Y', 'T']","[136, 140]","['F', 'S']" 209 | 207,-0.0325495757327,"(100, 101)",contact,"['F', 'E']",21,"[139, 140]","['H', 'E']","[164, 165]","['F', 'E']","[137, 138]","['K', 'E']" 210 | 208,-0.0325495757327,"(100, 102)",contact,"['F', 'F']",21,"[139, 141]","['H', 'F']","[164, 166]","['F', 'F']","[137, 139]","['K', 'F']" 211 | 209,-0.0325495757327,"(100, 105)",contact,"['F', 'T']",21,"[139, 142]","['H', 'D']","[164, 167]","['F', 'T']","[137, 140]","['K', 'S']" 212 | 210,-0.0325495757327,"(100, 108)",contact,"['F', 'A']",21,"[139, 145]","['H', 'A']","[164, 170]","['F', 'A']","[137, 143]","['K', 'A']" 213 | 211,-0.0325495757327,"(101, 105)",contact,"['E', 'T']",21,"[140, 142]","['E', 'D']","[165, 167]","['E', 'T']","[138, 140]","['E', 'S']" 214 | 212,-0.0325495757327,"(102, 105)",contact,"['F', 'T']",21,"[141, 142]","['F', 'D']","[166, 167]","['F', 'T']","[139, 140]","['F', 'S']" 215 | 213,-0.0325495757327,"(105, 107)",contact,"['T', 'P']",21,"[142, 144]","['D', 'P']","[167, 169]","['T', 'P']","[140, 142]","['S', 'P']" 216 | 214,-0.0325495757327,"(105, 108)",contact,"['T', 'A']",21,"[142, 145]","['D', 'A']","[167, 170]","['T', 'A']","[140, 143]","['S', 'A']" 217 | 215,-0.00632980766847,"(89, 132)",contact,"['V', 'T']",15,"[128, 166]","['I', 'T']","[153, 191]","['V', 'T']","[126, 164]","['V', 'S']" 218 | 216,-0.106774499838,"(112, 114)",contact,"['L', 'G']",7,"[149, 151]","['S', 'N']","[174, 176]","['L', 'G']","[147, 149]","['L', 'T']" 219 | 217,0.0423640139123,"(228, 287)",contact,"['V', 'K']",12,"[250, 309]","['V', 'L']","[275, 334]","['L', 'K']","[248, 307]","['V', 'K']" 220 | 218,0.0651927695805,"(206, 208)",contact,"['V', 'A']",13,"[236, 238]","['I', 'A']","[261, 263]","['V', 'S']","[234, 236]","['V', 'A']" 221 | 219,0.0651927695805,"(206, 217)",contact,"['V', 'H']",13,"[236, 240]","['I', 'H']","[261, 265]","['V', 'Y']","[234, 238]","['V', 'H']" 222 | 220,-0.010932852442,"(49, 50)",contact,"['A', 'F']",4,"[98, 99]","['T', 'F']","[123, 124]","['A', 'V']","[96, 97]","['A', 'F']" 223 | 221,0.0325495757327,50.0,seq,F,6,99,F,124,V,97,F 224 | 222,0.0325495757327,65.0,seq,W,6,114,W,139,Y,112,W 225 | 223,0.0325495757327,68.0,seq,T,6,117,T,142,S,115,T 226 | 224,0.0325495757327,69.0,seq,C,6,118,C,143,V,116,C 227 | 225,0.0325495757327,99.0,seq,F,6,138,F,163,Y,136,F 228 | 226,0.0325495757327,"(46, 50)",contact,"['Q', 'F']",6,"[95, 99]","['Q', 'F']","[120, 124]","['Q', 'V']","[93, 97]","['Q', 'F']" 229 | 227,0.0325495757327,"(47, 50)",contact,"['W', 'F']",6,"[96, 99]","['W', 'F']","[121, 124]","['W', 'V']","[94, 97]","['W', 'F']" 230 | 228,0.0325495757327,"(69, 79)",contact,"['C', 'G']",6,"[118, 119]","['C', 'G']","[143, 144]","['V', 'G']","[116, 117]","['C', 'G']" 231 | 229,0.0325495757327,"(69, 80)",contact,"['C', 'W']",6,"[118, 120]","['C', 'W']","[143, 145]","['V', 'W']","[116, 118]","['C', 'W']" 232 | 230,0.0325495757327,"(69, 82)",contact,"['C', 'E']",6,"[118, 121]","['C', 'E']","[143, 146]","['V', 'E']","[116, 119]","['C', 'E']" 233 | 231,0.0325495757327,"(69, 285)",contact,"['C', 'R']",6,"[118, 307]","['C', 'R']","[143, 332]","['V', 'R']","[116, 305]","['C', 'R']" 234 | 232,0.0325495757327,"(97, 99)",contact,"['E', 'F']",6,"[136, 138]","['E', 'F']","[161, 163]","['E', 'Y']","[134, 136]","['E', 'F']" 235 | 233,0.0325495757327,"(99, 101)",contact,"['F', 'E']",6,"[138, 140]","['F', 'E']","[163, 165]","['Y', 'E']","[136, 138]","['F', 'E']" 236 | -------------------------------------------------------------------------------- /regression/outputs/matern_kernel_gen10_green_norm.csv: -------------------------------------------------------------------------------- 1 | ,prop,seq,block_k,chimera,y,mu,y_real,mu_real 2 | 0,1.0,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1222221222,ChR_20_10,1.02271618013,0.941918382512,1.0,0.931513986392 3 | 2,0.089599974,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000012000,ChR_4_10,-1.72475257563,-1.36366794838,0.089599974,0.123026952317 4 | 6,0.661021095,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111100,ChR_14_10,0.551248670309,-0.151029742925,0.661021095,0.356792345662 5 | 14,0.563438909,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1211001101,ChR_18_10,0.369336692916,0.398130050637,0.563438909,0.577865283452 6 | 18,0.041108704,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000002000,ChR_3_10,-2.61210511773,-2.00641530807,0.041108704,0.0699683203893 7 | 25,0.060136195,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001011002,ChR_6_10,-2.1788788232,-0.306443744649,0.060136195,0.311280213508 8 | 26,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2022222220,ChR_24_10,1.02271618013,0.976088888203,1.0,0.959885910951 9 | 29,0.602216508,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111102,ChR_15_10,0.445139346208,0.295835534249,0.602216508,0.528224622577 10 | 43,1.0,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1122221222,ChR_17_10,1.02271618013,0.781325681647,1.0,0.809003822973 11 | 47,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2122222220,ChR_27_10,1.02271618013,0.922198703383,1.0,0.915523878307 12 | 48,0.095908825,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSTGNHAYCL---RYFEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2002000100,ChR_22_10,-1.64725882791,-1.70056941884,0.095908825,0.091522873194 13 | 58,0.429018765,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFTFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1010001101,ChR_8_10,0.0589216655269,0.0925431737285,0.429018765,0.441872705821 14 | 62,0.03504343,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2000000100,ChR_21_10,-2.79390877129,-1.95258034556,0.03504343,0.0733551031453 15 | 65,0.096970698,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLKNDYSKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1002011002,ChR_7_10,-1.63471863447,-0.372566386263,0.096970698,0.293722335116 16 | 68,0.47181211,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1110001101,ChR_11_10,0.167207929544,0.0648253122398,0.47181211,0.431248428662 17 | 72,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2022221222,ChR_23_10,1.02271618013,0.939255490174,1.0,0.92933852205 18 | 85,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLTGLANDYNKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGH-CRMVVKLMAYAYFASWGSYPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1221122222,ChR_19_10,1.02271618013,1.04215023866,1.0,1.01721039238 19 | 90,0.153510737,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000001100,ChR_2_10,-1.11155428205,-1.24322365882,0.153510737,0.136750612502 20 | 92,0.517913171,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111100,ChR_10_10,0.273383361603,0.184983461237,0.517913171,0.479233783856 21 | 102,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2022222221,ChR_25_10,1.02271618013,0.991534410348,1.0,0.972992420924 22 | 105,0.102952425,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001002000,ChR_5_10,-1.56654654578,-1.7489922083,0.102952425,0.0877131253377 23 | 114,0.056003285,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000000100,ChR_1_10,-2.25996986632,-1.91137246332,0.056003285,0.0760578652421 24 | 116,0.402343781,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLKNDYSKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1112111100,ChR_16_10,-0.0141882375507,-0.217789697629,0.402343781,0.336478995455 25 | 118,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2122222221,ChR_28_10,1.02271618013,0.907852220524,1.0,0.904063492935 26 | 123,0.543114006,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1111001101,ChR_12_10,0.327494105173,0.0107699724399,0.543114006,0.411258233522 27 | 126,0.498502553,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111000,ChR_9_10,0.229878861244,0.179842908138,0.498502553,0.477075571754 28 | 130,1.0,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2122221222,ChR_26_10,1.02271618013,0.863813252773,1.0,0.869772265278 29 | -------------------------------------------------------------------------------- /regression/outputs/matern_kernel_gen10_kinetics_off.csv: -------------------------------------------------------------------------------- 1 | ,prop,seq,block_k,chimera,y,mu,y_real,mu_real 2 | 0,29.8,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1222221222,ChR_20_10,-0.755347722203,-0.742174445599,29.8,30.4865002826 3 | 2,169.7333333,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000012000,ChR_4_10,0.250897421852,0.314322993012,169.7333333,189.404794518 4 | 6,607.3916667,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111100,ChR_14_10,0.988319154052,0.576867733954,607.3916667,298.21243049 5 | 14,389.5,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1211001101,ChR_18_10,0.731332570021,0.856499057671,389.5,483.604324564 6 | 18,9.616666667,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000002000,ChR_3_10,-1.40951844698,-0.941908310311,9.616666667,21.5841562125 7 | 25,8491.4,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001011002,ChR_6_10,2.51391376936,1.74859381358,8491.4,2261.15276527 8 | 26,61.75,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2022222220,ChR_24_10,-0.333937595146,-0.244556081469,61.75,72.0692681014 9 | 29,3544.2,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111102,ChR_15_10,2.00854625622,1.2881798356,3544.2,1020.05293881 10 | 43,28.66,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1122221222,ChR_17_10,-0.77790857387,-0.601035249998,28.66,38.9120023574 11 | 47,19.64,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2122222220,ChR_27_10,-0.996503711137,-0.207834506392,19.64,76.7932297794 12 | 48,14.11666667,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSTGNHAYCL---RYFEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2002000100,ChR_22_10,-1.18749672014,-0.62859173715,14.11666667,37.1015856156 13 | 58,217.2,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFTFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1010001101,ChR_8_10,0.39352380661,0.42969986086,217.2,231.218741072 14 | 62,11.1,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2000000100,ChR_21_10,-1.32654921146,-0.865548082173,11.1,24.6303759824 15 | 65,6723.0,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLKNDYSKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1002011002,ChR_7_10,2.37884735606,1.76019034112,6723.0,2306.94527442 16 | 68,310.94,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1110001101,ChR_11_10,0.601041098251,0.521387186806,310.94,270.936507276 17 | 72,56.31666667,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2022221222,ChR_23_10,-0.387209754147,-0.487852780348,56.31666667,47.3225298679 18 | 85,12.81,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLTGLANDYNKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGH-CRMVVKLMAYAYFASWGSYPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1221122222,ChR_19_10,-1.24367621877,-1.05464775675,12.81,17.7616517302 19 | 90,146.35,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000001100,ChR_2_10,0.165163366157,0.189255167907,146.35,152.574633873 20 | 92,228.7,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111100,ChR_10_10,0.423364648898,0.506905195001,228.7,264.236954877 21 | 102,33.16,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2022222221,ChR_25_10,-0.69355436972,-0.167098783557,33.16,82.3967012581 22 | 105,37.63333333,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001002000,ChR_5_10,-0.620360904267,-0.645813123384,37.63333333,36.0131899965 23 | 114,12.81,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000000100,ChR_1_10,-1.24367621877,-1.20036116763,12.81,13.8061548443 24 | 116,322.7666667,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLKNDYSKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1112111100,ChR_16_10,0.622632403696,0.574998449291,322.7666667,297.250208691 25 | 118,27.82857143,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2122222221,ChR_28_10,-0.794936043221,-0.152354285467,27.82857143,84.5241677159 26 | 123,179.26,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1111001101,ChR_12_10,0.28248280422,0.795286857215,179.26,435.039090984 27 | 126,940.5,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111000,ChR_9_10,1.24121524215,0.806265333155,940.5,443.375406037 28 | 130,18.9,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2122221222,ChR_26_10,-1.01871776291,-0.468718805493,18.9,48.914196253 29 | -------------------------------------------------------------------------------- /regression/outputs/matern_kernel_gen10_max_peak.csv: -------------------------------------------------------------------------------- 1 | ,prop,seq,block_k,chimera,y,mu,y_real,mu_real 2 | 0,0.408825104,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1222221222,ChR_20_10,0.639787095292,0.444058543315,0.408825104,0.304164705421 3 | 2,0.712525792,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000012000,ChR_4_10,1.00747788655,1.08943456963,0.712525792,0.806449308107 4 | 6,2.106051626,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111100,ChR_14_10,1.72478812836,1.65014706839,2.106051626,1.8814502068 5 | 14,2.560177881,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1211001101,ChR_18_10,1.85402713135,1.5918989079,2.560177881,1.72295036936 6 | 18,0.722079305,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1000002000,ChR_3_10,1.01629331501,0.792040809852,0.722079305,0.514564661896 7 | 25,0.461560568,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001011002,ChR_6_10,0.720089692223,0.792045333536,0.461560568,0.514568178774 8 | 26,0.328829052,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2022222220,ChR_24_10,0.495664091484,0.502572079309,0.328829052,0.332279009925 9 | 29,1.608608124,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111102,ChR_15_10,1.5464486702,1.25021262662,1.608608124,1.0281886028 10 | 39,0.025428131,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1210021101,ChR_29_10,-1.19852657072,0.224774494202,0.025428131,0.218385584035 11 | 43,0.495936951,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c1122221222,ChR_17_10,0.767635858036,0.556701201507,0.495936951,0.360595358134 12 | 47,0.651422528,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2122222220,ChR_27_10,0.948135698315,0.646168922392,0.651422528,0.412786075646 13 | 48,0.399744553,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSTGNHAYCL---RYFEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2002000100,ChR_22_10,0.62492023374,0.486512601336,0.399744553,0.32431375037 14 | 58,2.398501892,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFTFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1010001101,ChR_8_10,1.81085139985,1.59043849564,2.398501892,1.71915291192 15 | 62,0.848262695,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c2000000100,ChR_21_10,1.12289192958,0.897126412923,0.848262695,0.603104617465 16 | 65,0.314668112,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNITGLKNDYSKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVESY-------YIMPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1002011002,ChR_7_10,0.466528634001,0.769472200629,0.314668112,0.497314815463 17 | 66,0.026046454,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTPGEKIGAQVCQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1210001121,ChR_30_10,-1.18262464527,0.388215662507,0.026046454,0.279554893367 18 | 68,3.472616895,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1110001101,ChR_11_10,2.05578776486,1.6211742775,3.472616895,1.80086856743 19 | 69,0.768058679,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMGLLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1111111002,ChR_13_10,1.05715165607,1.09203574513,0.768058679,0.809624895854 20 | 72,0.239768735,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2022221222,ChR_23_10,0.28660201521,0.463819433189,0.239768735,0.313382739147 21 | 85,0.550266779,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTPGEKIGAQVCQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLTGLANDYNKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGH-CRMVVKLMAYAYFASWGSYPILFILGPEGFGVLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1221122222,ChR_19_10,0.836440730862,0.373304299162,0.550266779,0.273327228192 22 | 90,2.118434326,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000001100,ChR_2_10,1.7286682827,1.27146856584,2.118434326,1.06174451804 23 | 92,1.995635986,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111100,ChR_10_10,1.6891447199,1.54053211299,1.995635986,1.59429272704 24 | 102,0.432389432,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2022222221,ChR_25_10,0.676878066064,0.549789259576,0.432389432,0.356849267807 25 | 105,1.415689748,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPATVYLSGGNHAYWL---RYAEWLLTCPVILIHLSNITGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWLMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,n1001002000,ChR_5_10,1.46189247888,0.784853465205,1.415689748,0.509007202901 26 | 114,0.460355225,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAVFLSALFLAFYGWHAYKASV---------GW-EEVYVCSVELIKVILEIYFEF--TSPAMLFLYGGNITPWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVESY-------YIMPAGG-CKKLVLAMTAVYYSSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HIIMYGD-----IRRPVSSQFL-GRKVDVLAFVTEE,c1000000100,ChR_1_10,0.718358975785,0.216981459528,0.460355225,0.215829353167 27 | 116,2.626104167,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLKNDYSKRTMALLVSDIGTIVWGTTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVYGSTVGHTIIDLMSKNCWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1112111100,ChR_16_10,1.87085511758,1.51395541011,2.626104167,1.53154430469 28 | 118,1.221857893,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAN-------HSVPKGH-CRMVVKLMAYAYFASWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c2122222221,ChR_28_10,1.36443508774,0.702031245763,1.221857893,0.44913770535 29 | 123,3.323136548,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGTNAEKLAANILQWITFALSALCLMFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSNGNKTVWL---RYAEWLLTCPVILIHLSNITGLSEAYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHFLRIKI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,c1111001101,ChR_12_10,2.02666565635,1.56239449688,3.323136548,1.64783312887 30 | 126,2.564170288,RMLFQTSYTLENNGSVICIPNNGQCFCLAWLKSNGALWEQETARGFQWFAFFLSALFLAFYGYQTWKSTC---------GW-EEIYVATIEMIKFIIEYFHEF--DEPAVIYSSGGNKTVWL---RYAEWLLTCPVILIHLSNLTGLANDYNKRTMALLVSDLGTICMGVTAALA-TGWVK---WLFYCIGLVYGTQTFYNAGIIYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGMFPILFILGPEGFGVLSVAGSTIGHTIADLLSKNIWGLLGHYLRVLI-----------HE---HILIHGD-----IRKTTKLNIG-GTEIEVETLVEDE,n1011111000,ChR_9_10,1.85505847296,1.54973774044,2.564170288,1.6166217053 31 | 130,0.332408124,GFDELAKGAVVPEDHFVCGPAD-KCYCSAWLHSRGTNAEKLAANILQWIAFSIAIALLTFYGFSAWKATC---------GW-EEVYVCCVEVLFVTLEIFKEF--SSPATVYLSTGNHAYCL---RYFEWLLSCPVILIRLSNLSGLKNDYSKRTMGLIVSCVGMIVFGMAAGLA-TDWLK---WLLYIVSCIYGGYMYFQAAKCYVEAY-------HTVPKGR-CRQVVTGMAWLFFVSWGSYPILWAVGPEGLLKLSPYANSIGHSICDIIAKEFWTFLAHHLRIKI-----------HE---HILIHGD-----IRKTTKMEIG-GEEVEVEEFVEEE,c2122221222,ChR_26_10,0.502829215139,0.584138672734,0.332408124,0.375857682512 32 | -------------------------------------------------------------------------------- /regression/outputs/matern_kinetics_off_0.03_LASSO.csv: -------------------------------------------------------------------------------- 1 | ,weights,feature,type,aa,feature_group,C1C2_features_adjust,C1C2_aa_adjust,CheRiff_features_adjust,CheRiff_aa_adjust,CsChrim_features_adjust,CsChrim_aa_adjust 2 | 0,-0.469988726749,"(144, 147)",contact,"['T', 'S']",14,"[178, 181]","['T', 'A']","[203, 206]","['T', 'S']","[176, 179]","['S', 'K']" 3 | 1,0.139320184843,188.0,seq,C,5,218,L,243,C,216,I 4 | 2,0.139320184843,189.0,seq,I,5,219,M,244,I,217,V 5 | 3,0.139320184843,192.0,seq,V,5,222,C,247,V,220,I 6 | 4,0.139320184843,"(174, 179)",contact,"['A', 'V']",5,"[208, 212]","['S', 'V']","[233, 237]","['A', 'V']","[206, 210]","['A', 'L']" 7 | 5,0.139320184843,"(174, 186)",contact,"['A', 'F']",5,"[208, 216]","['S', 'F']","[233, 241]","['A', 'F']","[206, 214]","['A', 'L']" 8 | 6,0.139320184843,"(176, 177)",contact,"['T', 'G']",5,"[209, 210]","['K', 'G']","[234, 235]","['T', 'G']","[207, 208]","['T', 'D']" 9 | 7,0.139320184843,"(176, 179)",contact,"['T', 'V']",5,"[209, 212]","['K', 'V']","[234, 237]","['T', 'V']","[207, 210]","['T', 'L']" 10 | 8,0.139320184843,"(177, 178)",contact,"['G', 'W']",5,"[210, 211]","['G', 'Y']","[235, 236]","['G', 'W']","[208, 209]","['D', 'W']" 11 | 9,0.139320184843,"(177, 180)",contact,"['G', 'K']",5,"[210, 213]","['G', 'R']","[235, 238]","['G', 'K']","[208, 211]","['D', 'K']" 12 | 10,0.139320184843,"(177, 184)",contact,"['G', 'W']",5,"[210, 214]","['G', 'V']","[235, 239]","['G', 'W']","[208, 212]","['D', 'W']" 13 | 11,0.139320184843,"(178, 179)",contact,"['W', 'V']",5,"[211, 212]","['Y', 'V']","[236, 237]","['W', 'V']","[209, 210]","['W', 'L']" 14 | 12,0.139320184843,"(179, 180)",contact,"['V', 'K']",5,"[212, 213]","['V', 'R']","[237, 238]","['V', 'K']","[210, 211]","['L', 'K']" 15 | 13,0.139320184843,"(179, 184)",contact,"['V', 'W']",5,"[212, 214]","['V', 'V']","[237, 239]","['V', 'W']","[210, 212]","['L', 'W']" 16 | 14,0.139320184843,"(179, 185)",contact,"['V', 'L']",5,"[212, 215]","['V', 'I']","[237, 240]","['V', 'L']","[210, 213]","['L', 'L']" 17 | 15,0.139320184843,"(180, 186)",contact,"['K', 'F']",5,"[213, 216]","['R', 'F']","[238, 241]","['K', 'F']","[211, 214]","['K', 'L']" 18 | 16,0.139320184843,"(184, 186)",contact,"['W', 'F']",5,"[214, 216]","['V', 'F']","[239, 241]","['W', 'F']","[212, 214]","['W', 'L']" 19 | 17,0.139320184843,"(184, 188)",contact,"['W', 'C']",5,"[214, 218]","['V', 'L']","[239, 243]","['W', 'C']","[212, 216]","['W', 'I']" 20 | 18,0.139320184843,"(185, 186)",contact,"['L', 'F']",5,"[215, 216]","['I', 'F']","[240, 241]","['L', 'F']","[213, 214]","['L', 'L']" 21 | 19,0.139320184843,"(185, 188)",contact,"['L', 'C']",5,"[215, 218]","['I', 'L']","[240, 243]","['L', 'C']","[213, 216]","['L', 'I']" 22 | 20,0.139320184843,"(185, 189)",contact,"['L', 'I']",5,"[215, 219]","['I', 'M']","[240, 244]","['L', 'I']","[213, 217]","['L', 'V']" 23 | 21,0.139320184843,"(186, 187)",contact,"['F', 'Y']",5,"[216, 217]","['F', 'F']","[241, 242]","['F', 'Y']","[214, 215]","['L', 'Y']" 24 | 22,0.139320184843,"(186, 188)",contact,"['F', 'C']",5,"[216, 218]","['F', 'L']","[241, 243]","['F', 'C']","[214, 216]","['L', 'I']" 25 | 23,0.139320184843,"(186, 189)",contact,"['F', 'I']",5,"[216, 219]","['F', 'M']","[241, 244]","['F', 'I']","[214, 217]","['L', 'V']" 26 | 24,0.139320184843,"(187, 188)",contact,"['Y', 'C']",5,"[217, 218]","['F', 'L']","[242, 243]","['Y', 'C']","[215, 216]","['Y', 'I']" 27 | 25,0.139320184843,"(187, 189)",contact,"['Y', 'I']",5,"[217, 219]","['F', 'M']","[242, 244]","['Y', 'I']","[215, 217]","['Y', 'V']" 28 | 26,0.139320184843,"(187, 190)",contact,"['Y', 'G']",5,"[217, 220]","['F', 'G']","[242, 245]","['Y', 'G']","[215, 218]","['Y', 'S']" 29 | 27,0.139320184843,"(187, 191)",contact,"['Y', 'L']",5,"[217, 221]","['F', 'L']","[242, 246]","['Y', 'L']","[215, 219]","['Y', 'C']" 30 | 28,0.139320184843,"(188, 189)",contact,"['C', 'I']",5,"[218, 219]","['L', 'M']","[243, 244]","['C', 'I']","[216, 217]","['I', 'V']" 31 | 29,0.139320184843,"(188, 190)",contact,"['C', 'G']",5,"[218, 220]","['L', 'G']","[243, 245]","['C', 'G']","[216, 218]","['I', 'S']" 32 | 30,0.139320184843,"(188, 191)",contact,"['C', 'L']",5,"[218, 221]","['L', 'L']","[243, 246]","['C', 'L']","[216, 219]","['I', 'C']" 33 | 31,0.139320184843,"(188, 192)",contact,"['C', 'V']",5,"[218, 222]","['L', 'C']","[243, 247]","['C', 'V']","[216, 220]","['I', 'I']" 34 | 32,0.139320184843,"(189, 190)",contact,"['I', 'G']",5,"[219, 220]","['M', 'G']","[244, 245]","['I', 'G']","[217, 218]","['V', 'S']" 35 | 33,0.139320184843,"(189, 191)",contact,"['I', 'L']",5,"[219, 221]","['M', 'L']","[244, 246]","['I', 'L']","[217, 219]","['V', 'C']" 36 | 34,0.139320184843,"(189, 192)",contact,"['I', 'V']",5,"[219, 222]","['M', 'C']","[244, 247]","['I', 'V']","[217, 220]","['V', 'I']" 37 | 35,0.139320184843,"(189, 193)",contact,"['I', 'Y']",5,"[219, 223]","['M', 'Y']","[244, 248]","['I', 'Y']","[217, 221]","['V', 'Y']" 38 | 36,0.139320184843,"(190, 192)",contact,"['G', 'V']",5,"[220, 222]","['G', 'C']","[245, 247]","['G', 'V']","[218, 220]","['S', 'I']" 39 | 37,0.139320184843,"(191, 192)",contact,"['L', 'V']",5,"[221, 222]","['L', 'C']","[246, 247]","['L', 'V']","[219, 220]","['C', 'I']" 40 | 38,0.139320184843,"(192, 193)",contact,"['V', 'Y']",5,"[222, 223]","['C', 'Y']","[247, 248]","['V', 'Y']","[220, 221]","['I', 'Y']" 41 | 39,0.139320184843,"(192, 194)",contact,"['V', 'G']",5,"[222, 224]","['C', 'G']","[247, 249]","['V', 'G']","[220, 222]","['I', 'G']" 42 | 40,-1.48382127775,"(161, 197)",contact,"['D', 'T']",13,"[195, 227]","['D', 'T']","[220, 252]","['D', 'T']","[193, 225]","['C', 'M']" 43 | 41,-1.48382127775,"(164, 197)",contact,"['T', 'T']",13,"[198, 227]","['T', 'T']","[223, 252]","['T', 'T']","[196, 225]","['M', 'M']" 44 | 42,0.142100116668,"(191, 195)",contact,"['L', 'G']",9,"[221, 225]","['L', 'I']","[246, 250]","['L', 'T']","[219, 223]","['C', 'G']" 45 | 43,0.0592470722772,"(164, 190)",contact,"['M', 'G']",6,"[198, 220]","['T', 'G']","[223, 245]","['T', 'G']","[196, 218]","['M', 'S']" 46 | 44,0.0592470722772,"(167, 186)",contact,"['F', 'F']",6,"[201, 216]","['W', 'F']","[226, 241]","['M', 'F']","[199, 214]","['F', 'L']" 47 | 45,0.0592470722772,"(167, 190)",contact,"['F', 'G']",6,"[201, 220]","['W', 'G']","[226, 245]","['M', 'G']","[199, 218]","['F', 'S']" 48 | 46,0.0592470722772,"(170, 186)",contact,"['A', 'F']",6,"[204, 216]","['T', 'F']","[229, 241]","['T', 'F']","[202, 214]","['A', 'L']" 49 | 47,-0.248666834914,"(167, 187)",contact,"['W', 'F']",12,"[201, 217]","['W', 'F']","[226, 242]","['M', 'Y']","[199, 215]","['F', 'Y']" 50 | 48,-0.248666834914,"(167, 189)",contact,"['W', 'M']",12,"[201, 219]","['W', 'M']","[226, 244]","['M', 'I']","[199, 217]","['F', 'V']" 51 | 49,-0.225257299268,"(53, 91)",contact,"['A', 'V']",2,"[102, 130]","['S', 'M']","[127, 155]","['S', 'L']","[100, 128]","['A', 'V']" 52 | 50,-0.225257299268,"(54, 91)",contact,"['I', 'V']",2,"[103, 130]","['A', 'M']","[128, 155]","['A', 'L']","[101, 128]","['I', 'V']" 53 | 51,0.0326238176403,"(235, 272)",contact,"['L', 'L']",10,"[257, 294]","['L', 'M']","[282, 319]","['V', 'L']","[255, 292]","['A', 'I']" 54 | 52,0.0326238176403,"(235, 276)",contact,"['L', 'I']",10,"[257, 298]","['L', 'C']","[282, 323]","['V', 'I']","[255, 296]","['A', 'F']" 55 | 53,0.123927605457,"(202, 237)",contact,"['G', 'F']",8,"[232, 259]","['A', 'F']","[257, 284]","['G', 'Y']","[230, 257]","['A', 'F']" 56 | 54,0.22516450092,"(238, 242)",contact,"['V', 'M']",7,"[260, 264]","['V', 'M']","[285, 289]","['S', 'M']","[258, 262]","['A', 'S']" 57 | 55,-0.56035946208,"(238, 242)",contact,"['S', 'M']",1,"[260, 264]","['V', 'M']","[285, 289]","['S', 'M']","[258, 262]","['A', 'S']" 58 | 56,-0.467384269913,"(105, 109)",contact,"['S', 'T']",4,"[142, 146]","['D', 'V']","[167, 171]","['T', 'M']","[140, 144]","['S', 'T']" 59 | 57,-0.387740792677,"(105, 106)",contact,"['S', 'S']",11,"[142, 143]","['D', 'E']","[167, 168]","['T', 'S']","[140, 141]","['S', 'S']" 60 | 58,-0.357155396295,"(89, 132)",contact,"['V', 'S']",3,"[128, 166]","['I', 'T']","[153, 191]","['V', 'T']","[126, 164]","['V', 'S']" 61 | 59,0.350950624649,"(204, 208)",contact,"['I', 'A']",0,"[234, 238]","['V', 'A']","[259, 263]","['I', 'S']","[232, 236]","['C', 'A']" 62 | -------------------------------------------------------------------------------- /regression/outputs/matern_max_peak_0.05_LASSO.csv: -------------------------------------------------------------------------------- 1 | ,weights,feature,type,aa,feature_group,C1C2_features_adjust,C1C2_aa_adjust,CheRiff_features_adjust,CheRiff_aa_adjust,CsChrim_features_adjust,CsChrim_aa_adjust 2 | 0,0.136551668956,"(52, 271)",contact,"['L', 'L']",7,"[101, 293]","['L', 'L']","[126, 318]","['L', 'L']","[99, 291]","['I', 'I']" 3 | 1,0.136551668956,"(52, 275)",contact,"['L', 'N']",7,"[101, 297]","['L', 'N']","[126, 322]","['L', 'N']","[99, 295]","['I', 'E']" 4 | 2,0.136551668956,"(53, 275)",contact,"['S', 'N']",7,"[102, 297]","['S', 'N']","[127, 322]","['S', 'N']","[100, 295]","['A', 'E']" 5 | 3,0.487954323853,"(105, 109)",contact,"['S', 'T']",11,"[142, 146]","['D', 'V']","[167, 171]","['T', 'M']","[140, 144]","['S', 'T']" 6 | 4,-0.0659944556613,114.0,seq,N,6,151,N,176,G,149,T 7 | 5,-0.0659944556613,"(114, 115)",contact,"['N', 'G']",6,"[151, 152]","['N', 'G']","[176, 177]","['G', 'G']","[149, 150]","['T', 'G']" 8 | 6,-0.0659944556613,"(114, 116)",contact,"['N', 'N']",6,"[151, 153]","['N', 'N']","[176, 178]","['G', 'N']","[149, 151]","['T', 'N']" 9 | 7,-0.0659944556613,"(114, 173)",contact,"['N', 'L']",6,"[151, 207]","['N', 'L']","[176, 232]","['G', 'L']","[149, 205]","['T', 'L']" 10 | 8,-0.0213551669437,"(139, 278)",contact,"['H', 'G']",4,"[173, 300]","['H', 'G']","[198, 325]","['H', 'G']","[171, 298]","['R', 'T']" 11 | 9,-0.0213551669437,"(139, 281)",contact,"['H', 'G']",4,"[173, 303]","['H', 'G']","[198, 328]","['H', 'G']","[171, 301]","['R', 'A']" 12 | 10,-0.199600712935,"(118, 169)",contact,"['T', 'M']",10,"[155, 203]","['T', 'T']","[180, 228]","['T', 'V']","[153, 201]","['A', 'M']" 13 | 11,-0.199600712935,"(118, 172)",contact,"['T', 'G']",10,"[155, 206]","['T', 'A']","[180, 231]","['T', 'A']","[153, 204]","['A', 'G']" 14 | 12,0.326253727293,"(206, 208)",contact,"['V', 'A']",0,"[236, 238]","['I', 'A']","[261, 263]","['V', 'S']","[234, 236]","['V', 'A']" 15 | 13,0.326253727293,"(206, 217)",contact,"['V', 'H']",0,"[236, 240]","['I', 'H']","[261, 265]","['V', 'Y']","[234, 238]","['V', 'H']" 16 | 14,-0.191853161447,"(235, 272)",contact,"['L', 'I']",2,"[257, 294]","['L', 'M']","[282, 319]","['V', 'L']","[255, 292]","['A', 'I']" 17 | 15,-0.191853161447,"(235, 276)",contact,"['L', 'F']",2,"[257, 298]","['L', 'C']","[282, 323]","['V', 'I']","[255, 296]","['A', 'F']" 18 | 16,0.112963236909,"(235, 272)",contact,"['L', 'L']",1,"[257, 294]","['L', 'M']","[282, 319]","['V', 'L']","[255, 292]","['A', 'I']" 19 | 17,0.112963236909,"(235, 276)",contact,"['L', 'I']",1,"[257, 298]","['L', 'C']","[282, 323]","['V', 'I']","[255, 296]","['A', 'F']" 20 | 18,-0.756058535511,"(242, 269)",contact,"['M', 'C']",9,"[264, 291]","['M', 'I']","[289, 316]","['M', 'A']","[262, 289]","['S', 'C']" 21 | 19,-0.756058535511,"(243, 269)",contact,"['F', 'C']",9,"[265, 291]","['F', 'I']","[290, 316]","['F', 'A']","[263, 289]","['Y', 'C']" 22 | 20,-0.756058535511,"(243, 273)",contact,"['F', 'A']",9,"[265, 295]","['F', 'S']","[290, 320]","['F', 'S']","[263, 293]","['Y', 'A']" 23 | 21,0.245774599494,"(170, 174)",contact,"['T', 'A']",3,"[204, 208]","['T', 'S']","[229, 233]","['T', 'A']","[202, 206]","['A', 'A']" 24 | 22,0.245774599494,"(172, 174)",contact,"['A', 'A']",3,"[206, 208]","['A', 'S']","[231, 233]","['A', 'A']","[204, 206]","['G', 'A']" 25 | 23,0.245774599494,"(172, 180)",contact,"['A', 'K']",3,"[206, 213]","['A', 'R']","[231, 238]","['A', 'K']","[204, 211]","['G', 'K']" 26 | 24,-0.349941733312,"(41, 264)",contact,"['G', 'I']",12,"[90, 286]","['A', 'V']","[115, 311]","['T', 'I']","[88, 284]","['G', 'I']" 27 | 25,-0.349941733312,"(45, 264)",contact,"['C', 'I']",12,"[94, 286]","['L', 'V']","[119, 311]","['F', 'I']","[92, 284]","['C', 'I']" 28 | 26,0.155622833297,"(156, 158)",contact,"['A', 'L']",5,"[190, 192]","['G', 'L']","[215, 217]","['A', 'L']","[188, 190]","['G', 'I']" 29 | 27,0.425036524061,"(158, 161)",contact,"['L', 'D']",8,"[192, 195]","['L', 'D']","[217, 220]","['L', 'D']","[190, 193]","['I', 'C']" 30 | 28,-0.37731282473,"(172, 247)",contact,"['G', 'F']",13,"[206, 269]","['A', 'F']","[231, 294]","['A', 'F']","[204, 267]","['G', 'W']" 31 | -------------------------------------------------------------------------------- /regression/outputs/max_peak_matern_kernel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/max_peak_matern_kernel.pdf -------------------------------------------------------------------------------- /regression/outputs/max_peak_matern_kernel_CV_fig1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/max_peak_matern_kernel_CV_fig1.pdf -------------------------------------------------------------------------------- /regression/outputs/max_peak_matern_kernel_LASSO_CV.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fhalab/channels/da2f6651a760be6b8daec06467272b0ac2d8aeeb/regression/outputs/max_peak_matern_kernel_LASSO_CV.pdf --------------------------------------------------------------------------------