├── .ipynb_checkpoints └── DeepSP_predictor-checkpoint.ipynb ├── Conv1D_regressionSAPpos.json ├── Conv1D_regressionSCMneg.json ├── Conv1D_regressionSCMpos.json ├── Conv1D_regression_SAPpos.h5 ├── Conv1D_regression_SCMneg.h5 ├── Conv1D_regression_SCMpos.h5 ├── DeepSP-app.py ├── DeepSP_input.csv ├── DeepSP_model_train.py ├── DeepSP_predictor.ipynb ├── LICENSE ├── README.md ├── data ├── Deep_SAPpos_data.txt ├── Deep_SCMneg_data.txt ├── Deep_SCMpos_data.txt ├── ERR4082227_1482_rank1_imgt_scheme.pdb └── ERR4082243_2914_rank1_imgt_scheme.pdb ├── deepsp_predictor.py └── environment.yml /.ipynb_checkpoints/DeepSP_predictor-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "CqYhZfJnSvsH" 7 | }, 8 | "source": [ 9 | "Install and Import Necessary Libraries" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "colab": { 17 | "base_uri": "https://localhost:8080/" 18 | }, 19 | "id": "icCMgpaXYyFN", 20 | "outputId": "4fe01a84-464c-4248-8e1a-c1b671051b4c" 21 | }, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", 28 | "\u001b[0m✨🍰✨ Everything looks OK!\n", 29 | "Channels:\n", 30 | " - bioconda\n", 31 | " - conda-forge\n", 32 | "Platform: linux-64\n", 33 | "Collecting package metadata (repodata.json): - \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\b/ \b\b- \b\b\\ \b\b| \b\bdone\n", 34 | "Solving environment: - \b\b\\ \b\b| \b\bdone\n", 35 | "\n", 36 | "\n", 37 | "==> WARNING: A newer version of conda exists. 
<==\n", 38 | " current version: 23.11.0\n", 39 | " latest version: 24.3.0\n", 40 | "\n", 41 | "Please update conda by running\n", 42 | "\n", 43 | " $ conda update -n base -c conda-forge conda\n", 44 | "\n", 45 | "\n", 46 | "\n", 47 | "## Package Plan ##\n", 48 | "\n", 49 | " environment location: /usr/local\n", 50 | "\n", 51 | " added / updated specs:\n", 52 | " - anarci\n", 53 | "\n", 54 | "\n", 55 | "The following packages will be downloaded:\n", 56 | "\n", 57 | " package | build\n", 58 | " ---------------------------|-----------------\n", 59 | " anarci-2021.02.04 | pyhdfd78af_0 1.1 MB bioconda\n", 60 | " biopython-1.83 | py310h2372a71_0 2.6 MB conda-forge\n", 61 | " ca-certificates-2024.2.2 | hbcca054_0 152 KB conda-forge\n", 62 | " certifi-2024.2.2 | pyhd8ed1ab_0 157 KB conda-forge\n", 63 | " hmmer-3.4 | hdbdd923_1 11.1 MB bioconda\n", 64 | " libblas-3.9.0 |22_linux64_openblas 14 KB conda-forge\n", 65 | " libcblas-3.9.0 |22_linux64_openblas 14 KB conda-forge\n", 66 | " libgfortran-ng-13.2.0 | h69a702a_6 24 KB conda-forge\n", 67 | " libgfortran5-13.2.0 | h43f5ff8_6 1.4 MB conda-forge\n", 68 | " liblapack-3.9.0 |22_linux64_openblas 14 KB conda-forge\n", 69 | " libopenblas-0.3.27 |pthreads_h413a1c8_0 5.3 MB conda-forge\n", 70 | " numpy-1.26.4 | py310hb13e2d6_0 6.7 MB conda-forge\n", 71 | " openssl-3.3.0 | hd590300_0 2.8 MB conda-forge\n", 72 | " ------------------------------------------------------------\n", 73 | " Total: 31.3 MB\n", 74 | "\n", 75 | "The following NEW packages will be INSTALLED:\n", 76 | "\n", 77 | " anarci bioconda/noarch::anarci-2021.02.04-pyhdfd78af_0 \n", 78 | " biopython conda-forge/linux-64::biopython-1.83-py310h2372a71_0 \n", 79 | " hmmer bioconda/linux-64::hmmer-3.4-hdbdd923_1 \n", 80 | " libblas conda-forge/linux-64::libblas-3.9.0-22_linux64_openblas \n", 81 | " libcblas conda-forge/linux-64::libcblas-3.9.0-22_linux64_openblas \n", 82 | " libgfortran-ng conda-forge/linux-64::libgfortran-ng-13.2.0-h69a702a_6 \n", 83 | " libgfortran5 
conda-forge/linux-64::libgfortran5-13.2.0-h43f5ff8_6 \n", 84 | " liblapack conda-forge/linux-64::liblapack-3.9.0-22_linux64_openblas \n", 85 | " libopenblas conda-forge/linux-64::libopenblas-0.3.27-pthreads_h413a1c8_0 \n", 86 | " numpy conda-forge/linux-64::numpy-1.26.4-py310hb13e2d6_0 \n", 87 | "\n", 88 | "The following packages will be UPDATED:\n", 89 | "\n", 90 | " ca-certificates 2023.11.17-hbcca054_0 --> 2024.2.2-hbcca054_0 \n", 91 | " certifi 2023.11.17-pyhd8ed1ab_0 --> 2024.2.2-pyhd8ed1ab_0 \n", 92 | " openssl 3.2.0-hd590300_1 --> 3.3.0-hd590300_0 \n", 93 | "\n", 94 | "\n", 95 | "\n", 96 | "Downloading and Extracting Packages:\n", 97 | "hmmer-3.4 | 11.1 MB | : 0% 0/1 [00:00\n", 535 | "
\n", 536 | "\n", 549 | "\n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | "
NameHeavy_ChainLight_Chain
0mAb1EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...
1mAb2EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE...DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL...
2mAb3QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE...DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL...
3mAb4EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE...EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL...
4mAb5EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPR...
5mAb6QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLE...EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRL...
6mAb7EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE...DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL...
7mAb8EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGL...DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGK...
8mAb9QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKG...DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL...
9mAb10EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE...DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKL...
10mAb11QVQLQESGPGLVRPSQTLSLTCTVSGYSITSDHAWSWVRQPPGRGL...DIQMTQSPSSLSASVGDRVTITCRASQDISSYLNWYQQKPGKAPKL...
11mAb12QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLE...DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPG...
12mAb13QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI...QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW...
13mAb14QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE...DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL...
14mAb15QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE...DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL...
15mAb16EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...DIQMTQSPSSLSASVGDRVTITCRASQYFSSYLAWYQQKPGKAPKL...
\n", 657 | "
\n", 658 | "
\n", 659 | "\n", 660 | "
\n", 661 | " \n", 669 | "\n", 670 | " \n", 710 | "\n", 711 | " \n", 735 | "
\n", 736 | "\n", 737 | "\n", 738 | "
\n", 739 | " \n", 750 | "\n", 751 | "\n", 840 | "\n", 841 | " \n", 863 | "
\n", 864 | "
\n", 865 | " \n" 866 | ], 867 | "text/plain": [ 868 | " Name Heavy_Chain \\\n", 869 | "0 mAb1 EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE... \n", 870 | "1 mAb2 EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLE... \n", 871 | "2 mAb3 QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLE... \n", 872 | "3 mAb4 EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLE... \n", 873 | "4 mAb5 EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n", 874 | "5 mAb6 QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLE... \n", 875 | "6 mAb7 EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLE... \n", 876 | "7 mAb8 EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGL... \n", 877 | "8 mAb9 QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKG... \n", 878 | "9 mAb10 EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLE... \n", 879 | "10 mAb11 QVQLQESGPGLVRPSQTLSLTCTVSGYSITSDHAWSWVRQPPGRGL... \n", 880 | "11 mAb12 QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLE... \n", 881 | "12 mAb13 QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWI... \n", 882 | "13 mAb14 QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLE... \n", 883 | "14 mAb15 QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLE... \n", 884 | "15 mAb16 EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE... \n", 885 | "\n", 886 | " Light_Chain \n", 887 | "0 DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL... \n", 888 | "1 DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKL... \n", 889 | "2 DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRL... \n", 890 | "3 EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRL... \n", 891 | "4 EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPR... \n", 892 | "5 EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRL... \n", 893 | "6 DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRL... \n", 894 | "7 DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGK... \n", 895 | "8 DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKL... \n", 896 | "9 DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKL... \n", 897 | "10 DIQMTQSPSSLSASVGDRVTITCRASQDISSYLNWYQQKPGKAPKL... \n", 898 | "11 DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPG... 
\n", 899 | "12 QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRW... \n", 900 | "13 DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRL... \n", 901 | "14 DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKL... \n", 902 | "15 DIQMTQSPSSLSASVGDRVTITCRASQYFSSYLAWYQQKPGKAPKL... " 903 | ] 904 | }, 905 | "execution_count": 4, 906 | "metadata": {}, 907 | "output_type": "execute_result" 908 | } 909 | ], 910 | "source": [ 911 | "dataset = pd.read_csv('DeepSP_input.csv') # replace with your csv file, see format in DeepSP_input.csv file\n", 912 | "dataset" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 5, 918 | "metadata": { 919 | "id": "XWbYjNz-SoTA" 920 | }, 921 | "outputs": [], 922 | "source": [ 923 | "name = dataset['Name'].to_list()\n", 924 | "Heavy_seq = dataset['Heavy_Chain'].to_list()\n", 925 | "Light_seq = dataset['Light_Chain'].to_list()" 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "metadata": { 931 | "id": "V-qttNLlTuT4" 932 | }, 933 | "source": [ 934 | "Convert to Fasta File" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": 6, 940 | "metadata": { 941 | "id": "Pt5KeAfZy8gF" 942 | }, 943 | "outputs": [], 944 | "source": [ 945 | "file_out='seq_H.fasta'\n", 946 | "\n", 947 | "with open(file_out, \"w\") as output_handle:\n", 948 | " for i in range(len(name)):\n", 949 | " seq_name = name[i]\n", 950 | " seq = Heavy_seq[i]\n", 951 | " record = SeqRecord(\n", 952 | " Seq(seq),\n", 953 | " id=seq_name,\n", 954 | " name=\"\",\n", 955 | " description=\"\",\n", 956 | " )\n", 957 | " SeqIO.write(record, output_handle, \"fasta\")\n", 958 | "\n", 959 | "file_out='seq_L.fasta'\n", 960 | "\n", 961 | "with open(file_out, \"w\") as output_handle:\n", 962 | " for i in range(len(name)):\n", 963 | " seq_name = name[i]\n", 964 | " seq = Light_seq[i]\n", 965 | " record = SeqRecord(\n", 966 | " Seq(seq),\n", 967 | " id=seq_name,\n", 968 | " name=\"\",\n", 969 | " description=\"\",\n", 970 | " )\n", 971 | " SeqIO.write(record, 
output_handle, \"fasta\")" 972 | ] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": { 977 | "id": "QugBcnYeT1ci" 978 | }, 979 | "source": [ 980 | "Sequence Alignment with ANARCI" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 7, 986 | "metadata": { 987 | "id": "h7DCE-fo16qG" 988 | }, 989 | "outputs": [], 990 | "source": [ 991 | "!ANARCI -i seq_H.fasta -o seq_aligned -s imgt -r heavy --csv\n", 992 | "!ANARCI -i seq_L.fasta -o seq_aligned -s imgt -r light --csv" 993 | ] 994 | }, 995 | { 996 | "cell_type": "code", 997 | "execution_count": 8, 998 | "metadata": { 999 | "id": "uFOunTDOUUhZ" 1000 | }, 1001 | "outputs": [], 1002 | "source": [ 1003 | "H_aligned = pd.read_csv('seq_aligned_H.csv')\n", 1004 | "L_aligned = pd.read_csv('seq_aligned_KL.csv')" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "code", 1009 | "execution_count": 9, 1010 | "metadata": { 1011 | "id": "Vn4-q554Udy_" 1012 | }, 1013 | "outputs": [], 1014 | "source": [ 1015 | "# https://github.com/Lailabcode/DeepSCM/blob/main/deepscm-master/seq_preprocessing.py\n", 1016 | "\n", 1017 | "def seq_preprocessing():\n", 1018 | " infile_H = pd.read_csv('seq_aligned_H.csv')\n", 1019 | " infile_L = pd.read_csv('seq_aligned_KL.csv')\n", 1020 | " outfile = open('seq_aligned_HL.txt', \"w\")\n", 1021 | "\n", 1022 | " H_inclusion_list = ['1','2','3','4','5','6','7','8','9','10', \\\n", 1023 | " '11','12','13','14','15','16','17','18','19','20', \\\n", 1024 | " '21','22','23','24','25','26','27','28','29','30', \\\n", 1025 | " '31','32','33','34','35','36','37','38','39','40', \\\n", 1026 | " '41','42','43','44','45','46','47','48','49','50', \\\n", 1027 | " '51','52','53','54','55','56','57','58','59','60', \\\n", 1028 | " '61','62','63','64','65','66','67','68','69','70', \\\n", 1029 | " '71','72','73','74','75','76','77','78','79','80', \\\n", 1030 | " '81','82','83','84','85','86','87','88','89','90', \\\n", 1031 | " '91','92','93','94','95','96','97','98','99','100', 
\\\n", 1032 | " '101','102','103','104','105','106','107','108','109','110', \\\n", 1033 | " '111','111A','111B','111C','111D','111E','111F','111G','111H', \\\n", 1034 | " '112I','112H','112G','112F','112E','112D','112C','112B','112A','112',\\\n", 1035 | " '113','114','115','116','117','118','119','120', \\\n", 1036 | " '121','122','123','124','125','126','127','128']\n", 1037 | "\n", 1038 | " L_inclusion_list = ['1','2','3','4','5','6','7','8','9','10', \\\n", 1039 | " '11','12','13','14','15','16','17','18','19','20', \\\n", 1040 | " '21','22','23','24','25','26','27','28','29','30', \\\n", 1041 | " '31','32','33','34','35','36','37','38','39','40', \\\n", 1042 | " '41','42','43','44','45','46','47','48','49','50', \\\n", 1043 | " '51','52','53','54','55','56','57','58','59','60', \\\n", 1044 | " '61','62','63','64','65','66','67','68','69','70', \\\n", 1045 | " '71','72','73','74','75','76','77','78','79','80', \\\n", 1046 | " '81','82','83','84','85','86','87','88','89','90', \\\n", 1047 | " '91','92','93','94','95','96','97','98','99','100', \\\n", 1048 | " '101','102','103','104','105','106','107','108','109','110', \\\n", 1049 | " '111','112','113','114','115','116','117','118','119','120', \\\n", 1050 | " '121','122','123','124','125','126','127']\n", 1051 | "\n", 1052 | " H_dict = {'1': 0, '2':1, '3':2, '4':3, '5':4, '6':5, '7':6, '8':7, '9':8, '10':9, \\\n", 1053 | " '11':10, '12':11, '13':12, '14':13, '15':14, '16':15, '17':16, '18':17, '19':18, '20':19, \\\n", 1054 | " '21':20, '22':21, '23':22, '24':23, '25':24, '26':25, '27':26, '28':27, '29':28, '30':29, \\\n", 1055 | " '31':30, '32':31, '33':32, '34':33, '35':34, '36':35, '37':36, '38':37, '39':38, '40':39, \\\n", 1056 | " '41':40, '42':41, '43':42, '44':43, '45':44, '46':45, '47':46, '48':47, '49':48, '50':49, \\\n", 1057 | " '51':50, '52':51, '53':52, '54':53, '55':54, '56':55, '57':56, '58':57, '59':58, '60':59, \\\n", 1058 | " '61':60, '62':61, '63':62, '64':63, '65':64, '66':65, '67':66, 
'68':67, '69':68, '70':69, \\\n", 1059 | " '71':70, '72':71, '73':72, '74':73, '75':74, '76':75, '77':76, '78':77, '79':78, '80':79, \\\n", 1060 | " '81':80, '82':81, '83':82, '84':83, '85':84, '86':85, '87':86, '88':87, '89':88, '90':89, \\\n", 1061 | " '91':90, '92':91, '93':92, '94':93, '95':94, '96':95, '97':96, '98':97, '99':98, '100':99, \\\n", 1062 | " '101':100,'102':101,'103':102,'104':103,'105':104,'106':105,'107':106,'108':107,'109':108,'110':109, \\\n", 1063 | " '111':110,'111A':111,'111B':112,'111C':113,'111D':114,'111E':115,'111F':116,'111G':117,'111H':118, \\\n", 1064 | " '112I':119,'112H':120,'112G':121,'112F':122,'112E':123,'112D':124,'112C':125,'112B':126,'112A':127,'112':128, \\\n", 1065 | " '113':129,'114':130,'115':131,'116':132,'117':133,'118':134,'119':135,'120':136, \\\n", 1066 | " '121':137,'122':138,'123':139,'124':140,'125':141,'126':142,'127':143,'128':144}\n", 1067 | "\n", 1068 | " L_dict = {'1': 0, '2':1, '3':2, '4':3, '5':4, '6':5, '7':6, '8':7, '9':8, '10':9, \\\n", 1069 | " '11':10, '12':11, '13':12, '14':13, '15':14, '16':15, '17':16, '18':17, '19':18, '20':19, \\\n", 1070 | " '21':20, '22':21, '23':22, '24':23, '25':24, '26':25, '27':26, '28':27, '29':28, '30':29, \\\n", 1071 | " '31':30, '32':31, '33':32, '34':33, '35':34, '36':35, '37':36, '38':37, '39':38, '40':39, \\\n", 1072 | " '41':40, '42':41, '43':42, '44':43, '45':44, '46':45, '47':46, '48':47, '49':48, '50':49, \\\n", 1073 | " '51':50, '52':51, '53':52, '54':53, '55':54, '56':55, '57':56, '58':57, '59':58, '60':59, \\\n", 1074 | " '61':60, '62':61, '63':62, '64':63, '65':64, '66':65, '67':66, '68':67, '69':68, '70':69, \\\n", 1075 | " '71':70, '72':71, '73':72, '74':73, '75':74, '76':75, '77':76, '78':77, '79':78, '80':79, \\\n", 1076 | " '81':80, '82':81, '83':82, '84':83, '85':84, '86':85, '87':86, '88':87, '89':88, '90':89, \\\n", 1077 | " '91':90, '92':91, '93':92, '94':93, '95':94, '96':95, '97':96, '98':97, '99':98, '100':99, \\\n", 1078 | " 
'101':100,'102':101,'103':102,'104':103,'105':104,'106':105,'107':106,'108':107,'109':108,'110':109, \\\n", 1079 | " '111':110,'112':111,'113':112,'114':113,'115':114,'116':115,'117':116,'118':117,'119':118,'120':119, \\\n", 1080 | " '121':120,'122':121,'123':122,'124':123,'125':124,'126':125,'127':126,'128':127}\n", 1081 | "\n", 1082 | "\n", 1083 | " N_mAbs = len(infile_H[\"Id\"])\n", 1084 | "\n", 1085 | " for i in range(N_mAbs):\n", 1086 | " H_tmp = 145*['-']\n", 1087 | " L_tmp = 127*['-']\n", 1088 | " for col in infile_H.columns:\n", 1089 | " if(col in H_inclusion_list):\n", 1090 | " H_tmp[H_dict[col]]=infile_H.iloc[i][col]\n", 1091 | " for col in infile_L.columns:\n", 1092 | " if(col in L_inclusion_list):\n", 1093 | " L_tmp[L_dict[col]]=infile_L.iloc[i][col]\n", 1094 | "\n", 1095 | " aa_string = ''\n", 1096 | " for aa in H_tmp+L_tmp:\n", 1097 | " aa_string += aa\n", 1098 | " outfile.write(infile_H.iloc[i,0]+\" \"+aa_string)\n", 1099 | " outfile.write(\"\\n\")\n", 1100 | "\n", 1101 | " outfile.close()\n", 1102 | " return\n", 1103 | "\n", 1104 | "seq_preprocessing()" 1105 | ] 1106 | }, 1107 | { 1108 | "cell_type": "markdown", 1109 | "metadata": { 1110 | "id": "s0SoIZ19Un54" 1111 | }, 1112 | "source": [ 1113 | "Read Aligned Sequence" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "code", 1118 | "execution_count": 2, 1119 | "metadata": { 1120 | "id": "8maPu9TsUnU0" 1121 | }, 1122 | "outputs": [], 1123 | "source": [ 1124 | "def load_input_data(filename):\n", 1125 | " name_list=[]\n", 1126 | " seq_list=[]\n", 1127 | " with open(filename) as datafile:\n", 1128 | " for line in datafile:\n", 1129 | " line = line.strip().split()\n", 1130 | " name_list.append(line[0])\n", 1131 | " seq_list.append(line[1])\n", 1132 | " return name_list, seq_list" 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": 11, 1138 | "metadata": { 1139 | "id": "vjQU_ae6Usqq" 1140 | }, 1141 | "outputs": [], 1142 | "source": [ 1143 | "name_list, seq_list = 
load_input_data('seq_aligned_HL.txt')\n", 1144 | "X = seq_list" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "markdown", 1149 | "metadata": { 1150 | "id": "KVJcM7emVAqS" 1151 | }, 1152 | "source": [ 1153 | "One Hot Encoding of Aligned Sequence" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": 12, 1159 | "metadata": { 1160 | "id": "QTUshyyHVFbI" 1161 | }, 1162 | "outputs": [], 1163 | "source": [ 1164 | "def one_hot_encoder(s):\n", 1165 | " d = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20}\n", 1166 | "\n", 1167 | " x = np.zeros((len(d), len(s)))\n", 1168 | " x[[d[c] for c in s], range(len(s))] = 1\n", 1169 | "\n", 1170 | " return x" 1171 | ] 1172 | }, 1173 | { 1174 | "cell_type": "code", 1175 | "execution_count": 13, 1176 | "metadata": { 1177 | "id": "v-JnQUxuVPr9" 1178 | }, 1179 | "outputs": [], 1180 | "source": [ 1181 | "X = [one_hot_encoder(s=x) for x in X]\n", 1182 | "X = np.transpose(np.asarray(X), (0, 2, 1))\n", 1183 | "X = np.asarray(X)" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "markdown", 1188 | "metadata": { 1189 | "id": "QrdyGSKQWv0V" 1190 | }, 1191 | "source": [ 1192 | "Predict DeepSP Predictor" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": 14, 1198 | "metadata": { 1199 | "colab": { 1200 | "base_uri": "https://localhost:8080/" 1201 | }, 1202 | "id": "3Xyrqu5dXwxq", 1203 | "outputId": "84dede34-3550-4fa5-d28e-c8bc27c8e2e1" 1204 | }, 1205 | "outputs": [ 1206 | { 1207 | "name": "stdout", 1208 | "output_type": "stream", 1209 | "text": [ 1210 | "1/1 [==============================] - 0s 424ms/step\n", 1211 | "1/1 [==============================] - 0s 178ms/step\n", 1212 | "1/1 [==============================] - 0s 181ms/step\n" 1213 | ] 1214 | } 1215 | ], 1216 | "source": [ 1217 | "# sappos\n", 1218 | "json_file = open('Conv1D_regressionSAPpos.json', 
'r')\n", 1219 | "loaded_model_json = json_file.read()\n", 1220 | "json_file.close()\n", 1221 | "loaded_model = model_from_json(loaded_model_json)\n", 1222 | "# load weights into model\n", 1223 | "loaded_model.load_weights(\"Conv1D_regression_SAPpos.h5\")\n", 1224 | "loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae'])\n", 1225 | "sap_pos = loaded_model.predict(X)\n", 1226 | "\n", 1227 | "# scmneg\n", 1228 | "json_file = open('Conv1D_regressionSCMneg.json', 'r')\n", 1229 | "loaded_model_json = json_file.read()\n", 1230 | "json_file.close()\n", 1231 | "loaded_model = model_from_json(loaded_model_json)\n", 1232 | "# load weights into model\n", 1233 | "loaded_model.load_weights(\"Conv1D_regression_SCMneg.h5\")\n", 1234 | "loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae'])\n", 1235 | "scm_neg = loaded_model.predict(X)\n", 1236 | "\n", 1237 | "# scmpos\n", 1238 | "json_file = open('Conv1D_regressionSCMpos.json', 'r')\n", 1239 | "loaded_model_json = json_file.read()\n", 1240 | "json_file.close()\n", 1241 | "loaded_model = model_from_json(loaded_model_json)\n", 1242 | "# load weights into model\n", 1243 | "loaded_model.load_weights(\"Conv1D_regression_SCMpos.h5\")\n", 1244 | "loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae'])\n", 1245 | "scm_pos = loaded_model.predict(X)" 1246 | ] 1247 | }, 1248 | { 1249 | "cell_type": "code", 1250 | "execution_count": 15, 1251 | "metadata": { 1252 | "colab": { 1253 | "base_uri": "https://localhost:8080/", 1254 | "height": 600 1255 | }, 1256 | "id": "a-FfSETHegbu", 1257 | "outputId": "5e5c0891-7fe1-46d2-b5b2-2ff93195f0af" 1258 | }, 1259 | "outputs": [ 1260 | { 1261 | "data": { 1262 | "application/vnd.google.colaboratory.intrinsic+json": { 1263 | "type": "dataframe", 1264 | "variable_name": "df" 1265 | }, 1266 | "text/html": [ 1267 | "\n", 1268 | "
\n", 1269 | "
\n", 1270 | "\n", 1283 | "\n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " 
\n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | 
" \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | " \n", 1638 | " \n", 1639 | " \n", 1640 | " \n", 1641 | " \n", 1642 | " \n", 1643 | " \n", 1644 | " \n", 1645 | " \n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | "
NameSAP_pos_CDRH1SAP_pos_CDRH2SAP_pos_CDRH3SAP_pos_CDRL1SAP_pos_CDRL2SAP_pos_CDRL3SAP_pos_CDRSAP_pos_HvSAP_pos_Lv...SCM_pos_CDRH1SCM_pos_CDRH2SCM_pos_CDRH3SCM_pos_CDRL1SCM_pos_CDRL2SCM_pos_CDRL3SCM_pos_CDRSCM_pos_HvSCM_pos_LvSCM_pos_Fv
0mAb12.1347832.52424514.4450711.9047403.5893513.17218227.49679458.41764830.517651...3.18318219.58335729.513483116.76950141.75936155.548267263.838928907.1130371219.4444582109.088623
1mAb21.8445764.3391178.9425921.61396810.4179409.94527138.40904258.48686244.568775...27.88926522.455563154.70430023.50365437.77193190.647194360.7615661224.5612791132.7552492335.686523
2mAb32.8094251.74609620.8086470.6179713.4609755.03178234.87711363.38225244.274189...71.39997929.84519441.84418537.09491339.44297822.183475246.8805081165.597656830.0642091963.784912
3mAb43.1396670.30068726.2609901.8631880.1880026.29535538.34285773.46613338.451836...27.55229211.26079256.28306613.80064266.96466850.259052225.270020993.9152221097.4565432073.500244
4mAb52.4890590.11188215.9674132.5538480.5648271.84845523.76520761.60302733.180843...42.20830910.15164877.493065142.88407981.86649361.183517415.9604801010.6287841221.2348632209.069336
5mAb69.6450242.26556123.0642416.4641990.9875258.05227749.34542161.48750748.577274...45.50165918.139217136.59309418.75672171.84088166.386726359.8916021329.4210211113.3817142426.393555
6mAb78.2228172.83804911.4146514.3154112.6735875.90856536.57814850.77877444.486637...52.31182174.92362282.05307072.28047945.87950988.023949428.9163511007.111145848.1044921808.066040
7mAb84.4242392.09597216.0190222.5232477.3551271.79732734.04642960.67822334.210297...51.687237-3.79564391.02833614.37737923.03526732.922684198.8403471157.940308944.0832522085.386963
8mAb91.4282193.4492055.4590982.1332442.7710938.27071623.19862245.44029237.316959...15.4072488.21823046.5402988.14966017.47427635.439575130.6155241107.602173954.6622922039.475220
9mAb102.3419850.19696810.5348144.1434416.52896815.28079840.83979455.97292349.084797...25.99439814.51695826.37035619.89388569.95530726.277246174.2610171044.7249761139.7918702159.286621
10mAb113.0509379.0657685.5949491.3000724.0100726.86201530.16391455.92456134.403553...46.80364244.56305790.80063629.547958120.15985928.398201362.1413271422.7878421272.1237792652.440430
11mAb122.9034810.37731711.4683906.6137255.5203601.47786228.12219451.99200153.377674...35.554497-3.779891-0.063098125.55715243.82605473.312820267.234039969.9224851093.7979742053.877930
12mAb132.8351681.4580579.9940592.1095353.5172813.28773723.88748946.27349135.819031...114.27550521.51770452.248070111.76911984.82471586.854416469.4820861137.8215331275.5649412417.019531
13mAb142.5899874.79150220.6677492.3084636.2068005.44844043.31604863.52381538.004623...45.41270426.94136664.77358262.668564109.24386633.384670342.1687931134.1562501143.9604492254.151367
14mAb152.7692193.56867514.4866799.5994162.9806655.31449238.96510348.76808539.433372...52.21768633.02408668.11985048.751259103.54825625.522581326.2986151186.5108641266.1505132435.450439
15mAb162.0717247.66590919.8764464.2670053.1848406.35070244.84572268.78134937.064419...33.74091322.686165151.18983541.30941068.02310932.488457351.1022641164.1860351198.9584962334.452393
\n", 1697 | "

16 rows × 31 columns

\n", 1698 | "
\n", 1699 | "
\n", 1700 | "\n", 1701 | "
\n", 1702 | " \n", 1710 | "\n", 1711 | " \n", 1751 | "\n", 1752 | " \n", 1776 | "
\n", 1777 | "\n", 1778 | "\n", 1779 | "
\n", 1780 | " \n", 1791 | "\n", 1792 | "\n", 1881 | "\n", 1882 | " \n", 1904 | "
\n", 1905 | "
\n", 1906 | "
\n" 1907 | ], 1908 | "text/plain": [ 1909 | " Name SAP_pos_CDRH1 SAP_pos_CDRH2 SAP_pos_CDRH3 SAP_pos_CDRL1 \\\n", 1910 | "0 mAb1 2.134783 2.524245 14.445071 1.904740 \n", 1911 | "1 mAb2 1.844576 4.339117 8.942592 1.613968 \n", 1912 | "2 mAb3 2.809425 1.746096 20.808647 0.617971 \n", 1913 | "3 mAb4 3.139667 0.300687 26.260990 1.863188 \n", 1914 | "4 mAb5 2.489059 0.111882 15.967413 2.553848 \n", 1915 | "5 mAb6 9.645024 2.265561 23.064241 6.464199 \n", 1916 | "6 mAb7 8.222817 2.838049 11.414651 4.315411 \n", 1917 | "7 mAb8 4.424239 2.095972 16.019022 2.523247 \n", 1918 | "8 mAb9 1.428219 3.449205 5.459098 2.133244 \n", 1919 | "9 mAb10 2.341985 0.196968 10.534814 4.143441 \n", 1920 | "10 mAb11 3.050937 9.065768 5.594949 1.300072 \n", 1921 | "11 mAb12 2.903481 0.377317 11.468390 6.613725 \n", 1922 | "12 mAb13 2.835168 1.458057 9.994059 2.109535 \n", 1923 | "13 mAb14 2.589987 4.791502 20.667749 2.308463 \n", 1924 | "14 mAb15 2.769219 3.568675 14.486679 9.599416 \n", 1925 | "15 mAb16 2.071724 7.665909 19.876446 4.267005 \n", 1926 | "\n", 1927 | " SAP_pos_CDRL2 SAP_pos_CDRL3 SAP_pos_CDR SAP_pos_Hv SAP_pos_Lv ... \\\n", 1928 | "0 3.589351 3.172182 27.496794 58.417648 30.517651 ... \n", 1929 | "1 10.417940 9.945271 38.409042 58.486862 44.568775 ... \n", 1930 | "2 3.460975 5.031782 34.877113 63.382252 44.274189 ... \n", 1931 | "3 0.188002 6.295355 38.342857 73.466133 38.451836 ... \n", 1932 | "4 0.564827 1.848455 23.765207 61.603027 33.180843 ... \n", 1933 | "5 0.987525 8.052277 49.345421 61.487507 48.577274 ... \n", 1934 | "6 2.673587 5.908565 36.578148 50.778774 44.486637 ... \n", 1935 | "7 7.355127 1.797327 34.046429 60.678223 34.210297 ... \n", 1936 | "8 2.771093 8.270716 23.198622 45.440292 37.316959 ... \n", 1937 | "9 6.528968 15.280798 40.839794 55.972923 49.084797 ... \n", 1938 | "10 4.010072 6.862015 30.163914 55.924561 34.403553 ... \n", 1939 | "11 5.520360 1.477862 28.122194 51.992001 53.377674 ... 
\n", 1940 | "12 3.517281 3.287737 23.887489 46.273491 35.819031 ... \n", 1941 | "13 6.206800 5.448440 43.316048 63.523815 38.004623 ... \n", 1942 | "14 2.980665 5.314492 38.965103 48.768085 39.433372 ... \n", 1943 | "15 3.184840 6.350702 44.845722 68.781349 37.064419 ... \n", 1944 | "\n", 1945 | " SCM_pos_CDRH1 SCM_pos_CDRH2 SCM_pos_CDRH3 SCM_pos_CDRL1 SCM_pos_CDRL2 \\\n", 1946 | "0 3.183182 19.583357 29.513483 116.769501 41.759361 \n", 1947 | "1 27.889265 22.455563 154.704300 23.503654 37.771931 \n", 1948 | "2 71.399979 29.845194 41.844185 37.094913 39.442978 \n", 1949 | "3 27.552292 11.260792 56.283066 13.800642 66.964668 \n", 1950 | "4 42.208309 10.151648 77.493065 142.884079 81.866493 \n", 1951 | "5 45.501659 18.139217 136.593094 18.756721 71.840881 \n", 1952 | "6 52.311821 74.923622 82.053070 72.280479 45.879509 \n", 1953 | "7 51.687237 -3.795643 91.028336 14.377379 23.035267 \n", 1954 | "8 15.407248 8.218230 46.540298 8.149660 17.474276 \n", 1955 | "9 25.994398 14.516958 26.370356 19.893885 69.955307 \n", 1956 | "10 46.803642 44.563057 90.800636 29.547958 120.159859 \n", 1957 | "11 35.554497 -3.779891 -0.063098 125.557152 43.826054 \n", 1958 | "12 114.275505 21.517704 52.248070 111.769119 84.824715 \n", 1959 | "13 45.412704 26.941366 64.773582 62.668564 109.243866 \n", 1960 | "14 52.217686 33.024086 68.119850 48.751259 103.548256 \n", 1961 | "15 33.740913 22.686165 151.189835 41.309410 68.023109 \n", 1962 | "\n", 1963 | " SCM_pos_CDRL3 SCM_pos_CDR SCM_pos_Hv SCM_pos_Lv SCM_pos_Fv \n", 1964 | "0 55.548267 263.838928 907.113037 1219.444458 2109.088623 \n", 1965 | "1 90.647194 360.761566 1224.561279 1132.755249 2335.686523 \n", 1966 | "2 22.183475 246.880508 1165.597656 830.064209 1963.784912 \n", 1967 | "3 50.259052 225.270020 993.915222 1097.456543 2073.500244 \n", 1968 | "4 61.183517 415.960480 1010.628784 1221.234863 2209.069336 \n", 1969 | "5 66.386726 359.891602 1329.421021 1113.381714 2426.393555 \n", 1970 | "6 88.023949 428.916351 1007.111145 848.104492 
1808.066040 \n", 1971 | "7 32.922684 198.840347 1157.940308 944.083252 2085.386963 \n", 1972 | "8 35.439575 130.615524 1107.602173 954.662292 2039.475220 \n", 1973 | "9 26.277246 174.261017 1044.724976 1139.791870 2159.286621 \n", 1974 | "10 28.398201 362.141327 1422.787842 1272.123779 2652.440430 \n", 1975 | "11 73.312820 267.234039 969.922485 1093.797974 2053.877930 \n", 1976 | "12 86.854416 469.482086 1137.821533 1275.564941 2417.019531 \n", 1977 | "13 33.384670 342.168793 1134.156250 1143.960449 2254.151367 \n", 1978 | "14 25.522581 326.298615 1186.510864 1266.150513 2435.450439 \n", 1979 | "15 32.488457 351.102264 1164.186035 1198.958496 2334.452393 \n", 1980 | "\n", 1981 | "[16 rows x 31 columns]" 1982 | ] 1983 | }, 1984 | "execution_count": 15, 1985 | "metadata": {}, 1986 | "output_type": "execute_result" 1987 | } 1988 | ], 1989 | "source": [ 1990 | "features = ['Name', 'SAP_pos_CDRH1','SAP_pos_CDRH2','SAP_pos_CDRH3','SAP_pos_CDRL1','SAP_pos_CDRL2','SAP_pos_CDRL3','SAP_pos_CDR','SAP_pos_Hv','SAP_pos_Lv','SAP_pos_Fv',\n", 1991 | " 'SCM_neg_CDRH1','SCM_neg_CDRH2','SCM_neg_CDRH3','SCM_neg_CDRL1','SCM_neg_CDRL2','SCM_neg_CDRL3','SCM_neg_CDR','SCM_neg_Hv','SCM_neg_Lv','SCM_neg_Fv',\n", 1992 | " 'SCM_pos_CDRH1','SCM_pos_CDRH2','SCM_pos_CDRH3','SCM_pos_CDRL1','SCM_pos_CDRL2','SCM_pos_CDRL3','SCM_pos_CDR','SCM_pos_Hv','SCM_pos_Lv','SCM_pos_Fv']\n", 1993 | "df = pd.concat([pd.DataFrame(name_list), pd.DataFrame(sap_pos), pd.DataFrame(scm_neg), pd.DataFrame(scm_pos)], ignore_index=True, axis=1,); df.columns = features\n", 1994 | "df.to_csv('DeepSP_descriptors.csv', index=False)\n", 1995 | "df" 1996 | ] 1997 | }, 1998 | { 1999 | "cell_type": "code", 2000 | "execution_count": 15, 2001 | "metadata": { 2002 | "id": "R0Gi5_po05Ct" 2003 | }, 2004 | "outputs": [], 2005 | "source": [] 2006 | } 2007 | ], 2008 | "metadata": { 2009 | "colab": { 2010 | "provenance": [] 2011 | }, 2012 | "kernelspec": { 2013 | "display_name": "Python 3 (ipykernel)", 2014 | "language": "python", 2015 
| "name": "python3" 2016 | }, 2017 | "language_info": { 2018 | "codemirror_mode": { 2019 | "name": "ipython", 2020 | "version": 3 2021 | }, 2022 | "file_extension": ".py", 2023 | "mimetype": "text/x-python", 2024 | "name": "python", 2025 | "nbconvert_exporter": "python", 2026 | "pygments_lexer": "ipython3", 2027 | "version": "3.9.13" 2028 | } 2029 | }, 2030 | "nbformat": 4, 2031 | "nbformat_minor": 1 2032 | } 2033 | -------------------------------------------------------------------------------- /Conv1D_regressionSAPpos.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "model_conv1D", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 272, 21], "dtype": "float32", "sparse": false, "ragged": false, "name": "input_1"}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_1", "trainable": true, "dtype": "float32", "filters": 128, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": 
"dropout", "trainable": true, "dtype": "float32", "rate": 0.3, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_2", "trainable": true, "dtype": "float32", "filters": 96, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_1", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_3", "trainable": true, "dtype": "float32", "filters": 32, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_2", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": 
"Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "MaxPooling1D", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}}, {"class_name": "Flatten", "config": {"name": "flatten", "trainable": true, "dtype": "float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "Dense_1", "trainable": true, "dtype": "float32", "units": 112, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_2", "trainable": true, "dtype": "float32", "units": 48, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_3", "trainable": true, "dtype": "float32", "units": 10, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.11.0", "backend": "tensorflow"} 
-------------------------------------------------------------------------------- /Conv1D_regressionSCMneg.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "model_conv1D", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 272, 21], "dtype": "float32", "sparse": false, "ragged": false, "name": "input_3"}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_1", "trainable": true, "dtype": "float32", "filters": 128, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_6", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_4", "trainable": true, "dtype": "float32", "rate": 0.1, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_2", "trainable": true, "dtype": "float32", "filters": 112, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": 
"GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_7", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_3", "trainable": true, "dtype": "float32", "filters": 64, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_8", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "MaxPooling1D", "trainable": true, 
"dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}}, {"class_name": "Flatten", "config": {"name": "flatten_2", "trainable": true, "dtype": "float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "Dense_1", "trainable": true, "dtype": "float32", "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_2", "trainable": true, "dtype": "float32", "units": 10, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.11.0", "backend": "tensorflow"} -------------------------------------------------------------------------------- /Conv1D_regressionSCMpos.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": {"name": "model_conv1D", "layers": [{"class_name": "InputLayer", "config": {"batch_input_shape": [null, 272, 21], "dtype": "float32", "sparse": false, "ragged": false, "name": "input_2"}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_1", "trainable": true, "dtype": "float32", "filters": 128, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, 
"kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_3", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "dtype": "float32", "rate": 0.4, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_2", "trainable": true, "dtype": "float32", "filters": 112, "kernel_size": [4], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_4", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": 
"dropout_2", "trainable": true, "dtype": "float32", "rate": 0.4, "noise_shape": null, "seed": null}}, {"class_name": "Conv1D", "config": {"name": "Conv1D_3", "trainable": true, "dtype": "float32", "filters": 144, "kernel_size": [5], "strides": [1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1], "groups": 1, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "BatchNormalization", "config": {"name": "batch_normalization_5", "trainable": true, "dtype": "float32", "axis": [2], "momentum": 0.99, "epsilon": 0.001, "center": true, "scale": true, "beta_initializer": {"class_name": "Zeros", "config": {}}, "gamma_initializer": {"class_name": "Ones", "config": {}}, "moving_mean_initializer": {"class_name": "Zeros", "config": {}}, "moving_variance_initializer": {"class_name": "Ones", "config": {}}, "beta_regularizer": null, "gamma_regularizer": null, "beta_constraint": null, "gamma_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "dtype": "float32", "rate": 0.0, "noise_shape": null, "seed": null}}, {"class_name": "MaxPooling1D", "config": {"name": "MaxPooling1D", "trainable": true, "dtype": "float32", "strides": [2], "pool_size": [2], "padding": "valid", "data_format": "channels_last"}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true, "dtype": "float32", "data_format": "channels_last"}}, {"class_name": "Dense", "config": {"name": "Dense_1", "trainable": true, "dtype": "float32", "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, 
"bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dense", "config": {"name": "Dense_2", "trainable": true, "dtype": "float32", "units": 10, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "GlorotUniform", "config": {"seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}]}, "keras_version": "2.11.0", "backend": "tensorflow"} -------------------------------------------------------------------------------- /Conv1D_regression_SAPpos.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lailabcode/DeepSP/4ba0118fa8e95873daf4a4dcef52beca08b67ab5/Conv1D_regression_SAPpos.h5 -------------------------------------------------------------------------------- /Conv1D_regression_SCMneg.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lailabcode/DeepSP/4ba0118fa8e95873daf4a4dcef52beca08b67ab5/Conv1D_regression_SCMneg.h5 -------------------------------------------------------------------------------- /Conv1D_regression_SCMpos.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lailabcode/DeepSP/4ba0118fa8e95873daf4a4dcef52beca08b67ab5/Conv1D_regression_SCMpos.h5 -------------------------------------------------------------------------------- /DeepSP-app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Aug 22 17:32:38 2023 4 | 5 | @author: plai3 6 | """ 7 | 8 | import streamlit as st 9 | import numpy as np 10 | import pandas as pd 11 | 12 | from keras.models import model_from_json 13 | 14 | from Bio import SeqIO 15 | from io import 
from io import StringIO
from anarci import anarci

# Residue alphabet the encoder understands: the 20 standard amino acids plus
# '-' for an unoccupied alignment column. The order fixes the channel index.
_AA_INDEX = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7,
             'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
             'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20}


def one_hot_encoder(s):
    """Return the one-hot encoding of the aligned sequence *s*.

    Args:
        s: String made only of the characters in ``_AA_INDEX``.

    Returns:
        A ``(21, len(s))`` float array holding a single 1 per column, in the
        row of that column's residue.

    Raises:
        KeyError: If *s* contains a character outside ``_AA_INDEX``.
    """
    d = _AA_INDEX

    x = np.zeros((len(d), len(s)))
    x[[d[c] for c in s], range(len(s))] = 1

    return x


# IMGT positions, in column order, that make up the heavy-chain template.
# The CDR3 insertions run 111A..111H and then 112I..112A before 112 itself.
H_inclusion_list = (
    [str(n) for n in range(1, 112)]
    + ['111' + code for code in 'ABCDEFGH']
    + ['112' + code for code in 'IHGFEDCBA']
    + [str(n) for n in range(112, 129)]
)

# IMGT positions, in column order, that make up the light-chain template.
L_inclusion_list = [str(n) for n in range(1, 128)]

# Position label -> template column. Derived from the lists above so the two
# can never disagree. The hand-written light-chain table used to map '128' to
# column 127 as well, which lies outside the 127-column light-chain template.
H_dict = {position: column for column, position in enumerate(H_inclusion_list)}
L_dict = {position: column for column, position in enumerate(L_inclusion_list)}


def _place_residues(domain_numbering, column_of, width):
    """Lay numbered residues onto a gap-filled template of *width* columns.

    Args:
        domain_numbering: Iterable of ``((number, insertion), residue)`` items
            as found in the first element of an ANARCI domain entry.
        column_of: Mapping from position label (e.g. ``'111A'``) to column.
        width: Number of columns in the template.

    Returns:
        ``(template, unsupported)``: the filled list of single characters and
        the position labels that carried a residue but have no column.
    """
    template = width * ['-']
    unsupported = []
    for (number, insertion), residue in domain_numbering:
        if residue == '-':
            # The template is already a gap here, nothing to write.
            continue
        position = (str(number) + insertion).replace(" ", "")
        column = column_of.get(position)
        if column is None:
            unsupported.append(position)
        else:
            template[column] = residue
    return template, unsupported


st.set_page_config(
    page_title="DeepSP App",
    layout="centered",
    )

st.title('DeepSP')
st.header('Deep learning-based antibody structural properties')
st.subheader('The FASTA file format is H_seq/L_seq (variable regions)')

# Raw string: '\>' is not a valid escape sequence in a normal literal.
st.markdown(r'''
### EXAMPLE:
\>6p8n
QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMNWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARGKNSDYNWDFQHWGQGTLVTVSS/DIVMSQSPSSLAVSVGEKVTMSCKSSQSLLYSSNQKNYLAWYQQKPGQSPKLLIYWASTRESGVPDRFTGSGSGTDFTLTISSVKAEDLAVYYCQQYEMFGGGTKLEIK
''')

seq_file = st.file_uploader("#### Upload your FASTA file", type=['fasta'])
if seq_file is not None:
    stringio = StringIO(seq_file.getvalue().decode("utf-8"))
    sequences_H = []
    sequences_L = []
    for record in SeqIO.parse(stringio, 'fasta'):
        name = str(record.id)
        sequence_H, separator, sequence_L = str(record.seq).partition('/')
        # Each record must be exactly "heavy/light"; anything else used to
        # abort the whole upload with an unpacking error.
        if not (separator and sequence_H and sequence_L) or '/' in sequence_L:
            st.warning(f'{name}: expected the format H_seq/L_seq; record skipped.')
            continue
        sequences_H.append((name, sequence_H))
        sequences_L.append((name, sequence_L))

    if not sequences_H:
        st.error('No usable H_seq/L_seq records were found in the uploaded file.')
        st.stop()

    results_H = anarci(sequences_H, scheme="imgt", output=False)
    results_L = anarci(sequences_L, scheme="imgt", output=False)
    numbering_H, alignment_details_H, hit_tables_H = results_H
    numbering_L, alignment_details_L, hit_tables_L = results_L

    # Build one aligned heavy+light string per antibody, remembering which
    # names actually made it through so rows and IDs stay in step.
    seq_list = []
    numbered_names = []
    for i, (name, _) in enumerate(sequences_H):
        if numbering_H[i] is None or numbering_L[i] is None:
            st.warning(f'ANARCI did not number {name}; it was skipped.')
            continue
        domain_numbering_H, start_index_H, end_index_H = numbering_H[i][0]
        domain_numbering_L, start_index_L, end_index_L = numbering_L[i][0]
        H_tmp, missing_H = _place_residues(domain_numbering_H, H_dict, len(H_inclusion_list))
        L_tmp, missing_L = _place_residues(domain_numbering_L, L_dict, len(L_inclusion_list))
        if missing_H or missing_L:
            labels = ', '.join(['H' + p for p in missing_H] + ['L' + p for p in missing_L])
            st.warning(f'{name}: IMGT position(s) {labels} are outside the supported alignment; it was skipped.')
            continue
        aa_string = ''.join(H_tmp + L_tmp)
        unknown = sorted(set(aa_string) - _AA_INDEX.keys())
        if unknown:
            st.warning(f'{name}: unsupported residue(s) {", ".join(unknown)}; it was skipped.')
            continue
        seq_list.append(aa_string)
        numbered_names.append(name)

    if not seq_list:
        st.error('None of the uploaded antibodies could be encoded.')
        st.stop()

    # name_list labels the prediction rows further down, so it must contain
    # only the antibodies that were encoded, in the same order.
    name_list = numbered_names

    # (n, 21, 272) -> (n, 272, 21): the models take positions first, channels last.
    X = np.transpose(np.asarray([one_hot_encoder(s=x) for x in seq_list]), (0, 2, 1))

    #load DeepSAP_pos model
    with open('Conv1D_regressionSAPpos.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)

    # load weights into the model
    loaded_model.load_weights('Conv1D_regression_SAPpos.h5')
    loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae'])

    # predict SAPpos
    y_pred = loaded_model.predict(X)
    df_SAPpos = pd.DataFrame(y_pred, columns=['SAP_pos_CDRH1', 'SAP_pos_CDRH2', 'SAP_pos_CDRH3',
                                              'SAP_pos_CDRL1', 'SAP_pos_CDRL2', 'SAP_pos_CDRL3',
                                              'SAP_pos_CDR', 'SAP_pos_Hv', 'SAP_pos_Lv', 'SAP_pos_Fv'])

#load DeepSCM_neg model 164 | json_file = open('Conv1D_regressionSCMneg.json', 'r') 165 | loaded_model_json = json_file.read() 166 | json_file.close() 167 | loaded_model = model_from_json(loaded_model_json) 168 | loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) 169 | 170 | # load weights into the model 171 | loaded_model.load_weights('Conv1D_regression_SCMneg.h5') 172 | loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) 173 | 174 | # predict SCMneg 175 | y_pred = loaded_model.predict(X) 176 | df_SCMneg = pd.DataFrame(y_pred, columns=['SCM_neg_CDRH1', 'SCM_neg_CDRH2', 'SCM_neg_CDRH3', 177 | 'SCM_neg_CDRL1', 'SCM_neg_CDRL2', 'SCM_neg_CDRL3', 178 | 'SCM_neg_CDR', 'SCM_neg_Hv', 'SCM_neg_Lv', 'SCM_neg_Fv']) 179 | 180 | 181 | #load DeepSCM_pos model 182 | json_file = open('Conv1D_regressionSCMpos.json', 'r') 183 | loaded_model_json = json_file.read() 184 | json_file.close() 185 | loaded_model = model_from_json(loaded_model_json) 186 | 187 | # load weights into the model 188 | loaded_model.load_weights('Conv1D_regression_SCMpos.h5') 189 | loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) 190 | 191 | # predict SCMpos 192 | y_pred = loaded_model.predict(X) 193 | df_SCMpos = pd.DataFrame(y_pred, columns=['SCM_pos_CDRH1', 'SCM_pos_CDRH2', 'SCM_pos_CDRH3', 194 | 'SCM_pos_CDRL1', 'SCM_pos_CDRL2', 'SCM_pos_CDRL3', 195 | 'SCM_pos_CDR', 'SCM_pos_Hv', 'SCM_pos_Lv', 'SCM_pos_Fv']) 196 | 197 | df_name = pd.DataFrame(name_list, columns=['ID']) 198 | 199 | df_DeepSP = pd.concat([df_name, df_SAPpos, df_SCMneg, df_SCMpos], axis=1) 200 | st.dataframe(data = df_DeepSP, use_container_width=True, hide_index=True) 201 | 202 | -------------------------------------------------------------------------------- /DeepSP_input.csv: -------------------------------------------------------------------------------- 1 | Name,Heavy_Chain,Light_Chain 2 | 
mAb1,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLEWVSAITWNSGHIDYADSVEGRFTISRDNAKNSLYLQMNSLRAEDTAVYYCAKVSYLSTASSLDYWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDVATYYCQRYNRAPYTFGQGTKVEIK 3 | mAb2,EVQLVESGGGLVQPGGSLRLSCAASGFTFSDSWIHWVRQAPGKGLEWVAWISPYGGSTYYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARRHWPGGFDYWGQGTLVTVSA,DIQMTQSPSSLSASVGDRVTITCRASQDVSTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLYHPATFGQGTKVEIK 4 | mAb3,QVQLKQSGPGLVQPSQSLSITCTVSGFSLTNYGVHWVRQSPGKGLEWLGVIWSGGNTDYNTPFTSRLSINKDNSKSQVFFKMNSLQSNDTAIYYCARALTYYDYEFAYWGQGTLVTVSA,DILLTQSPVILSVSPGERVSFSCRASQSIGTNIHWYQQRTNGSPRLLIKYASESISGIPSRFSGSGSGTDFTLSINSVESEDIADYYCQQNNNWPTTFGAGTKLELK 5 | mAb4,EVQLLESGGGLVQPGGSLRLSCAVSGFTFNSFAMSWVRQAPGKGLEWVSAISGSGGGTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYFCAKDKILWFGEPVFDYWGQGTLVTVSS,EIVLTQSPATLSLSPGERATLSCRASQSVSSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPPTFGQGTKVEIK 6 | mAb5,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSGITGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDPGTTVIMSWFDPWGQGTLVTVSS,EIVLTQSPGTLSLSPGERATLSCRASQSVRGRYLAWYQQKPGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISRLEPEDFAVFYCQQYGSSPRTFGQGTKVEIK 7 | mAb6,QVQLVESGGGVVQPGRSLRLSCAASGFIFSSYAMHWVRQAPGNGLEWVAFMSYDGSNKKYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDRGIAAGGNYYYYGMDVWGQGTTVTVSS,EIVLTQSPATLSLSPGERATLSCRASQSVYSYLAWYQQKPGQAPRLLIYDASNRATGIPARFSGSGSGTDFTLTISSLEPEDFAVYYCQQRSNWPPFTFGPGTKVDIK 8 | mAb7,EVKLEESGGGLVQPGGSMKLSCVASGFIFSNHWMNWVRQSPEKGLEWVAEIRSKSINSATHYAESVKGRFTISRDDSKSAVYLQMTDLRTEDTGVYYCSRNYYGSTYDYWGQGTTLTVSS,DILLTQSPAILSVSPGERVSFSCRASQFVGSSIHWYQQRTNGSPRLLIKYASESMSGIPSRFSGSGSGTDFTLSINTVESEDIADYYCQQSHSWPFTFGSGTNLEVK 9 | mAb8,EVQLVESGGGLVQPGGSLRLSCAVSGYSITSGYSWNWIRQAPGKGLEWVASITYDGSTNYNPSVKGRITISRDDSKNTFYLQMNSLRAEDTAVYYCARGSHYFGHWHFAVWGQGTLVTVSS,DIQLTQSPSSLSASVGDRVTITCRASQSVDYDGDSYMNWYQQKPGKAPKLLIYAASYLESGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSHEDPYTFGQGTKVEIK 10 | 
mAb9,QVQLQESGPGLVKPSETLSLTCTVSGGSVSSGDYYWTWIRQSPGKGLEWIGHIYYSGNTNYNPSLKSRLTISIDTSKTQFSLKLSSVTAADTAIYYCVRDRVTGAFDIWGQGTMVTVSS,DIQMTQSPSSLSASVGDRVTITCQASQDISNYLNWYQQKPGKAPKLLIYDASNLETGVPSRFSGSGSGTDFTFTISSLQPEDIATYFCQHFDHLPLAFGGGTKVEIK 11 | mAb10,EVQLVESGGGLVQPGGSLRLSCAASGFTFTDYTMDWVRQAPGKGLEWVADVNPNSGGSIYNQRFKGRFTLSVDRSKNTLYLQMNSLRAEDTAVYYCARNLGPSFYFDYWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCKASQDVSIGVAWYQQKPGKAPKLLIYSASYRYTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYYIYPYTFGQGTKVEIK 12 | mAb11,QVQLQESGPGLVRPSQTLSLTCTVSGYSITSDHAWSWVRQPPGRGLEWIGYISYSGITTYNPSLKSRVTMLRDTSKNQFSLRLSSVTAADTAVYYCARSLARTTAMDYWGQGSLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQDISSYLNWYQQKPGKAPKLLIYYTSRLHSGVPSRFSGSGSGTDFTFTISSLQPEDIATYYCQQGNTLPYTFGQGTKVEIK 13 | mAb12,QVQLVQSGAEVKKPGASVKVSCKGSGYTFTSYWMHWVRQAPGQRLEWIGEIDPSESNTNYNQKFKGRVTLTVDISASTAYMELSSLRSEDTAVYYCARGGYDGWDYAIDYWGQGTLVTVSS,DVVMTQSPLSLPVTPGEPASISCRSSQSLAKSYGNTYLSWYLQKPGQSPQLLIYGISNRFSGVPDRFSGSGSGTDFTLKISRVEAEDVGVYYCLQGTHQPYTFGQGTKVEIK 14 | mAb13,QLQQSGTVLARPGASVKMSCKASGYSFTRYWMHWIKQRPGQGLEWIGAIYPGNSDTSYNQKFEGKAKLTAVTSASTAYMELSSLTHEDSAVYYCSRDYGYYFDFWGQGTTLTVSS,QIVSTQSPAIMSASPGEKVTMTCSASSSRSYMQWYQQKPGTSPKRWIYDTSKLASGVPARFSGSGSGTSYSLTISSMEAEDAATYYCHQRSSYTFGGGTKLEIK 15 | mAb14,QVQLVQSGAEVKKPGASVKVSCKASGFNIKDTYIHWVRQAPGQRLEWMGRIDPANGYTKYDPKFQGRVTITADTSASTAYMELSSLRSEDEAVYYCAREGYYGNYGVYAMDYWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCKTSQDINKYMAWYQQTPGKAPRLLIHYTSALQPGIPSRFSGSGSGRDYTFTISSLQPEDIATYYCLQYDNLWTFGQGTKVEIK 16 | mAb15,QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYYIHWVRQAPGQGLEWIGCIYPGNVNTNYNEKFKDRATLTVDTSISTAYMELSRLRSDDTAVYFCTRSHYGLDWNFDVWGQGTTVTVSS,DIQMTQSPSSLSASVGDRVTITCHASQNIYVWLNWYQQKPGKAPKLLIYKASNLHTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQGQTYPYTFGGGTKVEIK 17 | mAb16,EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSQISPAGGYTNYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCARGELPYYRMSKVMDVWGQGTLVTVSS,DIQMTQSPSSLSASVGDRVTITCRASQYFSSYLAWYQQKPGKAPKLLIYGASSRASGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYLGSPPTFGQGTKVEIK 18 | -------------------------------------------------------------------------------- 
# DeepSP_model_train.py
"""Train the DeepSP Conv1D regression models (SAPpos, SCMpos, SCMneg).

For each property the script reads ``data/Deep_<prop>_data.txt``, splits the
records 60/20/20 into train/validation/test sets, trains the matching network
for 50 epochs while checkpointing the weights with the lowest validation loss,
and writes the architecture (JSON), the per-region test metrics and the
training history to the working directory.  Finally the per-property metric
files are merged into ``Final_model_metric.csv``.
"""

# Import libraries
import numpy as np
import pandas as pd
import random


from numpy.random import seed

# Import machine learning libraries
import tensorflow as tf
import keras
from keras.models import model_from_json
from keras.layers import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import keras_tuner as kt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Fix every random source that is seeded in this script.
np.random.seed(0)
random.seed(0)
tf.random.set_seed(0)


# Row index of each symbol in the one-hot matrix: 20 amino acids plus the
# alignment gap character.
_AA_INDEX = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7,
             'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14,
             'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20}


def load_input_data(filename):
    """Read a whitespace-separated training file.

    Each non-empty line is ``<name> <aligned_sequence> <score> [<score> ...]``.
    Blank lines are skipped instead of raising ``IndexError``.

    Returns:
        (name_list, seq_list, score_list) where ``score_list[i]`` is the list
        of float scores found on the i-th record.

    Raises:
        ValueError: if a score field cannot be parsed as a float.
    """
    name_list = []
    seq_list = []
    score_list = []

    with open(filename) as datafile:
        for line in datafile:
            fields = line.split()
            if not fields:
                # Tolerate empty / trailing blank lines.
                continue
            name_list.append(fields[0])
            seq_list.append(fields[1])
            score_list.append([float(value) for value in fields[2:]])
    return name_list, seq_list, score_list


def one_hot_encoder(s):
    """One-hot encode sequence ``s`` into an array of shape (21, len(s)).

    Column ``j`` has a single 1 in the row assigned to ``s[j]``.

    Raises:
        KeyError: if ``s`` contains a symbol that is not one of the 20 amino
            acid letters or ``'-'``.
    """
    d = _AA_INDEX

    x = np.zeros((len(d), len(s)))
    x[[d[c] for c in s], range(len(s))] = 1

    return x


def _encode_sequences(sequences):
    """One-hot encode ``sequences`` and return an (n, length, 21) array."""
    encoded = [one_hot_encoder(s=x) for x in sequences]
    # one_hot_encoder yields (21, length); the networks take (length, 21).
    return np.transpose(np.asarray(encoded), (0, 2, 1))


def best_model_SAPpos():
    """Build the (uncompiled) Conv1D network used for the SAPpos property.

    Input is (272, 21); output is a 10-unit linear layer.
    """
    best_model = keras.Sequential(name="model_conv1D")

    best_model.add(keras.layers.Input(shape=(272, 21)))

    best_model.add(keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', name="Conv1D_1"))
    best_model.add(BatchNormalization())
    best_model.add(keras.layers.Dropout(0.3))

    best_model.add(keras.layers.Conv1D(filters=96, kernel_size=4, activation='relu', name="Conv1D_2"))
    best_model.add(BatchNormalization())

    best_model.add(keras.layers.Conv1D(filters=32, kernel_size=5, activation='relu', name="Conv1D_3"))
    best_model.add(BatchNormalization())

    best_model.add(keras.layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"))
    best_model.add(keras.layers.Flatten())

    # Fully connected regression head.
    best_model.add(keras.layers.Dense(units=112, activation='relu', name="Dense_1"))
    best_model.add(keras.layers.Dense(units=48, activation='relu', name="Dense_2"))
    best_model.add(keras.layers.Dense(10, name="Dense_3"))

    return best_model


def best_model_SCMpos():
    """Build the (uncompiled) Conv1D network used for the SCMpos property.

    Input is (272, 21); output is a 10-unit linear layer.
    """
    best_model = keras.Sequential(name="model_conv1D")

    best_model.add(keras.layers.Input(shape=(272, 21)))

    best_model.add(keras.layers.Conv1D(filters=128, kernel_size=4, activation='relu', name="Conv1D_1"))
    best_model.add(BatchNormalization())
    best_model.add(keras.layers.Dropout(0.4))

    best_model.add(keras.layers.Conv1D(filters=112, kernel_size=4, activation='relu', name="Conv1D_2"))
    best_model.add(BatchNormalization())
    best_model.add(keras.layers.Dropout(0.4))

    best_model.add(keras.layers.Conv1D(filters=144, kernel_size=5, activation='relu', name="Conv1D_3"))
    best_model.add(BatchNormalization())
    # Rate 0.0 layer is kept so the layer sequence stays exactly as before.
    best_model.add(keras.layers.Dropout(0.0))

    best_model.add(keras.layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"))
    best_model.add(keras.layers.Flatten())

    # Fully connected regression head.
    best_model.add(keras.layers.Dense(units=128, activation='relu', name="Dense_1"))
    best_model.add(keras.layers.Dense(10, name="Dense_2"))

    return best_model


def best_model_SCMneg():
    """Build the (uncompiled) Conv1D network used for the SCMneg property.

    Input is (272, 21); output is a 10-unit linear layer.
    """
    best_model = keras.Sequential(name="model_conv1D")

    best_model.add(keras.layers.Input(shape=(272, 21)))

    best_model.add(keras.layers.Conv1D(filters=128, kernel_size=5, activation='relu', name="Conv1D_1"))
    best_model.add(BatchNormalization())
    best_model.add(keras.layers.Dropout(0.1))

    best_model.add(keras.layers.Conv1D(filters=112, kernel_size=4, activation='relu', name="Conv1D_2"))
    best_model.add(BatchNormalization())

    best_model.add(keras.layers.Conv1D(filters=64, kernel_size=4, activation='relu', name="Conv1D_3"))
    best_model.add(BatchNormalization())

    best_model.add(keras.layers.MaxPooling1D(pool_size=2, name="MaxPooling1D"))
    best_model.add(keras.layers.Flatten())

    # Fully connected regression head.
    best_model.add(keras.layers.Dense(units=128, activation='relu', name="Dense_1"))
    best_model.add(keras.layers.Dense(10, name="Dense_2"))

    return best_model


# ts = 0.2; bs = 64
filenames = ['Deep_SAPpos_data.txt', 'Deep_SCMpos_data.txt', 'Deep_SCMneg_data.txt']
models = [best_model_SAPpos(), best_model_SCMpos(), best_model_SCMneg()]
l_rates = [0.0001, 0.005, 0.0001]

# Names of the 10 output columns, in the order they appear in the data files.
reg = ['CDRH1', 'CDRH2', 'CDRH3', 'CDRL1', 'CDRL2', 'CDRL3', 'CDR', 'Hv', 'Lv', 'Fv']

for file, model, l_rate in zip(filenames, models, l_rates):
    # 'Deep_SAPpos_data.txt' -> 'SAPpos'
    prop = file.split('_')[1]

    name_list, seq_list, score_list = load_input_data("data/" + file)
    X = seq_list
    y = score_list

    # 20% test, then 25% of the remaining 80% as validation (60/20/20 overall).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

    X_train = _encode_sequences(X_train)
    X_test = _encode_sequences(X_test)
    X_val = _encode_sequences(X_val)

    y_train = np.asarray(y_train).reshape((-1, 10))
    y_test = np.asarray(y_test).reshape((-1, 10))
    y_val = np.asarray(y_val).reshape((-1, 10))

    # Compile the model with the learning rate chosen for this property.
    optimizer = Adam(learning_rate=l_rate)
    best_model = model
    best_model.compile(optimizer=optimizer, loss='mae', metrics=None)

    # Keep only the weights of the epoch with the lowest validation loss.
    # NOTE: the weights file name has an underscore before the property name
    # while the JSON file below has none; both spellings are kept as-is.
    filepath = 'Conv1D_regression_' + prop + '.h5'
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1,
                                 save_best_only=True, save_weights_only=True, mode='min')
    callbacks = [checkpoint]

    # Fit the CNN to the training set
    history = best_model.fit(x=X_train, y=y_train, shuffle=True,
                             validation_data=(X_val, y_val), epochs=50,
                             callbacks=callbacks, batch_size=64, verbose=2)

    # Save the Conv1D architecture to json
    Conv1D_regression_json = best_model.to_json()
    with open("Conv1D_regression" + prop + ".json", "w") as json_file:
        json_file.write(Conv1D_regression_json)

    # Rebuild the network from its JSON description and load the checkpointed
    # (best validation loss) weights rather than the last-epoch weights.
    pred_model = model_from_json(Conv1D_regression_json)
    pred_model.load_weights(filepath)
    pred_model.compile(optimizer=optimizer, metrics=['mae'])

    y_pred = pred_model.predict(X_test)

    best_val_loss = min(history.history['val_loss'])

    # Per-target test metrics, all computed in a single pass over the columns.
    mean_score_list = []     # mean of the true test values
    baseline_mae_list = []   # MAE of always predicting that mean
    mae_list = []            # MAE of the model
    corr_list = []           # Pearson correlation between truth and prediction
    for i in range(y_test.shape[1]):
        target = y_test[:, i]
        predicted = y_pred[:, i]

        mean_score = np.mean(target)
        baseline_prediction = np.full_like(target, mean_score)

        mean_score_list.append(mean_score)
        baseline_mae_list.append(mean_absolute_error(target, baseline_prediction))
        mae_list.append(mean_absolute_error(target, predicted))
        corr_list.append(np.corrcoef(target, predicted)[0, 1])

    # One row per region; the validation loss is a single number per model
    # and is therefore repeated on every row.
    result_dict = {
        "prop": [prop + r for r in reg],
        "Mean_score": mean_score_list,
        "Baseline_MAE": baseline_mae_list,
        "Val_loss": [best_val_loss] * len(reg),
        "MAE": mae_list,
        "R": corr_list,
    }

    # Create the DataFrame
    result_df = pd.DataFrame(result_dict)

    # Save the DataFrame to CSV
    result_df.to_csv("hyp_metric_" + prop + ".csv", index=False)

    # Save the per-epoch training history.
    his_df = pd.DataFrame(history.history)
    his_df.to_csv("his" + prop + ".csv", index=False)

# Merge the per-property metric files into one summary table.
data_frames = []
for file in filenames:
    prop = file.split('_')[1]
    infile = "hyp_metric_" + prop + ".csv"
    df = pd.read_csv(infile)
    data_frames.append(df)
concatenated_df = pd.concat(data_frames, ignore_index=True)
concatenated_df.to_csv("Final_model_metric.csv", index=False)
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Lailabcode 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepSP 2 | DeepSP is an antibody-specific surrogate model that can generate 30 spatial properties of an antibody based solely on its sequence. 3 | 4 | # How to generate descriptors (features) using DeepSP 5 | 6 | ## Option 1 - Google Colab notebook 7 | - Run 8 | 1. Prepare your input file according to the format DeepSP_input.csv 9 | 2. Run the notebook file DeepSP_predictor.ipynb 10 | 3. The DeepSP structural properties for the input sequences will be computed and saved to a CSV file - 'DeepSP_descriptor.csv'. 
11 | 12 | ## Option 2 - Linux environment 13 | - Set up (bash) - create an environment and install the necessary packages 14 | 1. conda create -n deepSP python=3.9.13 15 | 2. source activate deepSP 16 | 3. conda install -c bioconda anarci 17 | 4. pip install keras==2.11.0 tensorflow-cpu==2.11.0 scikit-learn==1.0.2 pandas numpy==1.26.4 18 | - Run 19 | 1. Prepare your input file according to the format DeepSP_input.csv 20 | 2. Run the python file deepsp_predictor.py - 'python deepsp_predictor.py' 21 | 3. The DeepSP structural properties for the input sequences will be computed and saved to a CSV file - 'DeepSP_descriptors.csv'. 22 | 23 | 24 | # Citation 25 | 26 | Kalejaye, L.; Wu, I.-E.; Terry, T.; Lai, P.-K. DeepSP: Deep Learning-Based Spatial Properties to Predict Monoclonal Antibody Stability. *Comput. Struct. Biotechnol. J.* 2024, 23, 2220–2229 (https://doi.org/10.1016/j.csbj.2024.05.029) -------------------------------------------------------------------------------- /deepsp_predictor.py: -------------------------------------------------------------------------------- 1 | # Import libraries 2 | import os 3 | os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' 4 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1' 5 | import numpy as np 6 | import pandas as pd 7 | import random 8 | from numpy.random import seed 9 | 10 | # Import machine learning libraries 11 | import tensorflow as tf 12 | from tensorflow.keras.models import model_from_json 13 | 14 | import keras 15 | from keras.models import model_from_json 16 | 17 | from Bio import SeqIO 18 | from Bio.Seq import Seq 19 | from Bio.SeqRecord import SeqRecord 20 | 21 | 22 | # Import dataset 23 | dataset = pd.read_csv('DeepSP_input.csv') # replace with your csv file, see format in DeepSP_input.csv file 24 | name = dataset['Name'].to_list() 25 | Heavy_seq = dataset['Heavy_Chain'].to_list() 26 | Light_seq = dataset['Light_Chain'].to_list() 27 | 28 | # Convert to Fasta File 29 | file_out='seq_H.fasta' 30 | with open(file_out, "w") as output_handle: 
31 | for i in range(len(name)): 32 | seq_name = name[i] 33 | seq = Heavy_seq[i] 34 | record = SeqRecord( 35 | Seq(seq), 36 | id=seq_name, 37 | name="", 38 | description="", 39 | ) 40 | SeqIO.write(record, output_handle, "fasta") 41 | 42 | file_out='seq_L.fasta' 43 | with open(file_out, "w") as output_handle: 44 | for i in range(len(name)): 45 | seq_name = name[i] 46 | seq = Light_seq[i] 47 | record = SeqRecord( 48 | Seq(seq), 49 | id=seq_name, 50 | name="", 51 | description="", 52 | ) 53 | SeqIO.write(record, output_handle, "fasta") 54 | 55 | # sequence alignment with ANARCI 56 | os.system('ANARCI -i seq_H.fasta -o seq_aligned -s imgt -r heavy --csv') 57 | os.system('ANARCI -i seq_L.fasta -o seq_aligned -s imgt -r light --csv') 58 | 59 | H_aligned = pd.read_csv('seq_aligned_H.csv') 60 | L_aligned = pd.read_csv('seq_aligned_KL.csv') 61 | 62 | #sequence preprocessing: source - https://github.com/Lailabcode/DeepSCM/blob/main/deepscm-master/seq_preprocessing.py 63 | def seq_preprocessing(): 64 | infile_H = pd.read_csv('seq_aligned_H.csv') 65 | infile_L = pd.read_csv('seq_aligned_KL.csv') 66 | outfile = open('seq_aligned_HL.txt', "w") 67 | 68 | H_inclusion_list = ['1','2','3','4','5','6','7','8','9','10', \ 69 | '11','12','13','14','15','16','17','18','19','20', \ 70 | '21','22','23','24','25','26','27','28','29','30', \ 71 | '31','32','33','34','35','36','37','38','39','40', \ 72 | '41','42','43','44','45','46','47','48','49','50', \ 73 | '51','52','53','54','55','56','57','58','59','60', \ 74 | '61','62','63','64','65','66','67','68','69','70', \ 75 | '71','72','73','74','75','76','77','78','79','80', \ 76 | '81','82','83','84','85','86','87','88','89','90', \ 77 | '91','92','93','94','95','96','97','98','99','100', \ 78 | '101','102','103','104','105','106','107','108','109','110', \ 79 | '111','111A','111B','111C','111D','111E','111F','111G','111H', \ 80 | '112I','112H','112G','112F','112E','112D','112C','112B','112A','112',\ 81 | 
'113','114','115','116','117','118','119','120', \ 82 | '121','122','123','124','125','126','127','128'] 83 | 84 | L_inclusion_list = ['1','2','3','4','5','6','7','8','9','10', \ 85 | '11','12','13','14','15','16','17','18','19','20', \ 86 | '21','22','23','24','25','26','27','28','29','30', \ 87 | '31','32','33','34','35','36','37','38','39','40', \ 88 | '41','42','43','44','45','46','47','48','49','50', \ 89 | '51','52','53','54','55','56','57','58','59','60', \ 90 | '61','62','63','64','65','66','67','68','69','70', \ 91 | '71','72','73','74','75','76','77','78','79','80', \ 92 | '81','82','83','84','85','86','87','88','89','90', \ 93 | '91','92','93','94','95','96','97','98','99','100', \ 94 | '101','102','103','104','105','106','107','108','109','110', \ 95 | '111','112','113','114','115','116','117','118','119','120', \ 96 | '121','122','123','124','125','126','127'] 97 | 98 | H_dict = {'1': 0, '2':1, '3':2, '4':3, '5':4, '6':5, '7':6, '8':7, '9':8, '10':9, \ 99 | '11':10, '12':11, '13':12, '14':13, '15':14, '16':15, '17':16, '18':17, '19':18, '20':19, \ 100 | '21':20, '22':21, '23':22, '24':23, '25':24, '26':25, '27':26, '28':27, '29':28, '30':29, \ 101 | '31':30, '32':31, '33':32, '34':33, '35':34, '36':35, '37':36, '38':37, '39':38, '40':39, \ 102 | '41':40, '42':41, '43':42, '44':43, '45':44, '46':45, '47':46, '48':47, '49':48, '50':49, \ 103 | '51':50, '52':51, '53':52, '54':53, '55':54, '56':55, '57':56, '58':57, '59':58, '60':59, \ 104 | '61':60, '62':61, '63':62, '64':63, '65':64, '66':65, '67':66, '68':67, '69':68, '70':69, \ 105 | '71':70, '72':71, '73':72, '74':73, '75':74, '76':75, '77':76, '78':77, '79':78, '80':79, \ 106 | '81':80, '82':81, '83':82, '84':83, '85':84, '86':85, '87':86, '88':87, '89':88, '90':89, \ 107 | '91':90, '92':91, '93':92, '94':93, '95':94, '96':95, '97':96, '98':97, '99':98, '100':99, \ 108 | '101':100,'102':101,'103':102,'104':103,'105':104,'106':105,'107':106,'108':107,'109':108,'110':109, \ 109 | 
'111':110,'111A':111,'111B':112,'111C':113,'111D':114,'111E':115,'111F':116,'111G':117,'111H':118, \ 110 | '112I':119,'112H':120,'112G':121,'112F':122,'112E':123,'112D':124,'112C':125,'112B':126,'112A':127,'112':128, \ 111 | '113':129,'114':130,'115':131,'116':132,'117':133,'118':134,'119':135,'120':136, \ 112 | '121':137,'122':138,'123':139,'124':140,'125':141,'126':142,'127':143,'128':144} 113 | 114 | L_dict = {'1': 0, '2':1, '3':2, '4':3, '5':4, '6':5, '7':6, '8':7, '9':8, '10':9, \ 115 | '11':10, '12':11, '13':12, '14':13, '15':14, '16':15, '17':16, '18':17, '19':18, '20':19, \ 116 | '21':20, '22':21, '23':22, '24':23, '25':24, '26':25, '27':26, '28':27, '29':28, '30':29, \ 117 | '31':30, '32':31, '33':32, '34':33, '35':34, '36':35, '37':36, '38':37, '39':38, '40':39, \ 118 | '41':40, '42':41, '43':42, '44':43, '45':44, '46':45, '47':46, '48':47, '49':48, '50':49, \ 119 | '51':50, '52':51, '53':52, '54':53, '55':54, '56':55, '57':56, '58':57, '59':58, '60':59, \ 120 | '61':60, '62':61, '63':62, '64':63, '65':64, '66':65, '67':66, '68':67, '69':68, '70':69, \ 121 | '71':70, '72':71, '73':72, '74':73, '75':74, '76':75, '77':76, '78':77, '79':78, '80':79, \ 122 | '81':80, '82':81, '83':82, '84':83, '85':84, '86':85, '87':86, '88':87, '89':88, '90':89, \ 123 | '91':90, '92':91, '93':92, '94':93, '95':94, '96':95, '97':96, '98':97, '99':98, '100':99, \ 124 | '101':100,'102':101,'103':102,'104':103,'105':104,'106':105,'107':106,'108':107,'109':108,'110':109, \ 125 | '111':110,'112':111,'113':112,'114':113,'115':114,'116':115,'117':116,'118':117,'119':118,'120':119, \ 126 | '121':120,'122':121,'123':122,'124':123,'125':124,'126':125,'127':126,'128':127} 127 | 128 | 129 | N_mAbs = len(infile_H["Id"]) 130 | 131 | for i in range(N_mAbs): 132 | H_tmp = 145*['-'] 133 | L_tmp = 127*['-'] 134 | for col in infile_H.columns: 135 | if(col in H_inclusion_list): 136 | H_tmp[H_dict[col]]=infile_H.iloc[i][col] 137 | for col in infile_L.columns: 138 | if(col in L_inclusion_list): 
139 | L_tmp[L_dict[col]]=infile_L.iloc[i][col] 140 | 141 | aa_string = '' 142 | for aa in H_tmp+L_tmp: 143 | aa_string += aa 144 | outfile.write(infile_H.iloc[i,0]+" "+aa_string) 145 | outfile.write("\n") 146 | 147 | outfile.close() 148 | return 149 | 150 | seq_preprocessing() 151 | 152 | # Read Aligned Sequence 153 | def load_input_data(filename): 154 | name_list=[] 155 | seq_list=[] 156 | with open(filename) as datafile: 157 | for line in datafile: 158 | line = line.strip().split() 159 | name_list.append(line[0]) 160 | seq_list.append(line[1]) 161 | return name_list, seq_list 162 | 163 | name_list, seq_list = load_input_data('seq_aligned_HL.txt') 164 | X = seq_list 165 | 166 | # One Hot Encoding of Aligned Sequence 167 | def one_hot_encoder(s): 168 | d = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, '-': 20} 169 | 170 | x = np.zeros((len(d), len(s))) 171 | x[[d[c] for c in s], range(len(s))] = 1 172 | 173 | return x 174 | X = [one_hot_encoder(s=x) for x in X] 175 | X = np.transpose(np.asarray(X), (0, 2, 1)) 176 | X = np.asarray(X) 177 | 178 | 179 | # Predict DeepSP Descriptors 180 | 181 | # sappos 182 | json_file = open('Conv1D_regressionSAPpos.json', 'r') 183 | loaded_model_json = json_file.read() 184 | json_file.close() 185 | loaded_model = model_from_json(loaded_model_json) 186 | # load weights into model 187 | loaded_model.load_weights("Conv1D_regression_SAPpos.h5") 188 | loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) 189 | sap_pos = loaded_model.predict(X) 190 | 191 | # scmneg 192 | json_file = open('Conv1D_regressionSCMneg.json', 'r') 193 | loaded_model_json = json_file.read() 194 | json_file.close() 195 | loaded_model = model_from_json(loaded_model_json) 196 | # load weights into model 197 | loaded_model.load_weights("Conv1D_regression_SCMneg.h5") 198 | loaded_model.compile(optimizer='adam', loss='mae', 
metrics=['mae']) 199 | scm_neg = loaded_model.predict(X) 200 | 201 | # scmpos 202 | json_file = open('Conv1D_regressionSCMpos.json', 'r') 203 | loaded_model_json = json_file.read() 204 | json_file.close() 205 | loaded_model = model_from_json(loaded_model_json) 206 | # load weights into model 207 | loaded_model.load_weights("Conv1D_regression_SCMpos.h5") 208 | loaded_model.compile(optimizer='adam', loss='mae', metrics=['mae']) 209 | scm_pos = loaded_model.predict(X) 210 | 211 | features = ['Name', 'SAP_pos_CDRH1','SAP_pos_CDRH2','SAP_pos_CDRH3','SAP_pos_CDRL1','SAP_pos_CDRL2','SAP_pos_CDRL3','SAP_pos_CDR','SAP_pos_Hv','SAP_pos_Lv','SAP_pos_Fv', 212 | 'SCM_neg_CDRH1','SCM_neg_CDRH2','SCM_neg_CDRH3','SCM_neg_CDRL1','SCM_neg_CDRL2','SCM_neg_CDRL3','SCM_neg_CDR','SCM_neg_Hv','SCM_neg_Lv','SCM_neg_Fv', 213 | 'SCM_pos_CDRH1','SCM_pos_CDRH2','SCM_pos_CDRH3','SCM_pos_CDRL1','SCM_pos_CDRL2','SCM_pos_CDRL3','SCM_pos_CDR','SCM_pos_Hv','SCM_pos_Lv','SCM_pos_Fv'] 214 | df = pd.concat([pd.DataFrame(name_list), pd.DataFrame(sap_pos), pd.DataFrame(scm_neg), pd.DataFrame(scm_pos)], ignore_index=True, axis=1,); df.columns = features 215 | df.to_csv('DeepSP_descriptors.csv', index=False) -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: web2 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | dependencies: 7 | - anarci=2021.02.04 8 | - biopython=1.78 9 | - hmmer=3.3.2 10 | - keras=2.12.0 11 | - keras-preprocessing=1.1.2 12 | - numpy=1.23.5 13 | - numpy-base=1.23.5 14 | - pandas=2.0.3 15 | - python=3.11.5 16 | - tensorflow=2.12.0 17 | - tensorflow-base=2.12.0 18 | - tensorflow-estimator=2.12.0 19 | prefix: /home/pklai/anaconda3/envs/web2 20 | --------------------------------------------------------------------------------