├── ASAP ├── __init__.py ├── .DS_Store ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── FeatureExtraction.cpython-36.pyc │ ├── FeatureExtraction.cpython-37.pyc │ ├── DesignRecommendation.cpython-36.pyc │ ├── SequenceAndFeatureAnalysis.cpython-36.pyc │ └── SequenceAndFeatureAnalysis.cpython-37.pyc ├── DesignRecommendation.py ├── S_SequenceInRegion.py ├── FeatureExtraction.py └── SequenceAndFeatureAnalysis.py ├── .DS_Store ├── data ├── .DS_Store ├── pigs_canonical.txt └── blosum62.csv ├── results ├── .DS_Store ├── MMP-IGHV │ ├── .DS_Store │ ├── IGHV_Only pI Features_ROC.png │ ├── IGHV_Except pI Features_ROC.png │ ├── IGHV_All Features Included_ROC.png │ ├── IGHV_Except Germline Features_ROC.png │ ├── IGHV_Only Germline Features_ROC.png │ ├── IGHV_Except CDR Canonical Structure Features_ROC.png │ ├── IGHV_Only CDR Canonical Structure Features_ROC.png │ ├── IGHV_Only Frequent Positional Motif Features_ROC.png │ ├── IGHV_Except Frequent Positional Motif Features_ROC.png │ ├── IGHV_RankFisherAndFS.csv │ └── IGHV_Jaccard Feature Coefficient.csv └── MMP-PDB │ ├── .DS_Store │ ├── MMP-cluster_DTreeAllFeature.png │ ├── MMP-cluster_Extracted Features.png │ ├── MMP-cluster_Except pI Features_ROC.png │ ├── MMP-cluster_Heavy Chain Sequences.png │ ├── MMP-cluster_Light Chain Sequences.png │ ├── MMP-cluster_Only pI Features_ROC.png │ ├── MMP-cluster_All Features Included_ROC.png │ ├── MMP-cluster_Only Germline Features_ROC.png │ ├── MMP-cluster_Except Germline Features_ROC.png │ ├── MMP-cluster_Only CDR Canonical Structure Features_ROC.png │ ├── MMP-cluster_Except CDR Canonical Structure Features_ROC.png │ ├── MMP-cluster_Only Frequent Positional Motif Features_ROC.png │ ├── MMP-cluster_All Features Included(Exclude Correlated)_ROC.png │ ├── MMP-cluster_Except Frequent Positional Motif Features_ROC.png │ ├── MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png │ ├── MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png │ ├── 
MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png │ └── MMP-cluster_RankFisherAndFS.csv ├── testCase ├── .DS_Store ├── IGHV │ ├── .DS_Store │ ├── reference-IGHV │ │ └── .DS_Store │ └── targeting-MMP-IGHV │ │ └── .DS_Store └── MMP-cluster │ ├── .DS_Store │ ├── reference-PDB │ └── .DS_Store │ └── targeting-MMP │ └── .DS_Store ├── __pycache__ └── ASAP.cpython-36.pyc ├── requirements.txt ├── supporting information ├── .DS_Store ├── Figure S1.png ├── Figure S2.png ├── Figure S3.png ├── Figure S4.png ├── Table S1.xlsx ├── Table S2.xlsx ├── Table S3.xlsx └── Table S4.xlsx ├── .idea ├── vcs.xml ├── misc.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml ├── ASAP-1.0.iml └── workspace.xml ├── LICENSE ├── environment.yml ├── README.md ├── ASAP.ipynb └── .ipynb_checkpoints └── ASAP-checkpoint.ipynb /ASAP/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/.DS_Store -------------------------------------------------------------------------------- /ASAP/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/.DS_Store -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/data/.DS_Store -------------------------------------------------------------------------------- /results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/.DS_Store 
-------------------------------------------------------------------------------- /testCase/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/.DS_Store -------------------------------------------------------------------------------- /testCase/IGHV/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/.DS_Store -------------------------------------------------------------------------------- /results/MMP-IGHV/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/.DS_Store -------------------------------------------------------------------------------- /results/MMP-PDB/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/.DS_Store -------------------------------------------------------------------------------- /__pycache__/ASAP.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/__pycache__/ASAP.cpython-36.pyc -------------------------------------------------------------------------------- /testCase/MMP-cluster/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/.DS_Store -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pydotplus==2.0.2 2 | scipy==0.19.1 3 | matplotlib==2.1.0 4 | numpy==1.14.1 5 | scikit-learn==0.19.2 6 | 
-------------------------------------------------------------------------------- /supporting information/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/.DS_Store -------------------------------------------------------------------------------- /supporting information/Figure S1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S1.png -------------------------------------------------------------------------------- /supporting information/Figure S2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S2.png -------------------------------------------------------------------------------- /supporting information/Figure S3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S3.png -------------------------------------------------------------------------------- /supporting information/Figure S4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Figure S4.png -------------------------------------------------------------------------------- /supporting information/Table S1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S1.xlsx -------------------------------------------------------------------------------- /supporting information/Table S2.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S2.xlsx -------------------------------------------------------------------------------- /supporting information/Table S3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S3.xlsx -------------------------------------------------------------------------------- /supporting information/Table S4.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/supporting information/Table S4.xlsx -------------------------------------------------------------------------------- /testCase/IGHV/reference-IGHV/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/reference-IGHV/.DS_Store -------------------------------------------------------------------------------- /ASAP/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /ASAP/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /testCase/IGHV/targeting-MMP-IGHV/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/IGHV/targeting-MMP-IGHV/.DS_Store -------------------------------------------------------------------------------- 
/testCase/MMP-cluster/reference-PDB/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/reference-PDB/.DS_Store -------------------------------------------------------------------------------- /testCase/MMP-cluster/targeting-MMP/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/testCase/MMP-cluster/targeting-MMP/.DS_Store -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Only pI Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only pI Features_ROC.png -------------------------------------------------------------------------------- /ASAP/__pycache__/FeatureExtraction.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/FeatureExtraction.cpython-36.pyc -------------------------------------------------------------------------------- /ASAP/__pycache__/FeatureExtraction.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/FeatureExtraction.cpython-37.pyc -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Except pI Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except pI Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_DTreeAllFeature.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_DTreeAllFeature.png -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_All Features Included_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_All Features Included_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Extracted Features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Extracted Features.png -------------------------------------------------------------------------------- /ASAP/__pycache__/DesignRecommendation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/DesignRecommendation.cpython-36.pyc -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Except Germline Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except Germline Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Only Germline Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only Germline Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Except pI Features_ROC.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except pI Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Heavy Chain Sequences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Heavy Chain Sequences.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Light Chain Sequences.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Light Chain Sequences.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Only pI Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only pI Features_ROC.png -------------------------------------------------------------------------------- /ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-36.pyc -------------------------------------------------------------------------------- /ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/ASAP/__pycache__/SequenceAndFeatureAnalysis.cpython-37.pyc -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_All Features 
Included_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_All Features Included_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Only Germline Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Germline Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Except Germline Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Germline Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Except CDR Canonical Structure Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except CDR Canonical Structure Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Only CDR Canonical Structure Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only CDR Canonical Structure Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Only Frequent Positional Motif Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Only Frequent Positional Motif Features_ROC.png 
-------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Except Frequent Positional Motif Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-IGHV/IGHV_Except Frequent Positional Motif Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Except CDR Canonical Structure Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except CDR Canonical Structure Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Only Frequent Positional Motif Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Frequent Positional Motif Features_ROC.png -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_All Features Included(Exclude Correlated)_ROC.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_All Features Included(Exclude Correlated)_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Except Frequent Positional Motif Features_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Frequent Positional Motif Features_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only Germline Features(Exclude Correlated)_ROC.png -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Except Germline Features(Exclude Correlated)_ROC.png -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HassounLab/ASAP-SML/HEAD/results/MMP-PDB/MMP-cluster_Only CDR Canonical Structure Features(Exclude Correlated)_ROC.png -------------------------------------------------------------------------------- 
/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/ASAP-1.0.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Xinmeng Li 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /data/pigs_canonical.txt: -------------------------------------------------------------------------------- 1 | L1 1 6 29 VIL 2 | L1 2 7 29 VIL 3 | L1 3 13 29 VIL 4 | L1 4 12 29 VIL 5 | L1 5 11 29 VIL 6 | L1 6 8 29 VIL 7 | L2 1 3 8 | L3 1 6 95 P 90 HNQ 9 | L3 2 6 94 P 90 Q 10 | L3 3 5 96 P 90 Q 11 | L3 4 4 90 Q 12 | L3 5 7 95A P 90 Q 13 | L3 6 5 90 Q 94 L 14 | H1 1 7 15 | H1 2 8 16 | H1 3 9 17 | H1 4 6 18 | H2 1 3 71 AVL 19 | H2 2 3 71 RK 20 | H2 3 4 71 AVL 21 | H2 4 4 71 RK 22 | H2 5 6 71 AVL 23 | H2 6 6 71 RK 24 | H3 2 11 94 ACDEFGHILMNPQSTVYW 25 | H3 3 11 94 RK 26 | H3 1 10 27 | H3 2 12 94 ACDEFGHILMNPQSTVYW 28 | H3 3 12 94 RK 29 | H3 2 13 94 ACDEFGHILMNPQSTVYW 30 | H3 3 13 94 RK 31 | H3 2 14 94 ACDEFGHILMNPQSTVYW 32 | H3 3 14 94 RK 33 | H3 2 15 94 ACDEFGHILMNPQSTVYW 34 | H3 3 15 94 RK 35 | H3 2 16 94 ACDEFGHILMNPQSTVYW 36 | H3 3 16 94 RK 37 | H3 2 17 94 ACDEFGHILMNPQSTVYW 38 | H3 3 17 94 RK 39 | H3 2 18 94 ACDEFGHILMNPQSTVYW 40 | H3 3 18 94 RK 41 | H3 2 19 94 ACDEFGHILMNPQSTVYW 42 | H3 3 19 94 RK 43 | H3 2 20 94 ACDEFGHILMNPQSTVYW 44 | H3 3 20 94 RK 45 | H3 2 21 94 ACDEFGHILMNPQSTVYW 46 | H3 3 21 94 RK 47 | H3 2 22 94 ACDEFGHILMNPQSTVYW 48 | H3 3 22 94 RK 49 | H3 2 23 94 ACDEFGHILMNPQSTVYW 50 | H3 3 23 94 RK 51 | H3 2 24 94 ACDEFGHILMNPQSTVYW 52 | H3 3 24 94 RK 53 | H3 2 25 94 ACDEFGHILMNPQSTVYW 54 | H3 3 25 94 RK 55 | H3 2 26 94 ACDEFGHILMNPQSTVYW 56 | H3 3 26 94 RK 57 | H3 2 27 94 ACDEFGHILMNPQSTVYW 58 | H3 3 27 94 RK -------------------------------------------------------------------------------- /data/blosum62.csv: -------------------------------------------------------------------------------- 1 | A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V,B,Z,X,_ 2 | 4,-1,-2,-2,0,-1,-1,0,-2,-1,-1,-1,-1,-2,-1,1,0,-3,-2,0,-2,-1,0,-4 3 | -1,5,0,-2,-3,1,0,-2,0,-3,-2,2,-1,-3,-2,-1,-1,-3,-2,-3,-1,0,-1,-4 4 | -2,0,6,1,-3,0,0,0,1,-3,-3,0,-2,-3,-2,1,0,-4,-2,-3,3,0,-1,-4 5 | 
-2,-2,1,6,-3,0,2,-1,-1,-3,-4,-1,-3,-3,-1,0,-1,-4,-3,-3,4,1,-1,-4 6 | 0,-3,-3,-3,9,-3,-4,-3,-3,-1,-1,-3,-1,-2,-3,-1,-1,-2,-2,-1,-3,-3,-2,-4 7 | -1,1,0,0,-3,5,2,-2,0,-3,-2,1,0,-3,-1,0,-1,-2,-1,-2,0,3,-1,-4 8 | -1,0,0,2,-4,2,5,-2,0,-3,-3,1,-2,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4 9 | 0,-2,0,-1,-3,-2,-2,6,-2,-4,-4,-2,-3,-3,-2,0,-2,-2,-3,-3,-1,-2,-1,-4 10 | -2,0,1,-1,-3,0,0,-2,8,-3,-3,-1,-2,-1,-2,-1,-2,-2,2,-3,0,0,-1,-4 11 | -1,-3,-3,-3,-1,-3,-3,-4,-3,4,2,-3,1,0,-3,-2,-1,-3,-1,3,-3,-3,-1,-4 12 | -1,-2,-3,-4,-1,-2,-3,-4,-3,2,4,-2,2,0,-3,-2,-1,-2,-1,1,-4,-3,-1,-4 13 | -1,2,0,-1,-3,1,1,-2,-1,-3,-2,5,-1,-3,-1,0,-1,-3,-2,-2,0,1,-1,-4 14 | -1,-1,-2,-3,-1,0,-2,-3,-2,1,2,-1,5,0,-2,-1,-1,-1,-1,1,-3,-1,-1,-4 15 | -2,-3,-3,-3,-2,-3,-3,-3,-1,0,0,-3,0,6,-4,-2,-2,1,3,-1,-3,-3,-1,-4 16 | -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4,7,-1,-1,-4,-3,-2,-2,-1,-2,-4 17 | 1,-1,1,0,-1,0,0,0,-1,-2,-2,0,-1,-2,-1,4,1,-3,-2,-2,0,0,0,-4 18 | 0,-1,0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1,1,5,-2,-2,0,-1,-1,0,-4 19 | -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1,1,-4,-3,-2,11,2,-3,-4,-3,-2,-4 20 | -2,-2,-2,-3,-2,-1,-2,-3,2,-1,-1,-2,-1,3,-3,-2,-2,2,7,-1,-3,-2,-1,-4 21 | 0,-3,-3,-3,-1,-2,-2,-3,-3,3,1,-2,1,-1,-2,-2,0,-3,-1,4,-3,-2,-1,-4 22 | -2,-1,3,4,-3,0,1,-1,0,-3,-4,0,-3,-3,-2,0,-1,-4,-3,-3,4,1,-1,-4 23 | -1,0,0,1,-3,3,4,-2,0,-3,-3,1,-1,-3,-1,0,-1,-3,-2,-2,1,4,-1,-4 24 | 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2,0,0,-2,-1,-1,-1,-1,-1,-4 25 | -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,1 -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_RankFisherAndFS.csv: -------------------------------------------------------------------------------- 1 | Feature, Feature Value,Fisher Test p-value, Feature Selection (thereshold = 0.0167),Rank of Statistic Significancy, Rank of Feature Selection 2 | Canonical H1,0,1,0.000480165,, 3 | Canonical H1,1,0.774906367,0.000600854,, 4 | Canonical H1,2,1,0.000175353,, 5 | Canonical H1,3,1,0,, 6 | Canonical 
H2,0,0.535049709,0.012889063,, 7 | Canonical H2,6,0.644296474,0.010232892,, 8 | Canonical H3,0,0.083794647,0.01479616,, 9 | Canonical H3,1,0.234700323,0.013119923,, 10 | Canonical H3,2,0.002614369,0.033944391,7.62,8 11 | Canonical H3,3,0.999938155,0.053446578,,2 12 | Germ HJ,IGHJ1*01,0.688488693,0.004603937,, 13 | Germ HJ,IGHJ2*01,0.354049029,0.004012455,, 14 | Germ HJ,IGHJ3*01,0.576535581,0.003012808,, 15 | Germ HJ,IGHJ3*02,1.32E-05,0.047185683,1.29,3 16 | Germ HJ,IGHJ4*02,0.553358944,0.034656349,,7 17 | Germ HJ,IGHJ5*01,0.96537164,0.015574391,, 18 | Germ HJ,IGHJ5*02,0.197274561,0.018123477,,19 19 | Germ HJ,IGHJ6*01,0.999947777,0.066944116,,1 20 | Germ HJ,IGHJ6*04,0.000416969,0.032812578,6.36,9 21 | Motif,10_YY,0.07051444,0.016529828,, 22 | Motif,10_YYG,0.053349784,0.014285162,, 23 | Motif,10_YYY,0.343316239,0.011916447,, 24 | Motif,2_GG,1,0.008545674,, 25 | Motif,2_GS,0.762847397,0.007176706,, 26 | Motif,2_YG,0.005067609,0.036587558,10.51,6 27 | Motif,2_YY,0.102581789,0.018412503,,18 28 | Motif,3_SG,0.624699976,0.017795207,,20 29 | Motif,3_SS,0.617371164,0.008419981,, 30 | Motif,3_YY,0.001265975,0.032171889,6.41,11 31 | Motif,3_YYD,0.002689351,0.00459369,7.61, 32 | Motif,4_SG,0.955215392,0.009229585,, 33 | Motif,4_SS,0.305268831,0.012016952,, 34 | Motif,4_YD,0.000317996,0.026747743,3.51,14 35 | Motif,4_YDS,0.002069412,0.006805006,7.59, 36 | Motif,5_DS,0.004330063,0.007100598,10.42, 37 | Motif,5_SG,0.626303426,0.010727004,, 38 | Motif,5_YY,0.026298446,0.037488145,15.28,5 39 | Motif,6_SG,0.509513574,0.022391224,,16 40 | Motif,6_SS,0.023126499,0.004795634,15.85, 41 | Motif,6_SSG,0.002423729,0.008186862,8.61, 42 | Motif,6_YY,0.937239799,0.011581004,, 43 | Motif,7_SG,0.028197383,0.007329619,16.75, 44 | Motif,7_SGY,0.003847006,0.003018869,10.29, 45 | Motif,7_YY,0.846405186,0.014090079,, 46 | Motif,7_YYY,0.591550617,0.004151414,, 47 | Motif,8_GY,0.012402538,0.010409138,12.7, 48 | Motif,8_YY,0.364641908,0.02334784,,15 49 | Motif,8_YYY,0.27775982,0.005765425,, 50 | 
Motif,9_FD,0.555232757,0.019582355,,17 51 | Motif,9_YY,0.017733467,0.01198022,13.5, 52 | Motif,9_YYY,0.0259799,0.009162702,14.58, 53 | PI,0.0-3.5,0.018313867,0.032143423,13.67,12 54 | PI,3.5-3.9375,0.08010366,0.039139254,,4 55 | PI,3.9375-4.375,0.485241177,0.032641723,,10 56 | PI,4.375-4.8125,0.901919558,0.009318928,, 57 | PI,4.8125-5.25,0.936906848,0.015848292,, 58 | PI,5.25-5.6875,0.824022061,0.010279235,, 59 | PI,5.6875-6.125,0.904667929,0.027072376,,13 60 | PI,6.125-7.0,0.945280631,0.009723836,, 61 | PI,7.0-14.0,0.966549563,0.014879694,, -------------------------------------------------------------------------------- /ASAP/DesignRecommendation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pydotplus 3 | from sklearn import tree 4 | 5 | SET_NAME = 'MMP-cluster' 6 | IF_ONLY_HEAVY = False 7 | CNT_DB = 2 8 | CNT_TARGET = 1 9 | REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/' 10 | TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/' 11 | TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster 12 | 13 | 14 | 15 | ################################################################################################################# 16 | # function SanityFeature: 17 | # Omit non-recommending features, such as motif features and type 0 canonical structures, as feature value for decision tree 18 | # 19 | # Input: AgreeFeature, AllFeatureNames 20 | # Output: 1. 
#            SanityAgreeFeature, [] a list of indices into AllFeatureNames that remain to be put in the decision tree
#################################################################################################################

def SanityFeature(AgreeFeature, AllFeatureNames):
    """Return the indices from AgreeFeature whose features may appear in the tree.

    A feature is dropped when the first '_'-separated field of its name is
    'Motif', or when the first field is 'Canonical' and the third field is
    '0'. Every other index is kept, in the order it appears in AgreeFeature.

    NOTE(review): presumably names look like 'Canonical_H1_0' (kind, region,
    value) -- confirm against the code that builds AllFeatureNames.

    Args:
        AgreeFeature: iterable of integer indices into AllFeatureNames.
        AllFeatureNames: sequence of feature-name strings.

    Returns:
        A new list with the retained indices.
    """
    SanityAgreeFeature = []
    for idx in AgreeFeature:
        # Split once per name instead of once per comparison.
        fields = AllFeatureNames[idx].split('_')
        if fields[0] == 'Motif':
            continue
        # The length guard keeps a short 'Canonical' name from raising
        # IndexError; such a name cannot be a type-0 structure, so it is kept.
        if fields[0] == 'Canonical' and len(fields) > 2 and fields[2] == '0':
            continue
        SanityAgreeFeature.append(idx)
    return SanityAgreeFeature

#################################################################################################################
# function MultiDecisionTree:
# Decision tree drawn with combined data across multiple iterations
#
# Input: iterate          - number of leading entries of X_IDS that are used
#        X_IDS            - per-iteration feature matrices (rows = samples, columns follow AllFeatureNames)
#        Y_IDS            - per-iteration label arrays, concatenated in the same order as X_IDS
#        AllFeatureNames  - feature names, one per column of each X_IDS[i]
#        type             - string inserted into the output file name
#################################################################################################################
def MultiDecisionTree(iterate, X_IDS, Y_IDS, AllFeatureNames, type):
    Y = np.concatenate(Y_IDS, axis=0)

    # Start from every column, then drop the features that SanityFeature excludes.
    SanityAgreeFeature = SanityFeature(list(range(len(AllFeatureNames))), AllFeatureNames)
    SanityAgreeFeatureName = [AllFeatureNames[idx] for idx in SanityAgreeFeature]

    Sig_X_DS = []
    for i in range(iterate):
        # NOTE: the caller's X_IDS[i] is replaced by an ndarray in place, exactly
        # as before; code outside this function may rely on that conversion.
        X_IDS[i] = np.array(X_IDS[i])
        Sig_X_DS.append(X_IDS[i][:, SanityAgreeFeature])

    X = np.concatenate(Sig_X_DS, axis=0)

    # Leaves hold at least 2.5% of the samples. Clamp to 1 because scikit-learn
    # rejects an integer min_samples_leaf below 1, which the truncation produced
    # for fewer than 40 samples.
    minLeafSize = max(1, int(0.025 * len(Y)))
    clf = tree.DecisionTreeClassifier(min_samples_leaf=minLeafSize)
    # Flip the feature values (x -> 1 - x) before fitting; presumably this makes
    # the "True" branch of each drawn split mean "feature present" -- the
    # original note says only "to meet the true false".
    clf = clf.fit(1.0 - X, Y)

    # NOTE(review): class_names assumes the sorted labels map to Reference first,
    # Targeting second -- confirm against how Y_IDS is built.
    dot_data = tree.export_graphviz(clf, out_file=None, filled=True, feature_names=SanityAgreeFeatureName, class_names=['Reference', 'Targeting'], rounded=True)
pydotplus.graph_from_dot_data(dot_data).write_png("./results/"+ SET_NAME + "_DTree"+ type +".png") 58 | 59 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: asap 2 | channels: 3 | - conda-forge/label/cf201901 4 | - anaconda 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - ca-certificates=2019.11.27=0 9 | - certifi=2019.11.28=py36_0 10 | - openssl=1.1.1=h1de35cc_0 11 | - pandas=0.25.3=py36h0a44026_0 12 | - pytz=2019.3=py_0 13 | - appnope=0.1.0=py36_1000 14 | - attrs=19.3.0=py_0 15 | - backcall=0.1.0=py_0 16 | - bleach=3.1.0=py_0 17 | - cycler=0.10.0=py_2 18 | - decorator=4.4.1=py_0 19 | - defusedxml=0.6.0=py_0 20 | - entrypoints=0.3=py36_1000 21 | - freetype=2.10.0=h24853df_1 22 | - graphviz=2.42.3=h98dfb87_0 23 | - icu=58.2=h0a44026_1000 24 | - importlib_metadata=1.3.0=py36_0 25 | - ipykernel=5.1.3=py36h5ca1d4c_0 26 | - ipython=7.11.0=py36h5ca1d4c_0 27 | - ipython_genutils=0.2.0=py_1 28 | - ipywidgets=7.5.1=py_0 29 | - jedi=0.15.2=py36_0 30 | - jinja2=2.10.3=py_0 31 | - joblib=0.14.1=py_0 32 | - jsonschema=3.2.0=py36_0 33 | - jupyter=1.0.0=py_2 34 | - jupyter_client=5.3.3=py36_1 35 | - jupyter_console=5.1.0=py36_0 36 | - jupyter_core=4.6.1=py36_0 37 | - kiwisolver=1.1.0=py36ha1b3eb9_0 38 | - libblas=3.8.0=14_openblas 39 | - libcblas=3.8.0=14_openblas 40 | - libcxx=9.0.0=h89e68fa_1 41 | - libffi=3.2.1=h6de7cb9_1006 42 | - libgfortran=4.0.0=2 43 | - liblapack=3.8.0=14_openblas 44 | - libopenblas=0.3.7=h3d69b6c_4 45 | - libpng=1.6.37=h2573ce8_0 46 | - libsodium=1.0.17=h01d97ff_0 47 | - libtiff=4.1.0=ha78913b_1 48 | - llvm-openmp=8.0.1=h770b8ee_0 49 | - lz4-c=1.8.3=h6de7cb9_1001 50 | - markupsafe=1.1.1=py36h0b31af3_0 51 | - matplotlib=3.1.2=py36_1 52 | - matplotlib-base=3.1.2=py36h11da6c2_1 53 | - mistune=0.8.4=py36h0b31af3_1000 54 | - more-itertools=8.0.2=py_0 55 | - nbconvert=5.6.1=py36_0 56 | - nbformat=4.4.0=py_1 57 | 
- ncurses=6.1=h0a44026_1002 58 | - notebook=6.0.1=py36_0 59 | - numpy=1.17.3=py36hde6bac1_0 60 | - pandoc=2.9.1=0 61 | - pandocfilters=1.4.2=py_1 62 | - parso=0.5.2=py_0 63 | - pexpect=4.7.0=py36_0 64 | - pickleshare=0.7.5=py36_1000 65 | - pip=19.3.1=py36_0 66 | - prometheus_client=0.7.1=py_0 67 | - prompt_toolkit=3.0.2=py_0 68 | - ptyprocess=0.6.0=py_1001 69 | - pydot=1.4.1=py36_1001 70 | - pydotplus=2.0.2=pyhd1c1de3_3 71 | - pygments=2.5.2=py_0 72 | - pyparsing=2.4.6=py_0 73 | - pyqt=5.6.0=py36hc26a216_1008 74 | - pyrsistent=0.15.6=py36h0b31af3_0 75 | - python=3.6.7=h8dc6b48_1004 76 | - python-dateutil=2.8.1=py_0 77 | - pyzmq=18.1.1=py36h4bf09a9_0 78 | - qt=5.6.2=h822fa55_1013 79 | - qtconsole=4.6.0=py_0 80 | - scikit-learn=0.21.3=py36hd4ffd6c_0 81 | - scipy=1.4.1=py36h82752d6_0 82 | - send2trash=1.5.0=py_0 83 | - setuptools=42.0.2=py36_0 84 | - sip=4.18.1=py36h0a44026_1000 85 | - six=1.13.0=py36_0 86 | - terminado=0.8.3=py36_0 87 | - testpath=0.4.4=py_0 88 | - tk=8.6.10=hbbe82c9_0 89 | - tornado=6.0.3=py36h0b31af3_0 90 | - traitlets=4.3.3=py36_0 91 | - wcwidth=0.1.7=py_1 92 | - webencodings=0.5.1=py_1 93 | - wheel=0.33.6=py36_0 94 | - widgetsnbextension=3.5.1=py36_0 95 | - xz=5.2.4=h1de35cc_1001 96 | - zeromq=4.3.2=h6de7cb9_2 97 | - zipp=0.6.0=py_0 98 | - zlib=1.2.11=h0b31af3_1006 99 | - zstd=1.4.4=he7fca8b_1 100 | - biopython=1.72=py36h470a237_0 101 | - jpeg=9c=h470a237_1 102 | - readline=7.0=haf1bffa_1 103 | - sqlite=3.26.0=hb1c47c0_0 104 | 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ASAP-SML: An Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning 2 | 3 | Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning (ASAP-SML) is a pipeline to identify distinguishing features in targeting antibody set when compared to a reference non-targeting set. 
The pipeline first extracts germline, CDR canonical structure, isoelectric point and frequent positional motif features from sequences and creates an antibody feature fingerprint. Machine-learning and statistical significance testing are applied to antibody sequences and feature fingerprints to identify distinguishing feature values and combinations thereof. When applied to an MMP-targeting set, ASAP identifies salient features and recommends features to use when designing novel MMP-targeting antibody sequences. 4 | 5 | ## How to install 6 | ### Requirements: 7 | An [Anaconda python environment](https://www.anaconda.com/download) is recommended. 8 | Check the environment.yml file, but primarily: 9 | - python >= 3.5 10 | - pandas 11 | - graphviz 12 | - jupyter 13 | - numpy 14 | - scikit-learn 15 | - scipy 16 | - biopython 17 | 18 | Jupyter notebook is required to run the ipynb examples. 19 | 20 | ### via Anaconda 21 | We recommend installing using Anaconda as follows: 22 | ``` 23 | conda env create --name asap --file environment.yml 24 | source activate asap 25 | ``` 26 | 27 | ## Example: Matrix Metalloproteinases (MMP) targeting and reference antibody sequence set 28 | 29 | This repository contains an example of how to run the ASAP pipeline on the MMP-targeting and reference antibody sequence set. 30 | 31 | To run the script, open the terminal and go to the project directory, then run: 32 | 33 | ` 34 | jupyter notebook 35 | ` 36 | 37 | Take a look at the file "ASAP.ipynb". Parameters are set based on the user's choice. Once you have set the parameters, run the notebook document step-by-step (one cell at a time) by 38 | 39 | - Pressing shift + enter 40 | 41 | Or, run the whole notebook in a single step by 42 | 43 | - Clicking on the menu Cell -> Run All. 44 | 45 | ## Components 46 | ASAP.ipynb : main script for running ASAP pipeline 47 | 48 | - **./ASAP/FeatureExtraction.py** - functions for feature extraction on Chothia numbered antibody sequences.
49 | - **./ASAP/SequenceAndFeatureAnalysis.py** - functions for sequence and feature analysis on antibody sequences. 50 | - **./ASAP/DesignRecommendation.py** - functions to generate design recommendation trees for specific targeting antibody sequences. 51 | 52 | ## Data 53 | 54 | - Data to run ASAP: [BLOSUM-62 substitution matrix](https://en.wikipedia.org/wiki/BLOSUM#cite_ref-henikoff_1-0) and [Canonical Structure Definition](http://circe.med.uniroma1.it/pigs/canonical.php) 55 | 56 | - Data to run ASAP on MMP-targeting example: MMP-targeting and reference set. 57 | 58 | MMP-targeting set is composed of publicly available antibody sequence data. Reference set is from the Protein Data Bank (PDB) and it consists of human and murine antibody sequences that do not bind or inhibit MMPs. Please see our paper for details. 59 | 60 | ## Authors: 61 | This software is written by Xinmeng Li, James Van Deventer, Soha Hassoun (Soha.Hassoun@tufts.edu). 62 | 63 | Publication: ["ASAP-SML: An Antibody Sequence Analysis Pipeline Using Statistical Testing and Machine Learning"](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1007779) 64 | 65 | **Please cite our work:** 66 | 67 | Li, Xinmeng, James A. Van Deventer, and Soha Hassoun. "ASAP-SML: An antibody sequence analysis pipeline using statistical testing and machine learning." PLoS computational biology 16.4 (2020): e1007779. 
68 | 69 | ## License 70 | 71 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details 72 | 73 | -------------------------------------------------------------------------------- /ASAP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#################################################################################################################\n", 10 | "# #\n", 11 | "# Section 1 Data Preperation #\n", 12 | "# #\n", 13 | "#################################################################################################################\n", 14 | "\n", 15 | "\n", 16 | "######################################### User define variables ###############################################\n", 17 | "\n", 18 | "# User Choice C_MMPTest\n", 19 | "# Run test case for MMP? (y/n)\n", 20 | "# If True: Default testCase-MMP files \n", 21 | "# If False: User upload Chothia-numbered sequences files to \"targeting\" and \"reference\" folders under \"./user/data/\" respectively.\n", 22 | "# Default: True\n", 23 | "\n", 24 | "C_SAMPLE_Test = True\n", 25 | "\n", 26 | "# User Choice C_PIGS\n", 27 | "# Use PIGS template for CDR canonical structure? (y/n)\n", 28 | "# If True: Default PIGS CDR Canonical structure template under Chothia numbering\n", 29 | "# If False: User upload fomatted CDR Canonical structure template under \"./user/data/\"\n", 30 | "# Default: True\n", 31 | "\n", 32 | "C_PIGS = True\n", 33 | "\n", 34 | "# User Choice C_DesireSize\n", 35 | "# Use default desire size for targeting dataset? 
(y/n)\n", 36 | "# If True: Default desire size, 44 for the MMP test case, medium for user upload files\n", 37 | "# If False: User define desire size for targeting dataset\n", 38 | "# Default: True\n", 39 | "\n", 40 | "C_DesireSize = True\n", 41 | "\n", 42 | "# User Choice C_k\n", 43 | "# Use default number of iterations? (y/n)\n", 44 | "# If True: Default number of iterations, k = 100\n", 45 | "# If False: User define number of iterations\n", 46 | "# Default: True\n", 47 | "C_k = True\n", 48 | "\n", 49 | "\n", 50 | "####################################### Define global variables ###############################################\n", 51 | "\n", 52 | "\n", 53 | "# SET_NAME = 'IGHV'\n", 54 | "# IF_ONLY_HEAVY = True\n", 55 | "# CNT_DB = 1\n", 56 | "# CNT_TARGET = 1\n", 57 | "# REFERENCE_PATH_TESTCASE = './testCase/IGHV/reference-IGHV/'\n", 58 | "# TARGETING_PATH_TESTCASE = './testCase/IGHV/targeting-MMP-IGHV/'\n", 59 | "# TARGET_DESIRE_SIZE = 134 #44 #IGHV\n", 60 | "\n", 61 | "SET_NAME = 'MMP-cluster'\n", 62 | "IF_ONLY_HEAVY = False\n", 63 | "CNT_DB = 2\n", 64 | "CNT_TARGET = 1\n", 65 | "REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'\n", 66 | "TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'\n", 67 | "TARGET_DESIRE_SIZE = 166\n", 68 | "\n", 69 | "PIGS_PATH = './data/pigs_canonical.txt'\n", 70 | "TEMPLATE_PATH = './user/data/'\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "ITERATION = 100\n", 75 | "\n", 76 | "######################################## Determine variable values ############################################\n", 77 | "\n", 78 | "if C_SAMPLE_Test == True:\n", 79 | " targeting_direct = TARGETING_PATH_TESTCASE\n", 80 | " reference_direct = REFERENCE_PATH_TESTCASE\n", 81 | "else:\n", 82 | " print(\"Each pair of light and heavy chain sequence should be in the order of LIGHT/HEAVY/LIGHT/HEAVY\")\n", 83 | " targeting_direct = TARGETING_PATH\n", 84 | " reference_direct = REFERENCE_PATH\n", 85 | " \n", 86 | "if C_PIGS == True:\n", 87 | " 
canonical_direct = PIGS_PATH\n", 88 | "else:\n", 89 | " print(\"Upload CDR canonical structure templates. \")\n", 90 | " print(\"In the template, the first column must be the L1, L2, L3, H1, H2, or H3, \")\n", 91 | " print(\"the second column is the length of the region defined in the first column, \")\n", 92 | " print(\"starting from the third column, it is the position and candidate amino acid on each position, such as 1 ABC 2 CDETFG.\") \n", 93 | " template_name = input(\"What is the name of the template?\")\n", 94 | " canonical_direct = TEMPLATE_PATH + template_name\n", 95 | " \n", 96 | "if C_SAMPLE_Test == True and C_DesireSize == True:\n", 97 | " size = TARGET_DESIRE_SIZE\n", 98 | "elif C_SAMPLE_Test == False and C_DesireSize == True:\n", 99 | " size = 'medium'\n", 100 | "else:\n", 101 | " size = int(input('What is the desire size for the targeting set?'))\n", 102 | " \n", 103 | "if C_k == True:\n", 104 | " iterate = ITERATION\n", 105 | "else:\n", 106 | " iterate = int(input(\"What is the number of iterations?\"))\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 2, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Data:\n", 119 | "r1 : 276\n", 120 | "r2 : 219\n", 121 | "t1 : 166\n", 122 | "Sum: 661\n", 123 | "\n", 124 | "Number of feature values:\n", 125 | "Germline: 334\n", 126 | "CDR canonical structures: 20\n", 127 | "Isoelectric points (pI): 8\n", 128 | "Frequent positional motif: 42\n", 129 | "Total: 404\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "#################################################################################################################\n", 135 | "# #\n", 136 | "# Section 2 Feature Extraction #\n", 137 | "# #\n", 138 | "#################################################################################################################\n", 139 | "\n", 140 | "\n", 141 | "############################################ Import 
libaries ##################################################\n", 142 | "\n", 143 | "import ASAP.FeatureExtraction as extract\n", 144 | "\n", 145 | "\n", 146 | "############################################ Function calls ##################################################\n", 147 | "\n", 148 | "Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)\n", 149 | "\n", 150 | "OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)\n", 151 | "\n", 152 | "OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)\n", 153 | "\n", 154 | "CDRH3 = extract.GetCDRH3(Amino, Num)\n", 155 | "\n", 156 | "OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)\n", 157 | "\n", 158 | "MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)\n", 159 | "\n", 160 | "AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)\n", 161 | "\n", 162 | "\n", 163 | "############################################ Report section results #############################################\n", 164 | "\n", 165 | "print(\"Data:\")\n", 166 | "for i in range(len(DatasetSize)):\n", 167 | " print(DatasetName[i], \":\",DatasetSize[i],)\n", 168 | "print(\"Sum:\", sum(DatasetSize))\n", 169 | "\n", 170 | "print(\"\\nNumber of feature values:\")\n", 171 | "print(\"Germline:\", len(GermFeatureNames),)\n", 172 | "print(\"CDR canonical structures:\", len(CanonFeatureNames),)\n", 173 | "print(\"Isoelectric points (pI):\", len(PIFeatureNames),)\n", 174 | "print(\"Frequent positional motif:\",len(MotifFeatureNames),)\n", 175 | "print(\"Total:\", len(AllFeatureNames))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 3, 181 | "metadata": {}, 182 | 
"outputs": [ 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 187 | "/Users/xinmeng/anaconda3/envs/homework/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 188 | " return f(*args, **kwds)\n" 189 | ] 190 | }, 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "RanksumsResult(statistic=-147.48812830271845, pvalue=0.0) RanksumsResult(statistic=-191.24982487069587, pvalue=0.0)\n", 196 | "Statistical tests (Reference against Targeting) succeed.\n", 197 | "(661, 404) 495 166\n", 198 | "Average AUC with all features: \n", 199 | "SVM\t\t 0.9900013753999437\n", 200 | "Random forest\t 0.9858123420498757\n", 201 | "AdaBoost\t 0.985921855997419\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "#################################################################################################################\n", 207 | "# #\n", 208 | "# Section 3 Sequence and Feature Analysis #\n", 209 | "# #\n", 210 | "#################################################################################################################\n", 211 | "\n", 212 | "############################################ Import libaries ##################################################\n", 213 | "\n", 214 | "import ASAP.SequenceAndFeatureAnalysis as analysis\n", 215 | "\n", 216 | "############################################ Function calls ##################################################\n", 217 | "\n", 218 | "X_IDS, Y_IDS, SeqName_IDS = analysis.IterationDuplicateSelectFeature(size, iterate, DatasetName, \n", 219 | " DatasetSize, ExcludeIGHVVectors)\n", 220 | "\n", 221 | "###################### Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################\n", 222 | "\n", 223 | "H_Idist, L_Idist = analysis.HeatmapHL(size, iterate, SeqName_IDS, Amino, Num)\n", 224 | "analysis.Draw_heatmap(size, H_Idist[1], 'Heavy Chain 
Sequences', DatasetSize)\n", 225 | "if not IF_ONLY_HEAVY:\n", 226 | " analysis.Draw_heatmap(size, L_Idist[1], 'Light Chain Sequences', DatasetSize)\n", 227 | "F_Idist = analysis.HeatmapFeature(size, iterate, X_IDS, ExcludeFeatureNames, MotifFeatureNames)\n", 228 | "analysis.Draw_heatmap(size, F_Idist[0], 'Extracted Features', DatasetSize)\n", 229 | "\n", 230 | "############################### Section 3.2 Similarity analysis (Statistical test) #############################\n", 231 | "\n", 232 | "analysis.MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist)\n", 233 | "\n", 234 | "####################################### Section 3.3 Salient feature-value analysis ############################ #\n", 235 | "\n", 236 | "analysis.MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, ExcludeIGHVVectors, \n", 237 | " ExcludeFeatureNames)\n", 238 | "\n", 239 | "####################################### Section 3.4 Classification on segments ################################ \n", 240 | "\n", 241 | "analysis.MultiAuc(iterate, X_IDS, Y_IDS)\n", 242 | "analysis.ROCDrawing(X_IDS[0], Y_IDS[0], GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames)\n", 243 | "\n", 244 | "analysis.JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 4, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "#################################################################################################################\n", 254 | "# #\n", 255 | "# Section 4 Design Recommendation #\n", 256 | "# #\n", 257 | "#################################################################################################################\n", 258 | "\n", 259 | "############################################ Import libaries ##################################################\n", 260 | "\n", 261 | "import ASAP.DesignRecommendation as design\n", 262 | "\n", 263 | 
"############################################ Function calls ##################################################\n", 264 | "\n", 265 | "design.MultiDecisionTree(iterate, X_IDS, Y_IDS, ExcludeFeatureNames, 'AllFeature')" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.6.8" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 1 297 | } 298 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/ASAP-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#################################################################################################################\n", 10 | "# #\n", 11 | "# Section 1 Data Preperation #\n", 12 | "# #\n", 13 | "#################################################################################################################\n", 14 | "\n", 15 | "\n", 16 | "######################################### User define variables ###############################################\n", 17 | "\n", 18 | "# User Choice C_MMPTest\n", 19 | "# Run test case for MMP? 
(y/n)\n", 20 | "# If True: Default testCase-MMP files \n", 21 | "# If False: User upload Chothia-numbered sequences files to \"targeting\" and \"reference\" folders under \"./user/data/\" respectively.\n", 22 | "# Default: True\n", 23 | "\n", 24 | "C_SAMPLE_Test = True\n", 25 | "\n", 26 | "# User Choice C_PIGS\n", 27 | "# Use PIGS template for CDR canonical structure? (y/n)\n", 28 | "# If True: Default PIGS CDR Canonical structure template under Chothia numbering\n", 29 | "# If False: User upload fomatted CDR Canonical structure template under \"./user/data/\"\n", 30 | "# Default: True\n", 31 | "\n", 32 | "C_PIGS = True\n", 33 | "\n", 34 | "# User Choice C_DesireSize\n", 35 | "# Use default desire size for targeting dataset? (y/n)\n", 36 | "# If True: Default desire size, 44 for the MMP test case, medium for user upload files\n", 37 | "# If False: User define desire size for targeting dataset\n", 38 | "# Default: True\n", 39 | "\n", 40 | "C_DesireSize = True\n", 41 | "\n", 42 | "# User Choice C_k\n", 43 | "# Use default number of iterations? 
(y/n)\n", 44 | "# If True: Default number of iterations, k = 100\n", 45 | "# If False: User define number of iterations\n", 46 | "# Default: True\n", 47 | "C_k = True\n", 48 | "\n", 49 | "\n", 50 | "####################################### Define global variables ###############################################\n", 51 | "\n", 52 | "\n", 53 | "# SET_NAME = 'IGHV'\n", 54 | "# IF_ONLY_HEAVY = True\n", 55 | "# CNT_DB = 1\n", 56 | "# CNT_TARGET = 1\n", 57 | "# REFERENCE_PATH_TESTCASE = './testCase/IGHV/reference-IGHV/'\n", 58 | "# TARGETING_PATH_TESTCASE = './testCase/IGHV/targeting-MMP-IGHV/'\n", 59 | "# TARGET_DESIRE_SIZE = 134 #44 #IGHV\n", 60 | "\n", 61 | "SET_NAME = 'MMP-cluster'\n", 62 | "IF_ONLY_HEAVY = False\n", 63 | "CNT_DB = 2\n", 64 | "CNT_TARGET = 1\n", 65 | "REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/'\n", 66 | "TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/'\n", 67 | "TARGET_DESIRE_SIZE = 166\n", 68 | "\n", 69 | "PIGS_PATH = './data/pigs_canonical.txt'\n", 70 | "TEMPLATE_PATH = './user/data/'\n", 71 | "\n", 72 | "\n", 73 | "\n", 74 | "ITERATION = 100\n", 75 | "\n", 76 | "######################################## Determine variable values ############################################\n", 77 | "\n", 78 | "if C_SAMPLE_Test == True:\n", 79 | " targeting_direct = TARGETING_PATH_TESTCASE\n", 80 | " reference_direct = REFERENCE_PATH_TESTCASE\n", 81 | "else:\n", 82 | " print(\"Each pair of light and heavy chain sequence should be in the order of LIGHT/HEAVY/LIGHT/HEAVY\")\n", 83 | " targeting_direct = TARGETING_PATH\n", 84 | " reference_direct = REFERENCE_PATH\n", 85 | " \n", 86 | "if C_PIGS == True:\n", 87 | " canonical_direct = PIGS_PATH\n", 88 | "else:\n", 89 | " print(\"Upload CDR canonical structure templates. 
\")\n", 90 | " print(\"In the template, the first column must be the L1, L2, L3, H1, H2, or H3, \")\n", 91 | " print(\"the second column is the length of the region defined in the first column, \")\n", 92 | " print(\"starting from the third column, it is the position and candidate amino acid on each position, such as 1 ABC 2 CDETFG.\") \n", 93 | " template_name = input(\"What is the name of the template?\")\n", 94 | " canonical_direct = TEMPLATE_PATH + template_name\n", 95 | " \n", 96 | "if C_SAMPLE_Test == True and C_DesireSize == True:\n", 97 | " size = TARGET_DESIRE_SIZE\n", 98 | "elif C_SAMPLE_Test == False and C_DesireSize == True:\n", 99 | " size = 'medium'\n", 100 | "else:\n", 101 | " size = int(input('What is the desire size for the targeting set?'))\n", 102 | " \n", 103 | "if C_k == True:\n", 104 | " iterate = ITERATION\n", 105 | "else:\n", 106 | " iterate = int(input(\"What is the number of iterations?\"))\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 2, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "Data:\n", 119 | "r1 : 276\n", 120 | "r2 : 219\n", 121 | "t1 : 166\n", 122 | "Sum: 661\n", 123 | "\n", 124 | "Number of feature values:\n", 125 | "Germline: 334\n", 126 | "CDR canonical structures: 20\n", 127 | "Isoelectric points (pI): 8\n", 128 | "Frequent positional motif: 42\n", 129 | "Total: 404\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "#################################################################################################################\n", 135 | "# #\n", 136 | "# Section 2 Feature Extraction #\n", 137 | "# #\n", 138 | "#################################################################################################################\n", 139 | "\n", 140 | "\n", 141 | "############################################ Import libaries ##################################################\n", 142 | "\n", 143 | "import 
ASAP.FeatureExtraction as extract\n", 144 | "\n", 145 | "\n", 146 | "############################################ Function calls ##################################################\n", 147 | "\n", 148 | "Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct)\n", 149 | "\n", 150 | "OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)\n", 151 | "\n", 152 | "OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)\n", 153 | "\n", 154 | "CDRH3 = extract.GetCDRH3(Amino, Num)\n", 155 | "\n", 156 | "OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)\n", 157 | "\n", 158 | "MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)\n", 159 | "\n", 160 | "AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)\n", 161 | "\n", 162 | "\n", 163 | "############################################ Report section results #############################################\n", 164 | "\n", 165 | "print(\"Data:\")\n", 166 | "for i in range(len(DatasetSize)):\n", 167 | " print(DatasetName[i], \":\",DatasetSize[i],)\n", 168 | "print(\"Sum:\", sum(DatasetSize))\n", 169 | "\n", 170 | "print(\"\\nNumber of feature values:\")\n", 171 | "print(\"Germline:\", len(GermFeatureNames),)\n", 172 | "print(\"CDR canonical structures:\", len(CanonFeatureNames),)\n", 173 | "print(\"Isoelectric points (pI):\", len(PIFeatureNames),)\n", 174 | "print(\"Frequent positional motif:\",len(MotifFeatureNames),)\n", 175 | "print(\"Total:\", len(AllFeatureNames))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stderr", 185 | "output_type": "stream", 186 | "text": [ 
187 | "/Users/xinmeng/anaconda3/envs/homework/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88\n", 188 | " return f(*args, **kwds)\n" 189 | ] 190 | }, 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "RanksumsResult(statistic=-147.48812830271845, pvalue=0.0) RanksumsResult(statistic=-191.24982487069587, pvalue=0.0)\n", 196 | "Statistical tests (Reference against Targeting) succeed.\n", 197 | "(661, 404) 495 166\n", 198 | "Average AUC with all features: \n", 199 | "SVM\t\t 0.9900013753999437\n", 200 | "Random forest\t 0.9858123420498757\n", 201 | "AdaBoost\t 0.985921855997419\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "#################################################################################################################\n", 207 | "# #\n", 208 | "# Section 3 Sequence and Feature Analysis #\n", 209 | "# #\n", 210 | "#################################################################################################################\n", 211 | "\n", 212 | "############################################ Import libaries ##################################################\n", 213 | "\n", 214 | "import ASAP.SequenceAndFeatureAnalysis as analysis\n", 215 | "\n", 216 | "############################################ Function calls ##################################################\n", 217 | "\n", 218 | "X_IDS, Y_IDS, SeqName_IDS = analysis.IterationDuplicateSelectFeature(size, iterate, DatasetName, \n", 219 | " DatasetSize, ExcludeIGHVVectors)\n", 220 | "\n", 221 | "###################### Section 3.1 Sequence and feature similarity analysis (Heat map) ##########################\n", 222 | "\n", 223 | "H_Idist, L_Idist = analysis.HeatmapHL(size, iterate, SeqName_IDS, Amino, Num)\n", 224 | "analysis.Draw_heatmap(size, H_Idist[1], 'Heavy Chain Sequences', DatasetSize)\n", 225 | "if not IF_ONLY_HEAVY:\n", 226 | " analysis.Draw_heatmap(size, 
L_Idist[1], 'Light Chain Sequences', DatasetSize)\n", 227 | "F_Idist = analysis.HeatmapFeature(size, iterate, X_IDS, ExcludeFeatureNames, MotifFeatureNames)\n", 228 | "analysis.Draw_heatmap(size, F_Idist[0], 'Extracted Features', DatasetSize)\n", 229 | "\n", 230 | "############################### Section 3.2 Similarity analysis (Statistical test) #############################\n", 231 | "\n", 232 | "analysis.MultiRankTest(size, iterate, F_Idist, H_Idist, L_Idist)\n", 233 | "\n", 234 | "####################################### Section 3.3 Salient feature-value analysis ############################ #\n", 235 | "\n", 236 | "analysis.MultiFisherFS(iterate, X_IDS, Y_IDS, DatasetName, DatasetSize, ExcludeIGHVVectors, \n", 237 | " ExcludeFeatureNames)\n", 238 | "\n", 239 | "####################################### Section 3.4 Classification on segments ################################ \n", 240 | "\n", 241 | "analysis.MultiAuc(iterate, X_IDS, Y_IDS)\n", 242 | "analysis.ROCDrawing(X_IDS[0], Y_IDS[0], GermFeatureNames, CanonFeatureNames, PIFeatureNames, MotifFeatureNames, AllFeatureNames)\n", 243 | "\n", 244 | "analysis.JaccardCoefficientAnalysis(AllFeatureVectors, AllFeatureNames, DatasetSize)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 4, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "#################################################################################################################\n", 254 | "# #\n", 255 | "# Section 4 Design Recommendation #\n", 256 | "# #\n", 257 | "#################################################################################################################\n", 258 | "\n", 259 | "############################################ Import libaries ##################################################\n", 260 | "\n", 261 | "import ASAP.DesignRecommendation as design\n", 262 | "\n", 263 | "############################################ Function calls 
##################################################\n", 264 | "\n", 265 | "design.MultiDecisionTree(iterate, X_IDS, Y_IDS, ExcludeFeatureNames, 'AllFeature')" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.6.8" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 1 297 | } 298 | -------------------------------------------------------------------------------- /ASAP/S_SequenceInRegion.py: -------------------------------------------------------------------------------- 1 | import Bio.SeqUtils.ProtParam 2 | import os 3 | import ASAP.FeatureExtraction as extract 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | 8 | # Chothia numbering definition for CDR regions 9 | CHOTHIA_CDR = {'L': {'1': [24, 34], '2': [50, 56], '3': [89, 97]}, 'H':{'1': [26, 32], '2': [52, 56], '3': [95, 102]}} 10 | canonical_direct = '../data/pigs_canonical.txt' 11 | 12 | SET_NAME = 'IGHV' 13 | IF_ONLY_HEAVY = True 14 | CNT_DB = 1 15 | CNT_TARGET = 1 16 | REFERENCE_PATH_TESTCASE = '../testCase/IGHV/reference-IGHV/' 17 | TARGETING_PATH_TESTCASE = '../testCase/IGHV/targeting-MMP-IGHV/' 18 | TARGET_DESIRE_SIZE = 134 #44 #IGHV 19 | 20 | targeting_direct = TARGETING_PATH_TESTCASE 21 | reference_direct = REFERENCE_PATH_TESTCASE 22 | 23 | Amino, Num, Germ, DatasetName, DatasetSize = extract.ReadAminoNumGerm(targeting_direct, reference_direct) 24 | 25 | seq_id = [] 26 | for i, name in enumerate(DatasetName): 27 | # if i<2: 28 | # 
# raw sequence
def sequence_raw():
    """Write the raw chain sequence(s) of every entry in ``seq_id`` to a CSV.

    Output goes to ``../results/<SET_NAME>_Sequence.csv``. Each row holds the
    sequence name, the light chain (only when ``IF_ONLY_HEAVY`` is false) and
    the heavy chain; a chain is the residues stored in ``Amino`` joined into
    a single string.
    """
    def getSequenceHL(sname):
        # Light chain first, heavy chain last, matching the header order.
        SH = ''.join(Amino['H'][sname])
        if IF_ONLY_HEAVY:
            return [SH]
        return [''.join(Amino['L'][sname]), SH]

    with open('../results/' + SET_NAME + '_Sequence.csv', 'w') as fi:
        fi.write('sequence name, ')
        if not IF_ONLY_HEAVY:
            fi.write('light chain, ')
        fi.write('heavy chain\n')
        for sname in seq_id:
            fi.write(sname + ',' + ','.join(getSequenceHL(sname)) + '\n')


# sequence with numbering
def sequence_num():
    """Write per-residue numbering and residues to ``./Sequence_numbered.csv``.

    For every entry of ``seq_id`` two rows are written per chain: the
    numbering labels from ``Num`` and the residues from ``Amino``. Light
    chain rows come first. They are skipped when ``IF_ONLY_HEAVY`` is set, so
    that this writer, like ``sequence_raw`` and ``sequence_region``, never
    touches the light-chain tables in heavy-only mode.
    """
    if IF_ONLY_HEAVY:
        chains = [('H', 'heavy')]
    else:
        chains = [('L', 'light'), ('H', 'heavy')]

    with open('./Sequence_numbered.csv', 'w') as fi:
        for sname in seq_id:
            for chain, label in chains:
                fi.write(sname + ' ' + label + ' num,' + ','.join(Num[chain][sname]) + '\n')
                fi.write(sname + ' ' + label + ' seq,' + ','.join(Amino[chain][sname]) + '\n')


# sequence with region
def sequence_region():
    """Write every sequence split into framework/CDR regions to a CSV.

    Output goes to ``../results/<SET_NAME>_Sequence_region.csv``. Each chain
    is cut into seven consecutive regions (FW1, CDR1, FW2, CDR2, FW3, CDR3,
    FW4) using the inclusive CDR bounds in ``CHOTHIA_CDR``. Rows contain the
    light-chain regions (only when ``IF_ONLY_HEAVY`` is false) followed by the
    heavy-chain regions.
    """
    region_count = 7  # FW1, CDR1, FW2, CDR2, FW3, CDR3, FW4

    def region_index(position, cdr_bounds):
        # Walk the three CDRs in order: a position before a CDR start is in
        # the preceding framework, a position up to the CDR end is in the CDR.
        for k, cdr in enumerate(('1', '2', '3')):
            start, end = cdr_bounds[cdr]
            if position < start:
                return 2 * k
            if position <= end:
                return 2 * k + 1
        return region_count - 1

    def split_chain(chain, sname):
        regions = [[] for _ in range(region_count)]
        residues = Amino[chain][sname]
        for residue_idx, number in enumerate(Num[chain][sname]):
            # A trailing capital letter (e.g. "52A") is dropped so the label
            # can be compared as an integer.
            if 'A' <= number[-1] <= 'Z':
                position = int(number[:-1])
            else:
                position = int(number)
            regions[region_index(position, CHOTHIA_CDR[chain])].append(residues[residue_idx])
        return [''.join(region) for region in regions]

    def getSequenceHL_region(sname):
        heavy = split_chain('H', sname)
        if IF_ONLY_HEAVY:
            return heavy
        return split_chain('L', sname) + heavy

    with open('../results/' + SET_NAME + '_Sequence_region.csv', 'w') as fi:
        if IF_ONLY_HEAVY:
            fi.write(
                'sequence id, heavy chain FW1, heavy chain CDR1, heavy chain FW2, heavy chain CDR2, heavy chain FW3, heavy chain CDR3, heavy chain FW4\n')
        else:
            fi.write('sequence id, light chain FW1, light chain CDR1, light chain FW2, light chain CDR2, light chain FW3, light chain CDR3, light chain FW4, ' +
                     'heavy chain FW1, heavy chain CDR1, heavy chain FW2, heavy chain CDR2, heavy chain FW3, heavy chain CDR3, heavy chain FW4\n')
        for sname in seq_id:
            fi.write(sname + ',' + ','.join(getSequenceHL_region(sname)) + '\n')


def feature_distribution():
    """Write, per feature type, how often each feature value occurs.

    For every sequence the active entries (value ``1``) of its row in
    ``AllFeatureVectors`` are looked up in ``AllFeatureNames`` and counted per
    feature type. The type is the first name component for two-part names
    (``PI_<range>`` -> ``PI``) and the first two components otherwise
    (``Germ_HV_<gene>`` -> ``GermHV``). Names starting with ``Motif`` are not
    tallied. One file ``./Features_distribution_<type>.csv`` is written per
    type, with ``<feature name>,<count>`` rows ordered by decreasing count.

    The previous implementation indexed into per-sequence rows that were
    never filled and therefore raised ``IndexError`` on the first sequence.
    """
    from collections import Counter

    counts_by_type = {}
    for _, vector in zip(seq_id, AllFeatureVectors):
        for idx, flag in enumerate(vector):
            if flag != 1:
                continue
            name = AllFeatureNames[idx]
            if name.startswith('Motif'):
                continue
            parts = name.split('_')
            feat_type = parts[0] if len(parts) < 3 else parts[0] + parts[1]
            counts_by_type.setdefault(feat_type, Counter())[name] += 1

    for feat_type, feat_count in counts_by_type.items():
        with open('./Features_distribution_' + feat_type + '.csv', 'w') as out:
            # most_common() sorts by decreasing count; ties keep first-seen order.
            for value, count in feat_count.most_common():
                out.write(value + ',' + str(count) + '\n')


def feature():
    """Write the names of the active features of every sequence to a CSV.

    Output goes to ``../results/<SET_NAME>_Features.csv``. Each row is the
    sequence id followed by every name in ``AllFeatureNames`` whose entry in
    that sequence's row of ``AllFeatureVectors`` equals ``1``. The header
    lists light-chain columns only when ``IF_ONLY_HEAVY`` is false.
    """
    write_out = []
    for i, sname in enumerate(seq_id):
        active = [AllFeatureNames[idx] for idx, f in enumerate(AllFeatureVectors[i]) if f == 1]
        write_out.append([sname] + active)

    with open('../results/' + SET_NAME + '_Features.csv', 'w') as fi:
        fi.write('sequence id, ')
        if not IF_ONLY_HEAVY:
            fi.write('light chain V region, light chain J region, ')
        fi.write('heavy chain V region, heavy chain J region, ')
        if not IF_ONLY_HEAVY:
            fi.write('Canonical L1, Canonical L2, Canonical L3, ')
        fi.write('Canonical H1, Canonical H2, Canonical H3, ')
        fi.write('PI, frequent positional motif\n')
        for row in write_out:
            fi.write(','.join(row) + '\n')


def correlation_feature():
    """Write the pairwise Pearson correlation of all feature columns to a CSV.

    Builds a DataFrame from ``AllFeatureVectors`` with ``AllFeatureNames`` as
    column labels, takes ``DataFrame.corr()`` and writes one row per unordered
    feature pair to ``../results/Pearson_feature_correlation.csv``.
    """
    data = pd.DataFrame(AllFeatureVectors, columns=AllFeatureNames)
    corr = np.array(data.corr())
    with open('../results/Pearson_feature_correlation.csv', 'w') as fi:
        fi.write('Feature value 1, Feature value 2, Pearson coefficient\n')
        for i in range(len(AllFeatureNames)):
            for j in range(i + 1, len(AllFeatureNames)):
                fi.write(AllFeatureNames[i] + ',' + AllFeatureNames[j] + ',' + str(corr[i][j]) + '\n')
def JaccardCoefficientAnalysis():
    """Write pairwise Jaccard coefficients of feature columns for two row subsets.

    The rows of ``AllFeatureVectors`` are split at ``DatasetSize[0]``: the
    first part feeds the "reference set" column, the rest the "MMP-targeting
    set" column. For every unordered pair of features whose names do not
    start with ``Motif`` the Jaccard coefficient
    ``|a AND b| / |a OR b|`` (``0`` when neither feature is ever active) is
    written to ``../results/<SET_NAME>_Jaccard Feature Coefficient.csv``.

    NOTE(review): the column labels assume the first dataset in
    ``DatasetSize`` is the reference set - confirm against the order returned
    by ``extract.ReadAminoNumGerm``.
    """
    def jaccard_matrix(vectors):
        # A feature is "present" in a row when its entry equals 1.
        present = (np.asarray(vectors) == 1).astype(np.int64)
        both = present.T @ present                      # |a AND b| for every pair
        counts = present.sum(axis=0)
        either = counts[:, None] + counts[None, :] - both   # |a OR b|
        # Pairs that are never active keep the 0.0 from `out`.
        return np.divide(both, either,
                         out=np.zeros(both.shape, dtype=float),
                         where=either > 0)

    # The size may not be a plain int; the seq_id loop casts it the same way.
    PDB_size = int(DatasetSize[0])
    vectors = np.asarray(AllFeatureVectors)

    jac_sim_PDB = jaccard_matrix(vectors[:PDB_size])
    jac_sim_MMP = jaccard_matrix(vectors[PDB_size:])

    # Only non-motif features are reported; keep their original order.
    reported = [idx for idx, name in enumerate(AllFeatureNames)
                if not name.startswith('Motif')]

    with open('../results/' + SET_NAME + '_Jaccard Feature Coefficient.csv', 'w') as fi:
        fi.write('Feature value 1, Feature value 2, Jaccard coefficient for reference set, Jaccard coefficient for MMP-targeting set\n')
        for pos, i in enumerate(reported):
            for j in reported[pos + 1:]:
                fi.write(AllFeatureNames[i] + ',' + AllFeatureNames[j] + ',' +
                         str(jac_sim_PDB[i][j]) + ',' + str(jac_sim_MMP[i][j]) + '\n')


if __name__ == '__main__':
    sequence_raw()
    sequence_region()
    OneHotGerm, GermFeatureNames = extract.GetOneHotGerm(Germ, DatasetSize, DatasetName)
    OneHotCanon, CanonFeatureNames = extract.GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName)
    CDRH3 = extract.GetCDRH3(Amino, Num)
    OneHotPI, PIFeatureNames = extract.GetOneHotPI(CDRH3, DatasetSize, DatasetName)
    MultiHotMotif, MotifFeatureNames = extract.MultiHotMotif(CDRH3, DatasetSize, DatasetName)
    AllFeatureVectors, AllFeatureNames, _, _ = extract.GetFeatureVectors(OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames)

    feature()
    # correlation_feature()
    JaccardCoefficientAnalysis()
Canonical H1,4,1.00E+00,0,, 11 | Canonical H2,0,1.00E+00,0.029372109,,5 12 | Canonical H2,3,1.00E+00,0,, 13 | Canonical H2,5,1.00E+00,0.038609987,,4 14 | Canonical H2,6,1.71E-39,0.090003385,2,2 15 | Canonical H3,0,9.99E-01,0.004348375,,27 16 | Canonical H3,1,9.97E-01,0.003626023,,35 17 | Canonical H3,2,2.79E-03,0.005589529,31.75,20 18 | Canonical H3,3,9.33E-03,0.001775536,38.91, 19 | Canonical L1,0,9.99E-01,0.006550694,,18 20 | Canonical L1,2,1.00E+00,0,, 21 | Canonical L1,3,1.00E+00,0,, 22 | Canonical L1,4,1.45E-03,0.003968341,29.65,33 23 | Canonical L1,5,1.12E-01,0.002855355,,44 24 | Canonical L2,0,1.00E+00,0,, 25 | Canonical L3,0,1.00E+00,0,, 26 | Germ HJ,IGHJ1*01,9.89E-01,0.002300919,, 27 | Germ HJ,IGHJ1*02,1.00E+00,0.001660937,, 28 | Germ HJ,IGHJ1*03,1.00E+00,0,, 29 | Germ HJ,IGHJ2*01,1.00E+00,0.005897943,,19 30 | Germ HJ,IGHJ2*03,1.00E+00,0,, 31 | Germ HJ,IGHJ3*01,1.00E+00,0.015279146,,8 32 | Germ HJ,IGHJ3*02,2.20E-05,0.011174225,15.66,11 33 | Germ HJ,IGHJ4*01,1.00E+00,0.029354208,,6 34 | Germ HJ,IGHJ4*02,2.79E-23,0.051628515,3,3 35 | Germ HJ,IGHJ5*01,9.97E-01,0.007708994,,15 36 | Germ HJ,IGHJ5*02,2.20E-02,0.004523093,45.14,24 37 | Germ HJ,IGHJ6*01,1.58E-16,0.028791732,4,7 38 | Germ HJ,IGHJ6*02,1.00E+00,0.014648606,,9 39 | Germ HJ,IGHJ6*04,7.96E-02,0.002245223,, 40 | Germ HV,IGHV1-12*01,1.00E+00,0,, 41 | Germ HV,IGHV1-14*01,1.00E+00,0,, 42 | Germ HV,IGHV1-15*01,1.00E+00,0,, 43 | Germ HV,IGHV1-18*01,1.00E+00,0,, 44 | Germ HV,IGHV1-18*04,1.00E+00,0.000532851,, 45 | Germ HV,IGHV1-19*01,1.00E+00,0,, 46 | Germ HV,IGHV1-2*02,6.28E-01,0.003518566,,36 47 | Germ HV,IGHV1-2*04,1.00E+00,0,, 48 | Germ HV,IGHV1-22*01,1.00E+00,0,, 49 | Germ HV,IGHV1-24*01,1.00E+00,0,, 50 | Germ HV,IGHV1-26*01,1.00E+00,0,, 51 | Germ HV,IGHV1-3*01,1.00E+00,0.000294827,, 52 | Germ HV,IGHV1-34*01,1.00E+00,0,, 53 | Germ HV,IGHV1-34*02,1.00E+00,0,, 54 | Germ HV,IGHV1-37*01,1.00E+00,0,, 55 | Germ HV,IGHV1-39*01,1.00E+00,0,, 56 | Germ HV,IGHV1-4*01,1.00E+00,0,, 57 | Germ 
HV,IGHV1-42*01,1.00E+00,0,, 58 | Germ HV,IGHV1-46*01,1.00E+00,0.000184878,, 59 | Germ HV,IGHV1-47*01,1.00E+00,0,, 60 | Germ HV,IGHV1-5*01,1.00E+00,0.000178421,, 61 | Germ HV,IGHV1-50*01,1.00E+00,0,, 62 | Germ HV,IGHV1-53*01,1.00E+00,0,, 63 | Germ HV,IGHV1-54*01,1.00E+00,0,, 64 | Germ HV,IGHV1-55*01,1.00E+00,0,, 65 | Germ HV,IGHV1-59*01,1.00E+00,0,, 66 | Germ HV,IGHV1-6*02,1.00E+00,0,, 67 | Germ HV,IGHV1-61*01,1.00E+00,0.000155636,, 68 | Germ HV,IGHV1-63*02,1.00E+00,0,, 69 | Germ HV,IGHV1-66*01,1.00E+00,0,, 70 | Germ HV,IGHV1-67*01,1.00E+00,0,, 71 | Germ HV,IGHV1-69-2*01,1.00E+00,0,, 72 | Germ HV,IGHV1-69*01,1.00E+00,0,, 73 | Germ HV,IGHV1-69*02,1.00E+00,0,, 74 | Germ HV,IGHV1-69*08,1.00E+00,0.000118438,, 75 | Germ HV,IGHV1-69*10,1.00E+00,0,, 76 | Germ HV,IGHV1-69*14,1.00E+00,8.53E-06,, 77 | Germ HV,IGHV1-69D*01,1.00E+00,5.27E-05,, 78 | Germ HV,IGHV1-7*01,1.00E+00,0,, 79 | Germ HV,IGHV1-72*01,1.00E+00,0,, 80 | Germ HV,IGHV1-8*01,1.00E+00,0.000303056,, 81 | Germ HV,IGHV1-81*01,1.00E+00,0,, 82 | Germ HV,IGHV1-82*01,1.00E+00,2.15E-05,, 83 | Germ HV,IGHV1-84*01,1.00E+00,0,, 84 | Germ HV,IGHV1-85*01,1.00E+00,0,, 85 | Germ HV,IGHV1-9*01,1.00E+00,0.001006833,, 86 | Germ HV,IGHV10-1*02,1.00E+00,0,, 87 | Germ HV,IGHV10-3*01,1.00E+00,0,, 88 | Germ HV,IGHV10S3*01,1.00E+00,0,, 89 | Germ HV,IGHV13-2*02,1.00E+00,0,, 90 | Germ HV,IGHV14-1*01,1.00E+00,0,, 91 | Germ HV,IGHV14-1*02,1.00E+00,0,, 92 | Germ HV,IGHV14-3*02,1.00E+00,2.62E-05,, 93 | Germ HV,IGHV14-4*02,1.00E+00,0.000340449,, 94 | Germ HV,IGHV1S14*01,1.00E+00,0,, 95 | Germ HV,IGHV1S29*02,1.00E+00,0,, 96 | Germ HV,IGHV1S45*01,1.00E+00,0,, 97 | Germ HV,IGHV1S53*02,1.00E+00,0.000151308,, 98 | Germ HV,IGHV1S61*01,1.00E+00,0,, 99 | Germ HV,IGHV1S69*01,1.00E+00,0,, 100 | Germ HV,IGHV1S7*01,1.00E+00,6.14E-05,, 101 | Germ HV,IGHV2-2*03,1.00E+00,0,, 102 | Germ HV,IGHV2-5*02,1.00E+00,0,, 103 | Germ HV,IGHV2-5*09,1.00E+00,0,, 104 | Germ HV,IGHV2-6-4*01,1.00E+00,0,, 105 | Germ HV,IGHV2-6-5*01,1.00E+00,0,, 106 | Germ 
HV,IGHV2-6-7*02,1.00E+00,0,, 107 | Germ HV,IGHV2-70*13,1.00E+00,0,, 108 | Germ HV,IGHV2-9*02,5.98E-01,0.002199624,, 109 | Germ HV,IGHV2S33*01,1.00E+00,0,, 110 | Germ HV,IGHV2S5*01,1.00E+00,0,, 111 | Germ HV,IGHV3-1*01,1.00E+00,0,, 112 | Germ HV,IGHV3-11*01,1.00E+00,0,, 113 | Germ HV,IGHV3-11*05,1.00E+00,0,, 114 | Germ HV,IGHV3-13*04,1.00E+00,0,, 115 | Germ HV,IGHV3-15*04,1.00E+00,0,, 116 | Germ HV,IGHV3-15*06,1.00E+00,0,, 117 | Germ HV,IGHV3-15*07,1.00E+00,0.000143431,, 118 | Germ HV,IGHV3-2*02,1.00E+00,0,, 119 | Germ HV,IGHV3-20*01,1.00E+00,0,, 120 | Germ HV,IGHV3-21*03,1.00E+00,0.00016636,, 121 | Germ HV,IGHV3-23*01,1.75E-72,0.419321293,1,1 122 | Germ HV,IGHV3-23*02,1.00E+00,0,, 123 | Germ HV,IGHV3-23*03,5.00E-01,0.003982053,,32 124 | Germ HV,IGHV3-23*04,1.00E+00,0.000206359,, 125 | Germ HV,IGHV3-30*02,1.00E+00,0,, 126 | Germ HV,IGHV3-30*03,1.00E+00,0.000711737,, 127 | Germ HV,IGHV3-30*10,1.00E+00,0,, 128 | Germ HV,IGHV3-30*11,1.00E+00,0.00283518,,45 129 | Germ HV,IGHV3-30*18,1.00E+00,1.79E-05,, 130 | Germ HV,IGHV3-33*01,6.06E-02,0.002916101,,43 131 | Germ HV,IGHV3-33*03,1.00E+00,0,, 132 | Germ HV,IGHV3-48*01,1.00E+00,0.000344457,, 133 | Germ HV,IGHV3-53*01,5.00E-01,0.002345587,, 134 | Germ HV,IGHV3-6*02,1.00E+00,0,, 135 | Germ HV,IGHV3-64*01,1.00E+00,0,, 136 | Germ HV,IGHV3-66*02,1.00E+00,0,, 137 | Germ HV,IGHV3-66*03,1.00E+00,0,, 138 | Germ HV,IGHV3-7*02,1.00E+00,0.00028511,, 139 | Germ HV,IGHV3-72*01,1.00E+00,0,, 140 | Germ HV,IGHV3-73*01,1.00E+00,0,, 141 | Germ HV,IGHV3-74*01,1.00E+00,0,, 142 | Germ HV,IGHV3-74*03,1.00E+00,1.34E-05,, 143 | Germ HV,IGHV3-8*02,1.00E+00,0,, 144 | Germ HV,IGHV3-9*01,1.00E+00,0.00076671,, 145 | Germ HV,IGHV3S1*01,1.00E+00,0,, 146 | Germ HV,IGHV4-1*02,1.00E+00,0,, 147 | Germ HV,IGHV4-2*02,1.00E+00,0,, 148 | Germ HV,IGHV4-30-4*07,1.00E+00,0,, 149 | Germ HV,IGHV4-31*02,1.00E+00,0,, 150 | Germ HV,IGHV4-31*05,1.00E+00,0,, 151 | Germ HV,IGHV4-34*01,1.00E+00,0.000527108,, 152 | Germ HV,IGHV4-38-2*01,1.00E+00,0,, 153 | Germ 
HV,IGHV4-38-2*02,1.00E+00,0,, 154 | Germ HV,IGHV4-39*07,1.00E+00,3.88E-05,, 155 | Germ HV,IGHV4-4*02,1.00E+00,0,, 156 | Germ HV,IGHV4-4*07,1.00E+00,0,, 157 | Germ HV,IGHV4-4*08,1.00E+00,0,, 158 | Germ HV,IGHV4-59*02,1.00E+00,0,, 159 | Germ HV,IGHV4-59*03,1.00E+00,0,, 160 | Germ HV,IGHV4-59*04,1.00E+00,0,, 161 | Germ HV,IGHV4-59*05,1.00E+00,0,, 162 | Germ HV,IGHV4-59*07,1.00E+00,0,, 163 | Germ HV,IGHV4-59*08,1.00E+00,7.30E-05,, 164 | Germ HV,IGHV5-10-1*04,1.00E+00,0.000203388,, 165 | Germ HV,IGHV5-12-2*01,1.00E+00,0,, 166 | Germ HV,IGHV5-12*01,1.00E+00,0,, 167 | Germ HV,IGHV5-15*02,1.00E+00,0,, 168 | Germ HV,IGHV5-17*02,1.00E+00,0.000348178,, 169 | Germ HV,IGHV5-4*02,1.00E+00,0,, 170 | Germ HV,IGHV5-51*01,1.00E+00,0.000351596,, 171 | Germ HV,IGHV5-6-1*01,1.00E+00,0,, 172 | Germ HV,IGHV5-6-2*01,1.00E+00,0.000314301,, 173 | Germ HV,IGHV5-6-3*01,1.00E+00,0,, 174 | Germ HV,IGHV5-9-3*01,1.00E+00,3.18E-06,, 175 | Germ HV,IGHV5-9*01,5.00E-01,0.004882342,,22 176 | Germ HV,IGHV5-9*02,1.00E+00,0.000445259,, 177 | Germ HV,IGHV5-9*03,1.00E+00,0,, 178 | Germ HV,IGHV5S4*01,1.00E+00,0,, 179 | Germ HV,IGHV5S9*01,1.00E+00,0,, 180 | Germ HV,IGHV6-3*02,1.00E+00,0,, 181 | Germ HV,IGHV6-6*01,1.00E+00,0.000228354,, 182 | Germ HV,IGHV6-6*02,1.00E+00,0,, 183 | Germ HV,IGHV6-7*02,1.00E+00,0,, 184 | Germ HV,IGHV7-3*02,1.00E+00,0,, 185 | Germ HV,IGHV7-3*04,1.00E+00,0,, 186 | Germ HV,IGHV7-4-1*02,1.00E+00,0,, 187 | Germ HV,IGHV8-12*01,1.00E+00,0,, 188 | Germ HV,IGHV8-5*01,1.00E+00,0,, 189 | Germ HV,IGHV8-8*01,1.00E+00,0.000214988,, 190 | Germ HV,IGHV9-1*02,1.00E+00,0,, 191 | Germ HV,IGHV9-2-1*01,1.00E+00,0.000557316,, 192 | Germ HV,IGHV9-3-1*01,1.00E+00,0,, 193 | Germ HV,IGHV9-3*01,1.00E+00,0,, 194 | Germ HV,IGHV9-4*02,1.00E+00,0,, 195 | Germ LJ,IGKJ1-1*03,1.00E+00,0,, 196 | Germ LJ,IGKJ1-2*02,1.00E+00,0.001764855,, 197 | Germ LJ,IGKJ1-2*03,1.00E+00,0,, 198 | Germ LJ,IGKJ1*01,9.88E-01,0.00544662,,21 199 | Germ LJ,IGKJ1*02,1.00E+00,6.77E-05,, 200 | Germ LJ,IGKJ2-1*01,1.00E+00,0,, 201 | Germ 
LJ,IGKJ2-3*01,1.00E+00,0,, 202 | Germ LJ,IGKJ2*01,8.87E-01,0.002790228,,46 203 | Germ LJ,IGKJ2*02,1.00E+00,0,, 204 | Germ LJ,IGKJ2*03,1.00E+00,0,, 205 | Germ LJ,IGKJ3*01,1.29E-05,0.014631149,13.14,10 206 | Germ LJ,IGKJ4*01,2.45E-03,0.000513796,31.39, 207 | Germ LJ,IGKJ4*02,1.00E+00,0,, 208 | Germ LJ,IGKJ5*01,9.68E-01,0.001509162,, 209 | Germ LJ,IGLJ1*01,4.22E-01,0.003266883,,37 210 | Germ LJ,IGLJ2*01,1.00E+00,0.000485164,, 211 | Germ LJ,IGLJ3*01,3.28E-01,0.000824574,, 212 | Germ LJ,IGLJ3*02,7.95E-01,0.000667714,, 213 | Germ LJ,IGLJ6*01,2.85E-01,0.000181587,, 214 | Germ LJ,IGLJ7*01,6.60E-01,0,, 215 | Germ LV,IGKV1-110*01,1.00E+00,0,, 216 | Germ LV,IGKV1-110*02,1.00E+00,0,, 217 | Germ LV,IGKV1-117*01,9.41E-01,0,, 218 | Germ LV,IGKV1-117*02,1.00E+00,0,, 219 | Germ LV,IGKV1-12*01,3.58E-03,0.002993798,39.91,40 220 | Germ LV,IGKV1-133*01,1.00E+00,0,, 221 | Germ LV,IGKV1-16*01,1.24E-01,0.008987967,,14 222 | Germ LV,IGKV1-17*01,5.63E-01,0.001137982,, 223 | Germ LV,IGKV1-17*03,5.00E-01,0,, 224 | Germ LV,IGKV1-27*01,5.80E-02,0.001384217,, 225 | Germ LV,IGKV1-39*01,1.00E+00,0.001736249,, 226 | Germ LV,IGKV1-5*01,1.00E+00,0,, 227 | Germ LV,IGKV1-5*03,7.09E-01,0.000669564,, 228 | Germ LV,IGKV1-6*01,2.49E-01,1.51E-05,, 229 | Germ LV,IGKV1-88*01,1.00E+00,0,, 230 | Germ LV,IGKV1-9*01,3.38E-01,0.001313416,, 231 | Germ LV,IGKV1-NL1*01,1.00E+00,0,, 232 | Germ LV,IGKV10-94*02,1.00E+00,4.83E-05,, 233 | Germ LV,IGKV10-94*05,1.00E+00,0,, 234 | Germ LV,IGKV10-96*02,1.00E+00,0.001710628,, 235 | Germ LV,IGKV10-96*04,1.00E+00,1.14E-05,, 236 | Germ LV,IGKV12-41*01,1.00E+00,4.10E-06,, 237 | Germ LV,IGKV12-44*01,1.00E+00,0,, 238 | Germ LV,IGKV12-46*01,1.00E+00,0,, 239 | Germ LV,IGKV12S24*01,1.00E+00,0,, 240 | Germ LV,IGKV13-84*01,1.00E+00,0,, 241 | Germ LV,IGKV14-100*01,1.00E+00,0,, 242 | Germ LV,IGKV14-111*01,1.00E+00,0,, 243 | Germ LV,IGKV14-126*01,1.00E+00,0,, 244 | Germ LV,IGKV16-104*01,1.00E+00,0,, 245 | Germ LV,IGKV17-121*01,1.00E+00,0,, 246 | Germ LV,IGKV17-127*01,1.00E+00,0.000376466,, 
247 | Germ LV,IGKV19-93*02,1.00E+00,0,, 248 | Germ LV,IGKV1D-13*01,7.38E-01,4.45E-07,, 249 | Germ LV,IGKV1D-33*01,8.24E-01,0.000615482,, 250 | Germ LV,IGKV1D-39*01,4.88E-10,0.009239291,5.03,13 251 | Germ LV,IGKV1S10*01,1.00E+00,0,, 252 | Germ LV,IGKV1S11*01,5.75E-01,0,, 253 | Germ LV,IGKV1S12*01,1.00E+00,0,, 254 | Germ LV,IGKV1S14*01,1.00E+00,0,, 255 | Germ LV,IGKV1S15*01,1.00E+00,0,, 256 | Germ LV,IGKV1S17*01,1.00E+00,0,, 257 | Germ LV,IGKV1S2*01,8.07E-01,0,, 258 | Germ LV,IGKV1S2*02,1.00E+00,0,, 259 | Germ LV,IGKV1S22*01,1.00E+00,0,, 260 | Germ LV,IGKV1S24*01,1.00E+00,0,, 261 | Germ LV,IGKV1S3*01,1.00E+00,0,, 262 | Germ LV,IGKV1S3*02,1.00E+00,0,, 263 | Germ LV,IGKV1S5*01,1.00E+00,0,, 264 | Germ LV,IGKV2-109*01,1.00E+00,0,, 265 | Germ LV,IGKV2-109*03,1.00E+00,0,, 266 | Germ LV,IGKV2-112*01,1.00E+00,0,, 267 | Germ LV,IGKV2-137*01,1.00E+00,0,, 268 | Germ LV,IGKV2-28*01,1.36E-01,0.004282447,,28 269 | Germ LV,IGKV2-29*02,1.00E+00,0,, 270 | Germ LV,IGKV2-30*01,6.91E-01,1.29E-05,, 271 | Germ LV,IGKV22S7*01,1.00E+00,0,, 272 | Germ LV,IGKV2D-29*02,1.00E+00,0,, 273 | Germ LV,IGKV2S3*01,1.00E+00,0,, 274 | Germ LV,IGKV3-1*01,1.00E+00,5.72E-05,, 275 | Germ LV,IGKV3-10*01,1.00E+00,0,, 276 | Germ LV,IGKV3-11*01,5.67E-04,0.000541447,25.77, 277 | Germ LV,IGKV3-11*02,5.00E-01,0,, 278 | Germ LV,IGKV3-12*01,1.00E+00,0,, 279 | Germ LV,IGKV3-2*01,1.00E+00,0,, 280 | Germ LV,IGKV3-20*01,4.99E-05,0.007138248,15.98,16 281 | Germ LV,IGKV3-3*01,1.00E+00,0,, 282 | Germ LV,IGKV3-4*01,1.00E+00,0,, 283 | Germ LV,IGKV3-5*01,1.00E+00,0.00027996,, 284 | Germ LV,IGKV3-7*01,1.00E+00,0,, 285 | Germ LV,IGKV3D-11*01,1.00E+00,0,, 286 | Germ LV,IGKV3D-15*01,3.74E-01,0.002168255,, 287 | Germ LV,IGKV3D-20*01,2.49E-01,3.88E-06,, 288 | Germ LV,IGKV3S3*01,5.00E-01,0,, 289 | Germ LV,IGKV3S9*01,1.00E+00,0,, 290 | Germ LV,IGKV4-1*01,7.34E-01,0.00259438,,49 291 | Germ LV,IGKV4-53*01,1.00E+00,0,, 292 | Germ LV,IGKV4-55*01,1.00E+00,0,, 293 | Germ LV,IGKV4-57-1*01,1.00E+00,0,, 294 | Germ LV,IGKV4-57*01,1.00E+00,0,, 
295 | Germ LV,IGKV4-59*01,1.00E+00,0,, 296 | Germ LV,IGKV4-61*01,1.00E+00,0,, 297 | Germ LV,IGKV4-63*01,1.00E+00,0,, 298 | Germ LV,IGKV4-68*01,1.00E+00,0,, 299 | Germ LV,IGKV4-70*01,1.00E+00,0,, 300 | Germ LV,IGKV4-72*01,1.00E+00,0,, 301 | Germ LV,IGKV4-74*01,1.00E+00,0,, 302 | Germ LV,IGKV4-79*01,1.00E+00,0,, 303 | Germ LV,IGKV4-80*01,1.00E+00,0,, 304 | Germ LV,IGKV4-81*01,1.00E+00,0,, 305 | Germ LV,IGKV4-86*01,1.00E+00,0,, 306 | Germ LV,IGKV4-91*01,1.00E+00,0,, 307 | Germ LV,IGKV5-39*01,1.00E+00,0.000331762,, 308 | Germ LV,IGKV5-43*01,1.00E+00,0,, 309 | Germ LV,IGKV5-48*01,1.00E+00,0.000960814,, 310 | Germ LV,IGKV6-14*01,1.00E+00,2.54E-05,, 311 | Germ LV,IGKV6-15*01,1.00E+00,4.80E-05,, 312 | Germ LV,IGKV6-17*01,8.76E-01,0.002060703,, 313 | Germ LV,IGKV6-20*01,1.00E+00,0.000203542,, 314 | Germ LV,IGKV6-21*01,1.00E+00,0.000638102,, 315 | Germ LV,IGKV6-21*02,1.00E+00,0,, 316 | Germ LV,IGKV6-23*01,1.00E+00,0,, 317 | Germ LV,IGKV6-25*01,1.00E+00,0,, 318 | Germ LV,IGKV6-32*01,1.00E+00,0,, 319 | Germ LV,IGKV6-32*02,1.00E+00,0,, 320 | Germ LV,IGKV8-19*01,1.00E+00,0,, 321 | Germ LV,IGKV8-21*01,1.00E+00,0,, 322 | Germ LV,IGKV8-24*01,1.00E+00,9.87E-06,, 323 | Germ LV,IGKV8-27*01,1.00E+00,0,, 324 | Germ LV,IGKV8-28*01,1.00E+00,0,, 325 | Germ LV,IGKV8-30*01,1.00E+00,0,, 326 | Germ LV,IGKV9-120*01,1.00E+00,0,, 327 | Germ LV,IGKV9-124*01,1.00E+00,0,, 328 | Germ LV,IGLV1-10*01,1.00E+00,0,, 329 | Germ LV,IGLV1-36*01,1.00E+00,0,, 330 | Germ LV,IGLV1-40*01,5.65E-01,0,, 331 | Germ LV,IGLV1-40*03,1.00E+00,0,, 332 | Germ LV,IGLV1-44*01,2.20E-01,0.000696396,, 333 | Germ LV,IGLV1-47*01,1.64E-01,0.001464818,, 334 | Germ LV,IGLV1-47*02,2.70E-01,0.001099102,, 335 | Germ LV,IGLV1-51*01,1.00E+00,0.00029127,, 336 | Germ LV,IGLV1-51*02,1.00E+00,0.000544347,, 337 | Germ LV,IGLV1*01,1.00E+00,0.000329822,, 338 | Germ LV,IGLV1*02,1.00E+00,0,, 339 | Germ LV,IGLV2-11*01,2.49E-01,0.001072122,, 340 | Germ LV,IGLV2-14*01,2.57E-01,0.002751961,,47 341 | Germ LV,IGLV2-14*02,1.00E+00,0,, 342 | Germ 
LV,IGLV2-23*01,5.78E-01,4.83E-06,, 343 | Germ LV,IGLV2-23*02,1.00E+00,0,, 344 | Germ LV,IGLV2-8*01,1.77E-01,0.000704107,, 345 | Germ LV,IGLV2S1*01,3.14E-01,0.000799662,, 346 | Germ LV,IGLV2S9*01,1.00E+00,0,, 347 | Germ LV,IGLV3-1*01,7.82E-01,0.00093926,, 348 | Germ LV,IGLV3-10*01,1.00E+00,0,, 349 | Germ LV,IGLV3-19*01,5.78E-01,4.91E-07,, 350 | Germ LV,IGLV3-21*01,1.00E+00,0,, 351 | Germ LV,IGLV3-21*02,2.23E-01,0.0010809,, 352 | Germ LV,IGLV3-21*03,1.00E+00,0,, 353 | Germ LV,IGLV3-25*03,1.00E+00,0,, 354 | Germ LV,IGLV3-9*02,1.00E+00,0,, 355 | Germ LV,IGLV3*01,1.00E+00,2.63E-06,, 356 | Germ LV,IGLV5S10*01,1.00E+00,0,, 357 | Germ LV,IGLV6-57*01,5.00E-01,0,, 358 | Germ LV,IGLV6-57*02,1.00E+00,0,, 359 | Germ LV,IGLV7-43*01,1.00E+00,0,, 360 | Motif,10_FD,6.83E-03,0.001842351,35.99, 361 | Motif,10_MD,2.44E-02,0.002656346,46.63,48 362 | Motif,10_NG,7.46E-04,0,26.18, 363 | Motif,10_YY,3.01E-03,0.001841431,30.18, 364 | Motif,2_AY,2.45E-03,0.000718347,31.58, 365 | Motif,2_AYG,5.19E-07,0.004093965,9.95,30 366 | Motif,2_GG,3.29E-01,0.00048807,, 367 | Motif,2_YG,9.50E-06,0.004157068,11.55,29 368 | Motif,2_YY,3.01E-03,2.95E-05,36.18, 369 | Motif,3_YG,9.50E-06,0.000856833,10.55, 370 | Motif,3_YY,3.01E-03,0.00297028,33.18,41 371 | Motif,4_GD,1.82E-03,0.000754301,28.08, 372 | Motif,4_GDY,1.03E-05,3.70E-06,12.05, 373 | Motif,4_SV,9.73E-01,0,, 374 | Motif,4_SVT,1.00E+00,1.10E-05,, 375 | Motif,4_YD,5.54E-02,0.004696185,,23 376 | Motif,4_YY,3.01E-03,0.000362391,31.18, 377 | Motif,5_DY,3.58E-04,0.000492204,22.06, 378 | Motif,5_DYV,1.67E-05,0,14.44, 379 | Motif,5_FD,6.83E-03,8.46E-06,37.99, 380 | Motif,5_YY,3.01E-03,0.001704189,29.18, 381 | Motif,6_YA,1.00E+00,0.004067616,,31 382 | Motif,6_YD,5.54E-02,0.000727645,, 383 | Motif,6_YF,9.15E-01,0.000529252,, 384 | Motif,6_YV,7.20E-03,0.004395786,37.55,25 385 | Motif,6_YVG,2.20E-05,0.007033143,15.02,17 386 | Motif,6_YY,3.01E-03,0.002312894,34.18, 387 | Motif,7_FD,6.83E-03,0.001472312,38.99, 388 | Motif,7_MD,2.44E-02,0.000534293,47.63, 389 | 
Motif,7_VG,1.10E-04,0.004383491,18.22,26 390 | Motif,7_VGW,5.19E-07,0,8.75, 391 | Motif,8_GW,5.89E-04,0.000457018,22.57, 392 | Motif,8_GWN,1.11E-06,0.003937073,11.95,34 393 | Motif,8_MD,2.44E-02,0.002503496,45.63,50 394 | Motif,8_SA,1.00E+00,0.000719992,, 395 | Motif,8_YF,9.15E-01,6.92E-06,, 396 | Motif,8_YY,3.01E-03,0.00200915,32.18, 397 | Motif,9_AM,1.00E+00,0,, 398 | Motif,9_AMD,1.00E+00,0,, 399 | Motif,9_FD,6.83E-03,0.000525799,36.99, 400 | Motif,9_WN,3.34E-05,1.01E-05,16.75, 401 | Motif,9_YY,3.01E-03,0.010646544,35.18,12 402 | PI,0.0-3.5,8.32E-03,0.000693414,39.16, 403 | PI,3.5-3.9375,3.53E-03,0.001465018,34.51, 404 | PI,3.9375-4.375,6.74E-01,0.002927121,,42 405 | PI,4.375-5.25,9.54E-01,0.000670093,, 406 | PI,5.25-5.6875,9.80E-01,7.98E-05,, 407 | PI,5.6875-6.125,9.15E-01,0.003030013,,39 408 | PI,6.125-7.0,9.89E-01,0.001250475,, 409 | PI,7.0-14.0,9.92E-01,0.001905193,, -------------------------------------------------------------------------------- /results/MMP-IGHV/IGHV_Jaccard Feature Coefficient.csv: -------------------------------------------------------------------------------- 1 | Feature value 1, Feature value 2, Jaccard coefficient for MMP-IGHV-targeting set, Jaccard coefficient for IGHV-reference set 2 | Canonical_H1_0,Canonical_H2_0,0,0 3 | Canonical_H1_0,Canonical_H2_6,0,0.003383277 4 | Canonical_H1_0,Canonical_H3_0,0,0.006116208 5 | Canonical_H1_0,Canonical_H3_1,0,0.004878049 6 | Canonical_H1_0,Canonical_H3_2,0,0.004938272 7 | Canonical_H1_0,Canonical_H3_3,0,0.002617801 8 | Canonical_H1_0,PI_0.0-3.5,0,0.002770083 9 | Canonical_H1_0,PI_3.5-3.9375,0,0.004587156 10 | Canonical_H1_0,PI_3.9375-4.375,0,0.001090513 11 | Canonical_H1_0,PI_4.375-4.8125,0,0 12 | Canonical_H1_0,PI_4.8125-5.25,0,0.007092199 13 | Canonical_H1_0,PI_5.25-5.6875,0,0.005813953 14 | Canonical_H1_0,PI_5.6875-6.125,0,0.003496503 15 | Canonical_H1_0,PI_6.125-7.0,0,0.004255319 16 | Canonical_H1_0,PI_7.0-14.0,0,0 17 | Canonical_H1_1,Canonical_H1_0,0,0 18 | 
Canonical_H1_1,Canonical_H1_2,0,0 19 | Canonical_H1_1,Canonical_H2_0,0.052238806,0.044938615 20 | Canonical_H1_1,Canonical_H2_6,0.947761194,0.951523546 21 | Canonical_H1_1,Canonical_H3_0,0.141791045,0.072238944 22 | Canonical_H1_1,Canonical_H3_1,0.074626866,0.044243688 23 | Canonical_H1_1,Canonical_H3_2,0.246268657,0.09029868 24 | Canonical_H1_1,Canonical_H3_3,0.537313433,0.791262136 25 | Canonical_H1_1,PI_0.0-3.5,0.194029851,0.080379893 26 | Canonical_H1_1,PI_3.5-3.9375,0.402985075,0.299097849 27 | Canonical_H1_1,PI_3.9375-4.375,0.21641791,0.208893006 28 | Canonical_H1_1,PI_4.375-4.8125,0.02238806,0.052826691 29 | Canonical_H1_1,PI_4.8125-5.25,0.02238806,0.062065771 30 | Canonical_H1_1,PI_5.25-5.6875,0.02238806,0.03659949 31 | Canonical_H1_1,PI_5.6875-6.125,0.074626866,0.129226494 32 | Canonical_H1_1,PI_6.125-7.0,0.014925373,0.050949514 33 | Canonical_H1_1,PI_7.0-14.0,0.029850746,0.079240037 34 | Canonical_H1_2,Canonical_H1_0,0,0 35 | Canonical_H1_2,Canonical_H2_0,0,0 36 | Canonical_H1_2,Canonical_H2_6,0,0.000241663 37 | Canonical_H1_2,Canonical_H3_0,0,0 38 | Canonical_H1_2,Canonical_H3_1,0,0 39 | Canonical_H1_2,Canonical_H3_2,0,0.002544529 40 | Canonical_H1_2,Canonical_H3_3,0,0 41 | Canonical_H1_2,PI_0.0-3.5,0,0 42 | Canonical_H1_2,PI_3.5-3.9375,0,0.000769231 43 | Canonical_H1_2,PI_3.9375-4.375,0,0 44 | Canonical_H1_2,PI_4.375-4.8125,0,0 45 | Canonical_H1_2,PI_4.8125-5.25,0,0 46 | Canonical_H1_2,PI_5.25-5.6875,0,0 47 | Canonical_H1_2,PI_5.6875-6.125,0,0 48 | Canonical_H1_2,PI_6.125-7.0,0,0 49 | Canonical_H1_2,PI_7.0-14.0,0,0 50 | Canonical_H1_3,Canonical_H1_0,0,0 51 | Canonical_H1_3,Canonical_H1_1,0,0 52 | Canonical_H1_3,Canonical_H1_2,0,0 53 | Canonical_H1_3,Canonical_H2_0,0,0.005102041 54 | Canonical_H1_3,Canonical_H2_6,0,0.000241604 55 | Canonical_H1_3,Canonical_H3_0,0,0.003164557 56 | Canonical_H1_3,Canonical_H3_1,0,0 57 | Canonical_H1_3,Canonical_H3_2,0,0 58 | Canonical_H1_3,Canonical_H3_3,0,0.000291206 59 | Canonical_H1_3,PI_0.0-3.5,0,0 60 | 
Canonical_H1_3,PI_3.5-3.9375,0,0 61 | Canonical_H1_3,PI_3.9375-4.375,0,0.001104972 62 | Canonical_H1_3,PI_4.375-4.8125,0,0 63 | Canonical_H1_3,PI_4.8125-5.25,0,0 64 | Canonical_H1_3,PI_5.25-5.6875,0,0 65 | Canonical_H1_3,PI_5.6875-6.125,0,0 66 | Canonical_H1_3,PI_6.125-7.0,0,0.004484305 67 | Canonical_H1_3,PI_7.0-14.0,0,0 68 | Canonical_H2_0,Canonical_H3_0,0,0.040816327 69 | Canonical_H2_0,Canonical_H3_1,0,0.043126685 70 | Canonical_H2_0,Canonical_H3_2,0.052631579,0.022608696 71 | Canonical_H2_0,Canonical_H3_3,0.067567568,0.041929925 72 | Canonical_H2_0,PI_0.0-3.5,0.064516129,0.018761726 73 | Canonical_H2_0,PI_3.5-3.9375,0.051724138,0.049122807 74 | Canonical_H2_0,PI_3.9375-4.375,0.028571429,0.036792453 75 | Canonical_H2_0,PI_4.375-4.8125,0,0.02173913 76 | Canonical_H2_0,PI_4.8125-5.25,0,0.035634744 77 | Canonical_H2_0,PI_5.25-5.6875,0,0.014326648 78 | Canonical_H2_0,PI_5.6875-6.125,0,0.030013643 79 | Canonical_H2_0,PI_6.125-7.0,0.125,0.037313433 80 | Canonical_H2_0,PI_7.0-14.0,0,0.017045455 81 | Canonical_H2_6,Canonical_H2_0,0,0 82 | Canonical_H2_6,Canonical_H3_0,0.149606299,0.070947571 83 | Canonical_H2_6,Canonical_H3_1,0.078740157,0.042368801 84 | Canonical_H2_6,Canonical_H3_2,0.240310078,0.091544206 85 | Canonical_H2_6,Canonical_H3_3,0.507575758,0.767273576 86 | Canonical_H2_6,PI_0.0-3.5,0.186046512,0.081485053 87 | Canonical_H2_6,PI_3.5-3.9375,0.392307692,0.29230038 88 | Canonical_H2_6,PI_3.9375-4.375,0.21875,0.207086426 89 | Canonical_H2_6,PI_4.375-4.8125,0.023622047,0.05280926 90 | Canonical_H2_6,PI_4.8125-5.25,0.023622047,0.061145883 91 | Canonical_H2_6,PI_5.25-5.6875,0.023622047,0.037171132 92 | Canonical_H2_6,PI_5.6875-6.125,0.078740157,0.129326923 93 | Canonical_H2_6,PI_6.125-7.0,0.0078125,0.049843487 94 | Canonical_H2_6,PI_7.0-14.0,0.031496063,0.080299011 95 | Canonical_H3_0,Canonical_H3_1,0,0 96 | Canonical_H3_0,Canonical_H3_3,0,0 97 | Canonical_H3_0,PI_0.0-3.5,0.022727273,0.016871166 98 | Canonical_H3_0,PI_3.5-3.9375,0.140625,0.079545455 99 | 
Canonical_H3_0,PI_3.9375-4.375,0.043478261,0.043664384 100 | Canonical_H3_0,PI_4.375-4.8125,0,0.011173184 101 | Canonical_H3_0,PI_4.8125-5.25,0.1,0.026315789 102 | Canonical_H3_0,PI_5.25-5.6875,0.1,0.025974026 103 | Canonical_H3_0,PI_5.6875-6.125,0.035714286,0.069682152 104 | Canonical_H3_0,PI_6.125-7.0,0,0.036679537 105 | Canonical_H3_0,PI_7.0-14.0,0.095238095,0.039556962 106 | Canonical_H3_1,Canonical_H3_3,0,0 107 | Canonical_H3_1,PI_0.0-3.5,0.090909091,0.018867925 108 | Canonical_H3_1,PI_3.5-3.9375,0.032258065,0.037552156 109 | Canonical_H3_1,PI_3.9375-4.375,0.026315789,0.046800382 110 | Canonical_H3_1,PI_4.375-4.8125,0,0.004784689 111 | Canonical_H3_1,PI_4.8125-5.25,0,0.01986755 112 | Canonical_H3_1,PI_5.25-5.6875,0,0.03539823 113 | Canonical_H3_1,PI_5.6875-6.125,0.25,0.034387895 114 | Canonical_H3_1,PI_6.125-7.0,0,0.027295285 115 | Canonical_H3_1,PI_7.0-14.0,0,0.038910506 116 | Canonical_H3_2,Canonical_H3_0,0,0 117 | Canonical_H3_2,Canonical_H3_1,0,0 118 | Canonical_H3_2,Canonical_H3_3,0,0 119 | Canonical_H3_2,PI_0.0-3.5,0.092592593,0.042194093 120 | Canonical_H3_2,PI_3.5-3.9375,0.225352113,0.068813131 121 | Canonical_H3_2,PI_3.9375-4.375,0.127272727,0.054471545 122 | Canonical_H3_2,PI_4.375-4.8125,0.028571429,0.019704433 123 | Canonical_H3_2,PI_4.8125-5.25,0,0.055732484 124 | Canonical_H3_2,PI_5.25-5.6875,0,0.041509434 125 | Canonical_H3_2,PI_5.6875-6.125,0.048780488,0.049559471 126 | Canonical_H3_2,PI_6.125-7.0,0.029411765,0.040609137 127 | Canonical_H3_2,PI_7.0-14.0,0.027777778,0.071428571 128 | Canonical_H3_3,PI_0.0-3.5,0.209876543,0.085246843 129 | Canonical_H3_3,PI_3.5-3.9375,0.272727273,0.274024226 130 | Canonical_H3_3,PI_3.9375-4.375,0.231707317,0.204722222 131 | Canonical_H3_3,PI_4.375-4.8125,0.02739726,0.060237475 132 | Canonical_H3_3,PI_4.8125-5.25,0.013513514,0.060423826 133 | Canonical_H3_3,PI_5.25-5.6875,0.013513514,0.032480598 134 | Canonical_H3_3,PI_5.6875-6.125,0.037974684,0.121629213 135 | Canonical_H3_3,PI_6.125-7.0,0.01369863,0.04817895 136 
| Canonical_H3_3,PI_7.0-14.0,0.013333333,0.070314715 137 | Germ_HJ_IGHJ1*01,Canonical_H1_0,0,0 138 | Germ_HJ_IGHJ1*01,Canonical_H1_1,0.007462687,0.007645968 139 | Germ_HJ_IGHJ1*01,Canonical_H1_2,0,0 140 | Germ_HJ_IGHJ1*01,Canonical_H1_3,0,0 141 | Germ_HJ_IGHJ1*01,Canonical_H2_0,0,0.008849558 142 | Germ_HJ_IGHJ1*01,Canonical_H2_6,0.007874016,0.007487923 143 | Germ_HJ_IGHJ1*01,Canonical_H3_0,0,0.01754386 144 | Germ_HJ_IGHJ1*01,Canonical_H3_1,0,0.022727273 145 | Germ_HJ_IGHJ1*01,Canonical_H3_2,0.03030303,0.007092199 146 | Germ_HJ_IGHJ1*01,Canonical_H3_3,0,0.005512039 147 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ2*01,0,0 148 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ4*02,0,0 149 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ5*02,0,0 150 | Germ_HJ_IGHJ1*01,Germ_HJ_IGHJ6*01,0,0 151 | Germ_HJ_IGHJ1*01,PI_0.0-3.5,0,0 152 | Germ_HJ_IGHJ1*01,PI_3.5-3.9375,0,0.00150263 153 | Germ_HJ_IGHJ1*01,PI_3.9375-4.375,0,0.002139037 154 | Germ_HJ_IGHJ1*01,PI_4.375-4.8125,0,0.007722008 155 | Germ_HJ_IGHJ1*01,PI_4.8125-5.25,0,0.027118644 156 | Germ_HJ_IGHJ1*01,PI_5.25-5.6875,0,0.010526316 157 | Germ_HJ_IGHJ1*01,PI_5.6875-6.125,0,0.003384095 158 | Germ_HJ_IGHJ1*01,PI_6.125-7.0,0,0.036585366 159 | Germ_HJ_IGHJ1*01,PI_7.0-14.0,0.25,0.016260163 160 | Germ_HJ_IGHJ2*01,Canonical_H1_0,0,0 161 | Germ_HJ_IGHJ2*01,Canonical_H1_1,0.014925373,0.003707136 162 | Germ_HJ_IGHJ2*01,Canonical_H1_2,0,0 163 | Germ_HJ_IGHJ2*01,Canonical_H1_3,0,0 164 | Germ_HJ_IGHJ2*01,Canonical_H2_0,0,0.004761905 165 | Germ_HJ_IGHJ2*01,Canonical_H2_6,0.015748031,0.003624064 166 | Germ_HJ_IGHJ2*01,Canonical_H3_0,0,0.012232416 167 | Germ_HJ_IGHJ2*01,Canonical_H3_1,0,0.009708738 168 | Germ_HJ_IGHJ2*01,Canonical_H3_2,0.060606061,0.009876543 169 | Germ_HJ_IGHJ2*01,Canonical_H3_3,0,0.001742666 170 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ4*02,0,0 171 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ5*02,0,0 172 | Germ_HJ_IGHJ2*01,Germ_HJ_IGHJ6*01,0,0 173 | Germ_HJ_IGHJ2*01,PI_0.0-3.5,0,0 174 | Germ_HJ_IGHJ2*01,PI_3.5-3.9375,0,0.00152207 175 | 
Germ_HJ_IGHJ2*01,PI_3.9375-4.375,0.068965517,0.003271538 176 | Germ_HJ_IGHJ2*01,PI_4.375-4.8125,0,0 177 | Germ_HJ_IGHJ2*01,PI_4.8125-5.25,0,0.003508772 178 | Germ_HJ_IGHJ2*01,PI_5.25-5.6875,0,0.005747126 179 | Germ_HJ_IGHJ2*01,PI_5.6875-6.125,0,0.003484321 180 | Germ_HJ_IGHJ2*01,PI_6.125-7.0,0,0.021459227 181 | Germ_HJ_IGHJ2*01,PI_7.0-14.0,0,0.005617978 182 | Germ_HJ_IGHJ3*01,Canonical_H1_0,0,0 183 | Germ_HJ_IGHJ3*01,Canonical_H1_1,0.007462687,0.002548656 184 | Germ_HJ_IGHJ3*01,Canonical_H1_2,0,0 185 | Germ_HJ_IGHJ3*01,Canonical_H1_3,0,0 186 | Germ_HJ_IGHJ3*01,Canonical_H2_0,0,0.009803922 187 | Germ_HJ_IGHJ3*01,Canonical_H2_6,0.007874016,0.002173913 188 | Germ_HJ_IGHJ3*01,Canonical_H3_0,0,0 189 | Germ_HJ_IGHJ3*01,Canonical_H3_1,0,0 190 | Germ_HJ_IGHJ3*01,Canonical_H3_2,0,0 191 | Germ_HJ_IGHJ3*01,Canonical_H3_3,0.013888889,0.003204195 192 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ1*01,0,0 193 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ2*01,0,0 194 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ4*02,0,0 195 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ5*01,0,0 196 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ5*02,0,0 197 | Germ_HJ_IGHJ3*01,Germ_HJ_IGHJ6*01,0,0 198 | Germ_HJ_IGHJ3*01,PI_0.0-3.5,0.038461538,0.008426966 199 | Germ_HJ_IGHJ3*01,PI_3.5-3.9375,0,0.003060444 200 | Germ_HJ_IGHJ3*01,PI_3.9375-4.375,0,0.002190581 201 | Germ_HJ_IGHJ3*01,PI_4.375-4.8125,0,0.004201681 202 | Germ_HJ_IGHJ3*01,PI_4.8125-5.25,0,0 203 | Germ_HJ_IGHJ3*01,PI_5.25-5.6875,0,0 204 | Germ_HJ_IGHJ3*01,PI_5.6875-6.125,0,0 205 | Germ_HJ_IGHJ3*01,PI_6.125-7.0,0,0 206 | Germ_HJ_IGHJ3*01,PI_7.0-14.0,0,0.002840909 207 | Germ_HJ_IGHJ3*02,Canonical_H1_0,0,0 208 | Germ_HJ_IGHJ3*02,Canonical_H1_1,0.164179104,0.007877665 209 | Germ_HJ_IGHJ3*02,Canonical_H1_2,0,0 210 | Germ_HJ_IGHJ3*02,Canonical_H1_3,0,0 211 | Germ_HJ_IGHJ3*02,Canonical_H2_0,0.035714286,0.004385965 212 | Germ_HJ_IGHJ3*02,Canonical_H2_6,0.1640625,0.00797294 213 | Germ_HJ_IGHJ3*02,Canonical_H3_0,0.138888889,0.002873563 214 | Germ_HJ_IGHJ3*02,Canonical_H3_1,0.066666667,0.013452915 215 | 
Germ_HJ_IGHJ3*02,Canonical_H3_2,0.078431373,0.007075472 216 | Germ_HJ_IGHJ3*02,Canonical_H3_3,0.13253012,0.007848837 217 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ1*01,0,0 218 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ2*01,0,0 219 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ3*01,0,0 220 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ4*02,0,0 221 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ5*01,0,0 222 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ5*02,0,0 223 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ6*01,0,0 224 | Germ_HJ_IGHJ3*02,Germ_HJ_IGHJ6*04,0,0 225 | Germ_HJ_IGHJ3*02,PI_0.0-3.5,0.090909091,0.013262599 226 | Germ_HJ_IGHJ3*02,PI_3.5-3.9375,0.151515152,0.014448669 227 | Germ_HJ_IGHJ3*02,PI_3.9375-4.375,0.159090909,0.003208556 228 | Germ_HJ_IGHJ3*02,PI_4.375-4.8125,0,0.011583012 229 | Germ_HJ_IGHJ3*02,PI_4.8125-5.25,0,0 230 | Germ_HJ_IGHJ3*02,PI_5.25-5.6875,0,0.005208333 231 | Germ_HJ_IGHJ3*02,PI_5.6875-6.125,0.032258065,0.005076142 232 | Germ_HJ_IGHJ3*02,PI_6.125-7.0,0,0 233 | Germ_HJ_IGHJ3*02,PI_7.0-14.0,0,0 234 | Germ_HJ_IGHJ4*02,Canonical_H1_0,0,0.001960784 235 | Germ_HJ_IGHJ4*02,Canonical_H1_1,0.358208955,0.350694444 236 | Germ_HJ_IGHJ4*02,Canonical_H1_2,0,0 237 | Germ_HJ_IGHJ4*02,Canonical_H1_3,0,0.000657895 238 | Germ_HJ_IGHJ4*02,Canonical_H2_0,0.057692308,0.042579075 239 | Germ_HJ_IGHJ4*02,Canonical_H2_6,0.346153846,0.344344106 240 | Germ_HJ_IGHJ4*02,Canonical_H3_0,0.155172414,0.11965812 241 | Germ_HJ_IGHJ4*02,Canonical_H3_1,0.094339623,0.079495268 242 | Germ_HJ_IGHJ4*02,Canonical_H3_2,0.08,0.06935123 243 | Germ_HJ_IGHJ4*02,Canonical_H3_3,0.304347826,0.276617685 244 | Germ_HJ_IGHJ4*02,Germ_HJ_IGHJ5*02,0,0 245 | Germ_HJ_IGHJ4*02,Germ_HJ_IGHJ6*01,0,0 246 | Germ_HJ_IGHJ4*02,PI_0.0-3.5,0.104477612,0.058390023 247 | Germ_HJ_IGHJ4*02,PI_3.5-3.9375,0.2,0.185449958 248 | Germ_HJ_IGHJ4*02,PI_3.9375-4.375,0.203125,0.168837434 249 | Germ_HJ_IGHJ4*02,PI_4.375-4.8125,0.02,0.042985075 250 | Germ_HJ_IGHJ4*02,PI_4.8125-5.25,0.0625,0.055457227 251 | Germ_HJ_IGHJ4*02,PI_5.25-5.6875,0.02,0.03198032 252 | Germ_HJ_IGHJ4*02,PI_5.6875-6.125,0.054545455,0.118343195 
253 | Germ_HJ_IGHJ4*02,PI_6.125-7.0,0,0.046274038 254 | Germ_HJ_IGHJ4*02,PI_7.0-14.0,0.06122449,0.062821245 255 | Germ_HJ_IGHJ5*01,Canonical_H1_0,0,0 256 | Germ_HJ_IGHJ5*01,Canonical_H1_1,0.007462687,0.041241891 257 | Germ_HJ_IGHJ5*01,Canonical_H1_2,0,0 258 | Germ_HJ_IGHJ5*01,Canonical_H1_3,0,0 259 | Germ_HJ_IGHJ5*01,Canonical_H2_0,0,0.041899441 260 | Germ_HJ_IGHJ5*01,Canonical_H2_6,0.007874016,0.039248736 261 | Germ_HJ_IGHJ5*01,Canonical_H3_0,0,0.040084388 262 | Germ_HJ_IGHJ5*01,Canonical_H3_1,0,0.022099448 263 | Germ_HJ_IGHJ5*01,Canonical_H3_2,0.03030303,0.010619469 264 | Germ_HJ_IGHJ5*01,Canonical_H3_3,0,0.041834968 265 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ1*01,0,0 266 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ2*01,0,0 267 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ4*02,0,0 268 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ5*02,0,0 269 | Germ_HJ_IGHJ5*01,Germ_HJ_IGHJ6*01,0,0 270 | Germ_HJ_IGHJ5*01,PI_0.0-3.5,0,0.017408124 271 | Germ_HJ_IGHJ5*01,PI_3.5-3.9375,0,0.041578576 272 | Germ_HJ_IGHJ5*01,PI_3.9375-4.375,0,0.045410628 273 | Germ_HJ_IGHJ5*01,PI_4.375-4.8125,0,0.01754386 274 | Germ_HJ_IGHJ5*01,PI_4.8125-5.25,0,0.037037037 275 | Germ_HJ_IGHJ5*01,PI_5.25-5.6875,0,0.018126888 276 | Germ_HJ_IGHJ5*01,PI_5.6875-6.125,0.1,0.026425591 277 | Germ_HJ_IGHJ5*01,PI_6.125-7.0,0,0.015228426 278 | Germ_HJ_IGHJ5*01,PI_7.0-14.0,0,0.017612524 279 | Germ_HJ_IGHJ5*02,Canonical_H1_0,0,0 280 | Germ_HJ_IGHJ5*02,Canonical_H1_1,0.067164179,0.038452629 281 | Germ_HJ_IGHJ5*02,Canonical_H1_2,0,0 282 | Germ_HJ_IGHJ5*02,Canonical_H1_3,0,0.005952381 283 | Germ_HJ_IGHJ5*02,Canonical_H2_0,0.066666667,0.025495751 284 | Germ_HJ_IGHJ5*02,Canonical_H2_6,0.0625,0.038099831 285 | Germ_HJ_IGHJ5*02,Canonical_H3_0,0.037037037,0.02771855 286 | Germ_HJ_IGHJ5*02,Canonical_H3_1,0.055555556,0.022792023 287 | Germ_HJ_IGHJ5*02,Canonical_H3_2,0.024390244,0.02003643 288 | Germ_HJ_IGHJ5*02,Canonical_H3_3,0.08,0.038961039 289 | Germ_HJ_IGHJ5*02,Germ_HJ_IGHJ6*01,0,0 290 | Germ_HJ_IGHJ5*02,PI_0.0-3.5,0,0.038306452 291 | 
Germ_HJ_IGHJ5*02,PI_3.5-3.9375,0.016129032,0.034555712 292 | Germ_HJ_IGHJ5*02,PI_3.9375-4.375,0,0.024880383 293 | Germ_HJ_IGHJ5*02,PI_4.375-4.8125,0.090909091,0.010230179 294 | Germ_HJ_IGHJ5*02,PI_4.8125-5.25,0,0.025821596 295 | Germ_HJ_IGHJ5*02,PI_5.25-5.6875,0.2,0.01242236 296 | Germ_HJ_IGHJ5*02,PI_5.6875-6.125,0.1875,0.043041607 297 | Germ_HJ_IGHJ5*02,PI_6.125-7.0,0.222222222,0.023684211 298 | Germ_HJ_IGHJ5*02,PI_7.0-14.0,0,0.030364372 299 | Germ_HJ_IGHJ6*01,Canonical_H1_0,0,0.004627682 300 | Germ_HJ_IGHJ6*01,Canonical_H1_1,0.291044776,0.545748614 301 | Germ_HJ_IGHJ6*01,Canonical_H1_2,0,0.00042123 302 | Germ_HJ_IGHJ6*01,Canonical_H1_3,0,0 303 | Germ_HJ_IGHJ6*01,Canonical_H2_0,0.045454545,0.038399353 304 | Germ_HJ_IGHJ6*01,Canonical_H2_6,0.286821705,0.53838885 305 | Germ_HJ_IGHJ6*01,Canonical_H3_0,0.054545455,0.029085343 306 | Germ_HJ_IGHJ6*01,Canonical_H3_1,0.042553191,0.015835313 307 | Germ_HJ_IGHJ6*01,Canonical_H3_2,0.263157895,0.095841584 308 | Germ_HJ_IGHJ6*01,Canonical_H3_3,0.206521739,0.531785808 309 | Germ_HJ_IGHJ6*01,PI_0.0-3.5,0.181818182,0.083167529 310 | Germ_HJ_IGHJ6*01,PI_3.5-3.9375,0.273972603,0.245423729 311 | Germ_HJ_IGHJ6*01,PI_3.9375-4.375,0.096774194,0.167794799 312 | Germ_HJ_IGHJ6*01,PI_4.375-4.8125,0.024390244,0.056435242 313 | Germ_HJ_IGHJ6*01,PI_4.8125-5.25,0,0.055489022 314 | Germ_HJ_IGHJ6*01,PI_5.25-5.6875,0,0.038114754 315 | Germ_HJ_IGHJ6*01,PI_5.6875-6.125,0.042553191,0.107169811 316 | Germ_HJ_IGHJ6*01,PI_6.125-7.0,0,0.046774194 317 | Germ_HJ_IGHJ6*01,PI_7.0-14.0,0,0.079062376 318 | Germ_HJ_IGHJ6*04,Canonical_H1_0,0,0 319 | Germ_HJ_IGHJ6*04,Canonical_H1_1,0.082089552,0.000231696 320 | Germ_HJ_IGHJ6*04,Canonical_H1_2,0,0 321 | Germ_HJ_IGHJ6*04,Canonical_H1_3,0,0 322 | Germ_HJ_IGHJ6*04,Canonical_H2_0,0,0 323 | Germ_HJ_IGHJ6*04,Canonical_H2_6,0.086614173,0.000241663 324 | Germ_HJ_IGHJ6*04,Canonical_H3_0,0.034482759,0 325 | Germ_HJ_IGHJ6*04,Canonical_H3_1,0,0 326 | Germ_HJ_IGHJ6*04,Canonical_H3_2,0.073170732,0 327 | 
Germ_HJ_IGHJ6*04,Canonical_H3_3,0.092105263,0.00029129 328 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ1*01,0,0 329 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ2*01,0,0 330 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ3*01,0,0 331 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ4*02,0,0 332 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ5*01,0,0 333 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ5*02,0,0 334 | Germ_HJ_IGHJ6*04,Germ_HJ_IGHJ6*01,0,0 335 | Germ_HJ_IGHJ6*04,PI_0.0-3.5,0.121212121,0 336 | Germ_HJ_IGHJ6*04,PI_3.5-3.9375,0.101694915,0 337 | Germ_HJ_IGHJ6*04,PI_3.9375-4.375,0.025641026,0 338 | Germ_HJ_IGHJ6*04,PI_4.375-4.8125,0,0 339 | Germ_HJ_IGHJ6*04,PI_4.8125-5.25,0,0.003703704 340 | Germ_HJ_IGHJ6*04,PI_5.25-5.6875,0,0 341 | Germ_HJ_IGHJ6*04,PI_5.6875-6.125,0,0 342 | Germ_HJ_IGHJ6*04,PI_6.125-7.0,0,0 343 | Germ_HJ_IGHJ6*04,PI_7.0-14.0,0,0 344 | Germ_HV_IGHV3-23*01,Canonical_H1_0,0,0.003231018 345 | Germ_HV_IGHV3-23*01,Canonical_H1_1,1,0.996076621 346 | Germ_HV_IGHV3-23*01,Canonical_H1_2,0,0.000230787 347 | Germ_HV_IGHV3-23*01,Canonical_H1_3,0,0.000461574 348 | Germ_HV_IGHV3-23*01,Canonical_H2_0,0.052238806,0.045003462 349 | Germ_HV_IGHV3-23*01,Canonical_H2_6,0.947761194,0.954996538 350 | Germ_HV_IGHV3-23*01,Canonical_H3_0,0.141791045,0.0726979 351 | Germ_HV_IGHV3-23*01,Canonical_H3_1,0.074626866,0.044311101 352 | Germ_HV_IGHV3-23*01,Canonical_H3_2,0.246268657,0.090699285 353 | Germ_HV_IGHV3-23*01,Canonical_H3_3,0.537313433,0.792291715 354 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ1*01,0.007462687,0.00761597 355 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ2*01,0.014925373,0.003692592 356 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ3*01,0.007462687,0.002538657 357 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ3*02,0.164179104,0.007846757 358 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ4*02,0.358208955,0.350565428 359 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ5*01,0.007462687,0.041080083 360 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ5*02,0.067164179,0.038541426 361 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ6*01,0.291044776,0.547888299 362 | Germ_HV_IGHV3-23*01,Germ_HJ_IGHJ6*04,0.082089552,0.000230787 363 | 
Germ_HV_IGHV3-23*01,PI_0.0-3.5,0.194029851,0.08031387 364 | Germ_HV_IGHV3-23*01,PI_3.5-3.9375,0.402985075,0.300023079 365 | Germ_HV_IGHV3-23*01,PI_3.9375-4.375,0.21641791,0.208631433 366 | Germ_HV_IGHV3-23*01,PI_4.375-4.8125,0.02238806,0.052619432 367 | Germ_HV_IGHV3-23*01,PI_4.8125-5.25,0.02238806,0.062312486 368 | Germ_HV_IGHV3-23*01,PI_5.25-5.6875,0.02238806,0.03669513 369 | Germ_HV_IGHV3-23*01,PI_5.6875-6.125,0.074626866,0.129240711 370 | Germ_HV_IGHV3-23*01,PI_6.125-7.0,0.014925373,0.05123471 371 | Germ_HV_IGHV3-23*01,PI_7.0-14.0,0.029850746,0.078929148 372 | PI_0.0-3.5,PI_3.5-3.9375,0,0 373 | PI_0.0-3.5,PI_3.9375-4.375,0,0 374 | PI_0.0-3.5,PI_4.375-4.8125,0,0 375 | PI_0.0-3.5,PI_4.8125-5.25,0,0 376 | PI_0.0-3.5,PI_5.25-5.6875,0,0 377 | PI_0.0-3.5,PI_5.6875-6.125,0,0 378 | PI_0.0-3.5,PI_6.125-7.0,0,0 379 | PI_0.0-3.5,PI_7.0-14.0,0,0 380 | PI_3.5-3.9375,PI_3.9375-4.375,0,0 381 | PI_3.5-3.9375,PI_4.375-4.8125,0,0 382 | PI_3.5-3.9375,PI_4.8125-5.25,0,0 383 | PI_3.5-3.9375,PI_5.25-5.6875,0,0 384 | PI_3.5-3.9375,PI_5.6875-6.125,0,0 385 | PI_3.5-3.9375,PI_6.125-7.0,0,0 386 | PI_3.5-3.9375,PI_7.0-14.0,0,0 387 | PI_3.9375-4.375,PI_4.375-4.8125,0,0 388 | PI_3.9375-4.375,PI_4.8125-5.25,0,0 389 | PI_3.9375-4.375,PI_5.25-5.6875,0,0 390 | PI_3.9375-4.375,PI_5.6875-6.125,0,0 391 | PI_3.9375-4.375,PI_6.125-7.0,0,0 392 | PI_3.9375-4.375,PI_7.0-14.0,0,0 393 | PI_4.375-4.8125,PI_4.8125-5.25,0,0 394 | PI_4.375-4.8125,PI_5.25-5.6875,0,0 395 | PI_4.375-4.8125,PI_5.6875-6.125,0,0 396 | PI_4.375-4.8125,PI_6.125-7.0,0,0 397 | PI_4.375-4.8125,PI_7.0-14.0,0,0 398 | PI_4.8125-5.25,PI_5.25-5.6875,0,0 399 | PI_4.8125-5.25,PI_5.6875-6.125,0,0 400 | PI_4.8125-5.25,PI_6.125-7.0,0,0 401 | PI_4.8125-5.25,PI_7.0-14.0,0,0 402 | PI_5.25-5.6875,PI_5.6875-6.125,0,0 403 | PI_5.25-5.6875,PI_6.125-7.0,0,0 404 | PI_5.25-5.6875,PI_7.0-14.0,0,0 405 | PI_5.6875-6.125,PI_6.125-7.0,0,0 406 | PI_5.6875-6.125,PI_7.0-14.0,0,0 407 | PI_6.125-7.0,PI_7.0-14.0,0,0 
-------------------------------------------------------------------------------- /ASAP/FeatureExtraction.py: -------------------------------------------------------------------------------- 1 | import Bio.SeqUtils.ProtParam 2 | import os 3 | import numpy as np 4 | 5 | SET_NAME = 'MMP-cluster' 6 | IF_ONLY_HEAVY = False 7 | CNT_DB = 2 8 | CNT_TARGET = 1 9 | REFERENCE_PATH_TESTCASE = './testCase/MMP-cluster/reference-PDB/' 10 | TARGETING_PATH_TESTCASE = './testCase/MMP-cluster/targeting-MMP/' 11 | TARGET_DESIRE_SIZE = 166 #44 #MMP-cluster 12 | 13 | 14 | 15 | 16 | # Chothia numbering definition for CDR regions 17 | CHOTHIA_CDR = {'L': {'1': [24, 34], '2': [50, 56], '3': [89, 97]}, 'H':{'1': [26, 32], '2': [52, 56], '3': [95, 102]}} 18 | 19 | ################################################################################################################# 20 | # function ReadAminoAndNum: 21 | # Read in the Chothia number reference and targeting files. Store the numbering and putative germline. 22 | # 23 | # Input: targeting_direct, reference_direct 24 | # Output:1. dictionary of Amino, {'L': {}, 'H': {}} 25 | # 2. dictionary of Num , {'L': {}, 'H': {}} 26 | # 3. dictionary of Germ , {'L': {'V': {}, 'J':{}}, 'H': {'V': {}, 'J':{}}} 27 | # 4. list of DatasetName, [dh, dm, p1,....] 28 | # 5. list of DatasetSize, [ , , ,...] 
#################################################################################################################

def ReadAminoNumGerm(targeting_direct, reference_direct, only_heavy=None):
    """Parse the Chothia-numbered reference and targeting files.

    Every ``*.txt`` file in ``reference_direct`` and then in ``targeting_direct``
    (each directory in sorted order) is read line by line:

    * a line starting with ``L`` or ``H`` is a residue line; its last token is
      the residue and the preceding token(s) form the position label
      (number plus optional insertion letter);
    * lines starting with ``#|`` are counted per file, and every fourth one
      supplies the V germline (field 2) and J germline (field 4) when split on ``|``;
    * a line starting with ``/`` closes the current record.

    Args:
        targeting_direct: directory holding the targeting-set files.
        reference_direct: directory holding the reference-set files.
        only_heavy: when true every record gets its own index; otherwise two
            consecutive records share one index (presumably the two chains of
            one antibody -- confirm against the input files). ``None`` falls
            back to the module constant ``IF_ONLY_HEAVY``.

    Returns:
        ``(Amino, Num, Germ, DatasetName, DatasetSize)`` where ``Amino`` and
        ``Num`` map chain -> sequence name -> list, ``Germ`` maps
        chain -> ``'V'``/``'J'`` -> sequence name -> germline string,
        ``DatasetName`` holds the file-name prefix before the first ``_`` and
        ``DatasetSize`` the number of sequence names produced per file.
    """
    if only_heavy is None:
        only_heavy = IF_ONLY_HEAVY

    Amino = {'L': {}, 'H': {}}
    Num = {'L': {}, 'H': {}}
    Germ = {'L': {'V': {}, 'J': {}}, 'H': {'V': {}, 'J': {}}}
    DatasetName = []
    DatasetSize = []

    # Reference files first, then targeting files, each group sorted by name.
    sources = [(reference_direct, fname) for fname in sorted(os.listdir(reference_direct))]
    sources += [(targeting_direct, fname) for fname in sorted(os.listdir(targeting_direct))]

    for direct, name in sources:
        if not name.endswith('.txt'):
            continue
        # os.path.join works whether or not the directory ends with a separator.
        with open(os.path.join(direct, name), 'r') as fi:
            data = fi.readlines()

        prefix = name.split('_')[0]
        DatasetName.append(prefix)
        cnt_pattern = 0
        cnt_seq = 0
        chain = None
        tmp_num = []
        tmp_seq = []
        tmp_germ_V = ' '
        tmp_germ_J = ' '

        for line in data:
            if line[:1] in ('L', 'H'):
                # residue line of the current chain
                fields = line.split()
                chain = line[0]
                tmp_seq.append(fields[-1])
                if len(fields) == 3:
                    tmp_num.append(fields[-2])
                else:
                    # position number followed by an insertion letter
                    tmp_num.append(fields[1] + fields[-2])
            elif line.startswith('#|'):
                # every fourth "#|" line is the germline line
                cnt_pattern += 1
                if cnt_pattern % 4 == 0:
                    parts = line.split('|')
                    tmp_germ_V = parts[2]
                    tmp_germ_J = parts[4]
            elif line.startswith('/'):
                # end of a record
                if chain is None:
                    # terminator before any residue line: nothing to store
                    continue
                index = cnt_seq if only_heavy else cnt_seq // 2
                seq_name = prefix + '_' + str(index)
                cnt_seq += 1
                Amino[chain][seq_name] = tmp_seq
                Num[chain][seq_name] = tmp_num
                Germ[chain]['V'][seq_name] = tmp_germ_V
                Germ[chain]['J'][seq_name] = tmp_germ_J
                tmp_num = []
                tmp_seq = []
                tmp_germ_V = ' '
                tmp_germ_J = ' '

        DatasetSize.append(cnt_seq if only_heavy else cnt_seq // 2)
    return Amino, Num, Germ, DatasetName, DatasetSize


#################################################################################################################
# function GetOneHotGerm:
# Transform the stored putative germline into one-hot encoded features.
#
# Input: Germ, DatasetSize, DatasetName
# Output: 1. array of OneHotGerm, [[seq1 onehot], [seq2 onehot], [seq3 onehot], ...]
#         2. list of GermFeatureNames according to one hot, [LV_IGLV1*1, LV_IGLV1*2,....
#                                                             LJ_XXXX,
#                                                             HV_XXXX,
#                                                             HJ_XXXX ...]
#################################################################################################################

def GetOneHotGerm(Germ, DatasetSize, DatasetName, only_heavy=None):
    """One-hot encode the germline assignment of every sequence.

    Args:
        Germ: chain -> ``'V'``/``'J'`` -> sequence name -> germline string.
        DatasetSize: number of sequences per dataset, parallel to ``DatasetName``.
        DatasetName: dataset prefixes; sequence names are ``<prefix>_<index>``.
        only_heavy: skip the ``'L'`` chain when true. ``None`` falls back to
            the module constant ``IF_ONLY_HEAVY``.

    Returns:
        ``(OneHotGerm, GermFeatureNames)``: one 0/1 row per sequence (datasets
        in the given order, sequences by index) and the matching column names
        ``Germ_<chain><V|J>_<germline>``.
    """
    if only_heavy is None:
        only_heavy = IF_ONLY_HEAVY

    # Keep (chain, segment, germline) next to each column name so germline
    # strings never have to be parsed back out of the name.
    feature_specs = []
    for H_L in Germ:
        if only_heavy and H_L == 'L':
            continue
        for V_J in Germ[H_L]:
            for can in sorted(set(Germ[H_L][V_J].values())):
                feature_specs.append((H_L, V_J, can))
    GermFeatureNames = ['Germ_' + H_L + V_J + '_' + can for H_L, V_J, can in feature_specs]

    OneHotGerm = []
    for i, name in enumerate(DatasetName):
        for j in range(int(DatasetSize[i])):
            seq_name = name + '_' + str(j)
            OneHotGerm.append([1 if Germ[H_L][V_J][seq_name] == can else 0
                               for H_L, V_J, can in feature_specs])

    return OneHotGerm, GermFeatureNames


#################################################################################################################
# function ReadCanonTemp:
# Read in the template file (default PIGS) and store it.
#
# Output: 1. dictionary of CanonTemp, {'L': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}, 'H': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}}
#################################################################################################################
def ReadCanonTemp(canonical_direct):
    """Load the canonical-structure template file.

    Each non-blank line is read as ``<chain><cdr> <type> <rest...>``: the first
    character selects the chain, the second the CDR, the second
    whitespace-separated token is the type, and the remaining tokens are stored
    as one template entry (a list of strings) under that type.

    Args:
        canonical_direct: path of the template file.

    Returns:
        ``CanonTemp[chain][cdr][type]`` -> list of entries. Type ``'1'`` is
        always present for every chain/CDR, possibly with no entries.
    """
    CanonTemp = {'L': {'1': {'1': []}, '2': {'1': []}, '3': {'1': []}},
                 'H': {'1': {'1': []}, '2': {'1': []}, '3': {'1': []}}}
    with open(canonical_direct, 'r') as fi:
        for line in fi:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            per_type = CanonTemp[line[0]][line[1]]
            per_type.setdefault(fields[1], []).append(fields[2:])
    return CanonTemp

#################################################################################################################
# function GetCanon:
# Assign each sequence with the predicted type of canonical structure according to the template.
#
# Input: Amino, Num
# Output: 1.
#            dictionary of CanonTemp, {'L': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}, 'H': {'1': {'1':[]}, '2': {'1':[]}, '3': {'1':[]}}}
#    optional: PIGS / Chothia
#################################################################################################################

def _canon_entry_accepts(entry, numbering, residues):
    """Return True when every (position, allowed residues) pair of *entry* holds.

    ``entry`` is one template row: ``[length, pos1, allowed1, pos2, allowed2, ...]``.
    A row that only carries a length has no pairs and therefore always accepts.
    """
    requirements = entry[1:]
    for pos, allowed in zip(requirements[0::2], requirements[1::2]):
        if pos not in numbering:
            return False
        if residues[numbering.index(pos)] not in allowed:
            return False
    return True


def GetCanon(canonical_direct, Amino, Num):
    """Assign a canonical-structure type to every CDR of every sequence.

    For each sequence name in ``Num['H']`` and each chain/CDR, the number of
    positions whose Chothia number lies inside ``CHOTHIA_CDR[chain][cdr]``
    (insertion letters ignored) is compared with the length stored in each
    template entry. An entry matches when the length agrees and all of its
    position/residue requirements are satisfied. When several entries match,
    the last one in template order wins; when none match the type is ``'0'``.

    Args:
        canonical_direct: path of the template file read by ``ReadCanonTemp``.
        Amino: chain -> sequence name -> list of residues.
        Num: chain -> sequence name -> list of position labels.

    Returns:
        ``Canon[chain][cdr][seq_name]`` -> type string. The ``'L'`` chain is
        left empty when ``IF_ONLY_HEAVY`` is set.
    """
    CanonTemp = ReadCanonTemp(canonical_direct)
    Canon = {'L': {'1': {}, '2': {}, '3': {}}, 'H': {'1': {}, '2': {}, '3': {}}}

    for seq_name in Num['H']:
        for L_H in Canon:
            if IF_ONLY_HEAVY and L_H == 'L':
                continue
            numbering = Num[L_H][seq_name]

            for cdr in Canon[L_H]:
                first, last = CHOTHIA_CDR[L_H][cdr]

                # CDR length = number of positions numbered inside the CDR range
                length = 0
                for label in numbering:
                    if 'A' <= label[-1] <= 'Z':
                        num_i = int(label[:-1])  # drop the insertion letter
                    else:
                        num_i = int(label)
                    if first <= num_i <= last:
                        length += 1
                length_tag = str(length)

                # NOTE(review): a type is recorded only when ALL position/residue
                # pairs of an entry pass. The previous matching loop recorded the
                # type without requiring every pair to pass; confirm downstream
                # results against this stricter rule.
                for canon_type, entries in CanonTemp[L_H][cdr].items():
                    for entry in entries:
                        if entry[0] != length_tag:
                            continue
                        if _canon_entry_accepts(entry, numbering, Amino[L_H][seq_name]):
                            Canon[L_H][cdr][seq_name] = canon_type

                # no matching canonical structure found
                if seq_name not in Canon[L_H][cdr]:
                    Canon[L_H][cdr][seq_name] = '0'
    return Canon

#################################################################################################################
# function GetOneHotCanon:
# Similar to GetOneHotGerm, transform the stored canonical structure into one-hot encoded features.
#
# Input: Amino, Num, DatasetSize, DatasetName
# Output: 1. array of OneHotCanon, [[seq1 onehot], [seq2 onehot], [seq3 onehot], ...]
#         2. list of CanonFeatureNames according to one hot, [Canon_L1_1, Canon_L1_2,....
#                                                              Canon_L2_1,
#                                                              Canon_L3_1,
#                                                              Canon_H1_1,
#                                                              Canon_H2_1,
#                                                              Canon_H3_1,...]
#################################################################################################################

def GetOneHotCanon(canonical_direct, Amino, Num, DatasetSize, DatasetName):
    """One-hot encode the canonical-structure type of every CDR.

    Args:
        canonical_direct: path of the template file, forwarded to ``GetCanon``.
        Amino: chain -> sequence name -> list of residues.
        Num: chain -> sequence name -> list of position labels.
        DatasetSize: number of sequences per dataset, parallel to ``DatasetName``.
        DatasetName: dataset prefixes; sequence names are ``<prefix>_<index>``.

    Returns:
        ``(OneHotCanon, CanonFeatureNames)``: one 0/1 row per sequence and the
        matching column names ``Canonical_<chain><cdr>_<type>``.
    """
    Canon = GetCanon(canonical_direct, Amino, Num)

    # Keep (chain, cdr, type) next to each column name instead of parsing the name back.
    feature_specs = []
    for H_L in Canon:
        if IF_ONLY_HEAVY and H_L == 'L':
            continue
        # O_T_T stands for 1_2_3
        for O_T_T in Canon[H_L]:
            for can in sorted(set(Canon[H_L][O_T_T].values())):
                feature_specs.append((H_L, O_T_T, can))
    CanonFeatureNames = ['Canonical_' + H_L + O_T_T + '_' + can
                         for H_L, O_T_T, can in feature_specs]

    OneHotCanon = []
    for i, name in enumerate(DatasetName):
        for j in range(int(DatasetSize[i])):
            seq_name = name + '_' + str(j)
            OneHotCanon.append([1 if Canon[H_L][O_T_T][seq_name] == can else 0
                                for H_L, O_T_T, can in feature_specs])

    return OneHotCanon, CanonFeatureNames

#################################################################################################################
# function GetCDRH3:
# Take the CDR-H3 of each sequence.
#
# Input: Amino, Num
# Output: 1. dictionary of CDRH3, {}
#################################################################################################################

def GetCDRH3(Amino, Num):
    """Extract the CDR-H3 residues of every heavy-chain sequence.

    A residue is kept when the numeric part of its position label (a trailing
    upper-case insertion letter is ignored) lies inside ``CHOTHIA_CDR['H']['3']``,
    bounds included.

    Args:
        Amino: chain -> sequence name -> list of residues.
        Num: chain -> sequence name -> list of position labels.

    Returns:
        dict mapping each heavy-chain sequence name to its CDR-H3 string.
    """
    first, last = CHOTHIA_CDR['H']['3']
    CDRH3 = {}
    for seq_name in Amino['H']:
        kept = []
        for number, residue in zip(Num['H'][seq_name], Amino['H'][seq_name]):
            if 'A' <= number[-1] <= 'Z':
                num_i = int(number[:-1])
            else:
                num_i = int(number)
            if first <= num_i <= last:
                kept.append(residue)
        CDRH3[seq_name] = ''.join(kept)
    return CDRH3

#################################################################################################################
# function GetCDRH3PI:
# Calculate the pI value for each sequence
#
# Input: CDRH3
# Output: 1. dictionary of PI, {}
#################################################################################################################

def GetCDRH3PI(CDRH3):
    """Compute the isoelectric point of every CDR-H3 string.

    Args:
        CDRH3: dict mapping sequence name to its CDR-H3 string.

    Returns:
        dict mapping sequence name to the value of
        ``ProteinAnalysis(...).isoelectric_point()``, or ``-1`` when that
        call raises.
    """
    PI_CDRH3 = {}
    for seq_name in CDRH3:
        prot = Bio.SeqUtils.ProtParam.ProteinAnalysis(CDRH3[seq_name])
        try:
            PI_CDRH3[seq_name] = prot.isoelectric_point()
        except Exception:
            # Best effort: mark the sequence with -1 instead of aborting the run.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            PI_CDRH3[seq_name] = -1

    return PI_CDRH3


#################################################################################################################
# function GetPIBin:
# Halve the bin of pI following the binning method using sequence's pI information.
#
# Input: PI_CDRH3
# Output: 1.
#            a list of PITheresholds, []
#################################################################################################################

def GetPIBin(PI_CDRH3):
    """Derive pI bin edges by repeatedly halving crowded bins.

    Starting from the edges ``[0.0, 7.0, 14.0]``, the bins are scanned from
    low to high. The first bin that is at least ``2 * 0.3`` wide and strictly
    contains more than 10% of all sequences is split at its midpoint, and the
    scan restarts. The procedure stops when a full scan splits nothing.

    Args:
        PI_CDRH3: dict mapping sequence name to its pI value.

    Returns:
        Sorted list of bin edges (always contains 0.0, 7.0 and 14.0).
    """
    PITheresholds = [0.0, 7.0, 14.0]
    tenPercent = 0.1 * len(PI_CDRH3)
    PITolerance = 0.3
    values = list(PI_CDRH3.values())

    # Loop until a whole pass adds no edge. This also terminates when no bin is
    # ever crowded (empty input, values outside (0, 14) or exactly on an edge).
    split_made = True
    while split_made:
        split_made = False
        for low, high in zip(PITheresholds, PITheresholds[1:]):
            # bins narrower than twice the tolerance are never split
            if (high - low) < (2 * PITolerance):
                continue
            # count how many sequences fall strictly inside this bin
            cnt = sum(1 for value in values if low < value < high)
            if cnt > tenPercent:
                PITheresholds.append((low + high) / 2.0)
                PITheresholds.sort()
                split_made = True
                break  # edges changed: restart the scan from the lowest bin
    return PITheresholds

#################################################################################################################
# function GetOneHotPI:
# Transform the pI values into one-hot encoded pI bin features.
#
# Input: CDRH3, DatasetSize, DatasetName
# Output: 1. array of OneHotPI, [[seq1 onehot],
#                                [seq2 onehot],
#                                [seq3 onehot],
#                                ...]
#         2. list of PIFeatureNames according to one hot, [PI_bin1, PI_bin2, PI_bin3...]
#################################################################################################################

def GetOneHotPI(CDRH3, DatasetSize, DatasetName):
    """One-hot encode the pI bin of every CDR-H3.

    The pI values come from ``GetCDRH3PI`` and the bin edges from ``GetPIBin``.
    A sequence is flagged in the first bin (lowest first) whose closed interval
    contains its pI; a pI outside every bin leaves the row all zeros.

    Args:
        CDRH3: dict mapping sequence name to its CDR-H3 string.
        DatasetSize: number of sequences per dataset, parallel to ``DatasetName``.
        DatasetName: dataset prefixes; sequence names are ``<prefix>_<index>``.

    Returns:
        ``(OneHotPI, PIFeatureNames)``: one 0/1 row per sequence and the
        column names ``PI_<low>-<high>``.
    """
    PI_CDRH3 = GetCDRH3PI(CDRH3)
    edges = GetPIBin(PI_CDRH3)

    bins = list(zip(edges[:-1], edges[1:]))
    PIFeatureNames = ['PI_' + str(low) + '-' + str(high) for low, high in bins]

    OneHotPI = []
    for pos, name in enumerate(DatasetName):
        for idx in range(int(DatasetSize[pos])):
            row = [0] * len(PIFeatureNames)
            value = PI_CDRH3[name + '_' + str(idx)]
            for col, (low, high) in enumerate(bins):
                if float(low) <= value <= float(high):
                    row[col] = 1
                    break  # a value on a shared edge belongs to the lower bin
            OneHotPI.append(row)
    return OneHotPI, PIFeatureNames

#################################################################################################################
# function GetPositionalMotifFreq:
# Count the frequency of each possible frequent positional motif for each dataset.
#
# Input: CDRH3
# Output: 1.
#             dictionary of MotifFreq, {'r1':{}, 'r2':{},'t1':{}, 't2':{}, 't3':{}, 't4':{}, 't5':{}, 't6':{}, 't7':{}, 't8':{}}
#          2. dictionary of MotifDict, {seq_name: [positional motifs of that sequence]}
#################################################################################################################

def GetPositionalMotifFreq(CDRH3):
    """Count positional motifs ('<start>_<substring>') per dataset and list them per sequence.

    Input:  CDRH3, dictionary {seq_name: sequence}; seq_name is '<dataset>_<index>'
    Output: 1. MotifFreq, {dataset: {positional motif: count}}. The ten default
               datasets ('r1', 'r2', 't1'..'t8') are always present; any other
               dataset name found in CDRH3 is added on the fly.
            2. MotifDict, {seq_name: [positional motifs in generation order]}
    """
    MotifFreq = {'r1': {}, 'r2': {}, 't1': {}, 't2': {}, 't3': {}, 't4': {},
                 't5': {}, 't6': {}, 't7': {}, 't8': {}}
    MotifDict = {}
    for seq_name, sequence in CDRH3.items():
        seq_motifs = MotifDict[seq_name] = []
        # Strip only the trailing '_<index>' so dataset names that themselves
        # contain underscores stay intact; unknown datasets get their own
        # counter instead of raising KeyError.
        dataset_counts = MotifFreq.setdefault(seq_name.rsplit('_', 1)[0], {})
        # motif lengths 2..9
        for motif_len in range(2, 10):
            # NOTE(review): this range stops one start position short of the
            # end, so the window that finishes on the last residue is never
            # emitted (and a sequence of exactly motif_len residues yields
            # nothing). Kept as-is because changing it alters the feature set;
            # confirm whether that is intended.
            for start in range(len(sequence) - motif_len):
                motif = str(start) + '_' + sequence[start:start + motif_len]
                seq_motifs.append(motif)
                dataset_counts[motif] = dataset_counts.get(motif, 0) + 1
    return MotifFreq, MotifDict

#################################################################################################################
#  function GetImpMotif (Version 1.0):
#  Take only the 2 most frequent motifs at each start position 2..10 in each data set,
#  at most top 2 * 10 sets * 9 positions = 180
#
#  Input:  MotifFreq
#  Output: 1. list of ImpMotif, [motif1, motif2, ...]
#################################################################################################################

def GetImpMotif(MotifFreq):
    """Pick the most frequent positional motifs of every dataset.

    Motif keys look like '<start>_<substring>'. For each dataset and each start
    position 2..10 the two highest-count motifs are kept; ties keep the order in
    which the motifs appear in the dataset's dictionary.

    Input:  MotifFreq, {dataset: {positional motif: count}}
    Output: 1. sorted list of the distinct selected motifs
    """
    top_n = 2
    wanted_positions = {str(position) for position in range(2, 11)}
    selected = set()
    for motif_counts in MotifFreq.values():
        # Bucket the motifs by start position in one pass instead of rescanning
        # the whole dictionary once per position.
        by_position = {}
        for motif, count in motif_counts.items():
            position = motif.split('_')[0]
            if position in wanted_positions:
                by_position.setdefault(position, []).append((motif, count))
        for candidates in by_position.values():
            # The sort is stable, so equal counts keep insertion order.
            candidates.sort(key=lambda item: item[1], reverse=True)
            selected.update(motif for motif, _ in candidates[:top_n])
    return sorted(selected)

#################################################################################################################
#  function GetCDRH3Motif:
#  Assign present frequent motif for each sequence
#
#  Input:  ImpMotif, CDRH3, MotifDict
#  Output: 1. dictionary of Motif_CDRH3, {seq_name: [0/1 flag per entry of ImpMotif]}
#################################################################################################################

def GetCDRH3Motif(ImpMotif, CDRH3, MotifDict):
    """Flag, for every sequence, which of the selected motifs it contains.

    Input:  ImpMotif   list of positional motifs; fixes the column order
            CDRH3      only its keys (sequence names) are used
            MotifDict  {seq_name: [positional motifs of that sequence]}
    Output: 1. Motif_CDRH3, {seq_name: [1 if motif present else 0, ...]}
    """
    Motif_CDRH3 = {}
    for seq_name in CDRH3:
        # Build the lookup set once per sequence; testing membership against
        # the raw list for every motif is quadratic.
        present = set(MotifDict[seq_name])
        Motif_CDRH3[seq_name] = [1 if motif in present else 0 for motif in ImpMotif]
    return Motif_CDRH3

#################################################################################################################
#  function MultiHotMotif:
#  Transfer motif information for each sequence to multi-hot encoded features.
#
#  Input:  CDRH3, DatasetSize, DatasetName
#  Output: 1. array of MultiHotMotif, [[seq1 multihot], [seq2 multihot], [seq3 multihot],...]
#          2. list of MotifFeatureNames according to multi hot, [Motif1, Motif2, ...]
#################################################################################################################

def MultiHotMotif(CDRH3, DatasetSize, DatasetName):
    """Transfer motif information for each sequence to multi-hot encoded features.

    Input:  CDRH3        dictionary of sequences keyed by '<dataset>_<index>'
            DatasetSize  per-dataset sequence counts (each converted with int())
            DatasetName  dataset names, in the order rows should be emitted
    Output: 1. list of multi-hot rows, one per sequence
            2. MotifFeatureNames, 'Motif_<motif>' for every selected motif
    """
    motif_freq, motif_dict = GetPositionalMotifFreq(CDRH3)
    important = GetImpMotif(motif_freq)
    flags_by_seq = GetCDRH3Motif(important, CDRH3, motif_dict)

    MotifFeatureNames = ["Motif_" + motif for motif in important]

    rows = []
    for set_index, set_name in enumerate(DatasetName):
        count = int(DatasetSize[set_index])
        # Rows are the very lists stored in flags_by_seq, not copies.
        rows.extend(flags_by_seq[set_name + '_' + str(seq_index)]
                    for seq_index in range(count))
    return rows, MotifFeatureNames

#################################################################################################################
#  function GetFeatureVectors:
#  Combine germline, canonical structure, pI, motif features to feature vectors
#
#  Input:  OneHotGerm, GermFeatureNames, OneHotCanon, CanonFeatureNames, OneHotPI, PIFeatureNames, MultiHotMotif, MotifFeatureNames
#  Output: 1. AllFeatureVectors for every sequence, [[seq1 LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...],
#                                                    [seq2 LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...],
#                                                    ...]
#
#          2. AllFeatureNames [LV, LJ, HV, HJ, L1, L2, L3, L1, L2, L3, pI, motif1, motif2, motifi...]
#################################################################################################################

def GetFeatureVectors(OneHotGerm, GermFeatureNames,
                      OneHotCanon, CanonFeatureNames,
                      OneHotPI, PIFeatureNames,
                      MultiHotMotif, MotifFeatureNames):
    """Concatenate the four feature families into one vector per sequence.

    Returns a 4-tuple:
      1. AllFeatureVectors   numpy array, one row per entry of OneHotGerm
      2. AllFeatureNames     germline + canonical + pI + motif names, in that order
      3. ExcludeIGHVVectors  when SET_NAME == 'IGHV', AllFeatureVectors without the
                             columns whose name starts with 'Germ_HV_IGHV3-23';
                             otherwise the same object as (1)
      4. ExcludeFeatureNames the names matching (3); otherwise the same object as (2)

    SET_NAME and np are module-level names defined outside this section.
    """
    AllFeatureNames = GermFeatureNames + CanonFeatureNames + PIFeatureNames + MotifFeatureNames

    # The number of sequences is taken from OneHotGerm; the other inputs are
    # indexed with the same row number.
    combined = []
    for row_index in range(len(OneHotGerm)):
        row = []
        row.extend(OneHotGerm[row_index])
        row.extend(OneHotCanon[row_index])
        row.extend(OneHotPI[row_index])
        row.extend(MultiHotMotif[row_index])
        combined.append(row)
    AllFeatureVectors = np.array(combined)

    if SET_NAME != 'IGHV':
        return AllFeatureVectors, AllFeatureNames, AllFeatureVectors, AllFeatureNames

    kept_columns = [
        column for column, name in enumerate(AllFeatureNames)
        if not name.startswith('Germ_HV_IGHV3-23')
    ]
    ExcludeFeatureNames = [AllFeatureNames[column] for column in kept_columns]
    ExcludeIGHVVectors = AllFeatureVectors[:, kept_columns]
    return AllFeatureVectors, AllFeatureNames, ExcludeIGHVVectors, ExcludeFeatureNames


if __name__=='__main__':
    # Both directories point at the same IGHV test-case folder.
    targeting_direct = '../testCase-MMP/data/IGHV/'
    reference_direct = '../testCase-MMP/data/IGHV/'
    Amino, Num, Germ, DatasetName, DatasetSize = ReadAminoNumGerm(targeting_direct, reference_direct)

# -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64
| 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 162 | 163 | 164 | 165 | distance 166 | motif 167 | PostionalMotifposi 168 | jaccar 169 | corr 170 | print 171 | moti 172 | head 173 | heatmap 174 | rank 175 | less' 176 | .index 177 | open( 178 | WriteFisherFS 179 | sta 180 | heat map 181 | float( 182 | feature 183 | print( 184 | all 185 | shuffle 186 | referen 187 | set 188 | frequency 189 | _new 190 | set( 191 | startswith 192 | 2_GG 193 | mean 194 | importance 195 | 196 | 197 | 198 | 200 | 201 | 218 | 219 | 220 | 221 | 222 | true 223 | DEFINITION_ORDER 224 | 225 | 226 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 |